some additional error handling

Ryan Voots 2024-12-19 16:51:24 -05:00
parent 59d9cb7a57
commit 11137f9f3c

@@ -4,6 +4,7 @@ import random
 from playwright.sync_api import sync_playwright
 from urllib.parse import urljoin
 import datetime
+import re
 
 # Configuration
 BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
@@ -88,7 +89,6 @@ def parse_links_from_file(filename):
         content = file.read()
 
     # Use a simple regex to extract wiki links
-    import re
     pattern = r'\[\[([^\]]+)\]\]'
     matches = re.findall(pattern, content)
 
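For context, a minimal sketch of how parse_links_from_file might read after this commit, now relying on the module-level import re added in the first hunk. Only the regex and the re.findall call come from the diff; the file handling, the use of urljoin with BASE_URL, the '/w/' path, and the returned (title, url) pairs are assumptions inferred from how bfs_scrape consumes the result.

import re
from urllib.parse import urljoin

BASE_URL = 'https://minecraft.wiki'  # taken from the configuration hunk above

def parse_links_from_file(filename):
    # Read the cached wiki page from disk (assumed encoding and error handling).
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract wiki links of the form [[Page title]].
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    # Hypothetical URL construction; the real script may build links differently.
    return [(title, urljoin(BASE_URL, '/w/' + title.replace(' ', '_'))) for title in matches]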
@@ -154,8 +154,17 @@ def bfs_scrape(start_title):
                 if new_title not in VISITED_PAGES:
                     queue.append((new_title, new_url))
         else:
             # Update the timestamp to current time even if no new content was fetched
             os.utime(filename, None)
             print(f"No content found for {title}. Skipping further processing.")
+            # Reparse links from the existing file if it exists
+            if os.path.exists(filename):
+                try:
+                    new_links = parse_links_from_file(filename)
+                    for new_title, new_url in new_links:
+                        if new_title not in VISITED_PAGES:
+                            queue.append((new_title, new_url))
+                except Exception as e:
+                    print(f"Error parsing links from {filename}: {e}")
         try:
             page.goto(url)
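Taken on its own, the new else branch behaves roughly like the self-contained sketch below: when a fetch yields no content, the cached file's timestamp is refreshed with os.utime so it is not treated as stale, and its links are re-parsed so the crawl can still enqueue pages reachable from it, with any parse failure reported instead of aborting the crawl. The handle_empty_fetch name is hypothetical, and queue, VISITED_PAGES, and the parse_links_from_file stub are stand-ins for the real objects defined elsewhere in the script.

import os
import re
from collections import deque

VISITED_PAGES = set()   # stand-in for the script's visited-page bookkeeping
queue = deque()         # stand-in for the BFS queue of (title, url) pairs

def parse_links_from_file(filename):
    # Stand-in for the real parser defined earlier in the script (see the
    # hunk around line 88): extracts [[wiki link]] titles with a regex.
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
    return [(title, title) for title in re.findall(r'\[\[([^\]]+)\]\]', content)]

def handle_empty_fetch(title, filename):
    # Mirrors the new else branch: keep the cached file "fresh" even though
    # nothing new was downloaded for this page.
    os.utime(filename, None)
    print(f"No content found for {title}. Skipping further processing.")

    # Reparse links from the existing file, if any, so already-cached pages
    # can still feed the queue; report parse failures instead of crashing.
    if os.path.exists(filename):
        try:
            for new_title, new_url in parse_links_from_file(filename):
                if new_title not in VISITED_PAGES:
                    queue.append((new_title, new_url))
        except Exception as e:
            print(f"Error parsing links from {filename}: {e}")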