some additional error handling
parent 59d9cb7a57
commit 11137f9f3c

1 changed file with 12 additions and 3 deletions

backup.py | 15 ++++++++++++---
@@ -4,6 +4,7 @@ import random
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin
import datetime
import re

# Configuration
BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
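A quick note on the configuration above: BASE_URL plus urllib.parse.urljoin is the usual way to turn site-relative hrefs into absolute page URLs. A minimal sketch (the helper name page_url is hypothetical, not part of backup.py):

from urllib.parse import urljoin

BASE_URL = 'https://minecraft.wiki'  # same constant as in backup.py

def page_url(href):
    # Hypothetical helper: resolve a site-relative href against BASE_URL;
    # already-absolute URLs pass through unchanged.
    return urljoin(BASE_URL, href)

print(page_url('/w/Creeper'))                       # https://minecraft.wiki/w/Creeper
print(page_url('https://minecraft.wiki/w/Zombie'))  # unchanged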
@@ -88,7 +89,6 @@ def parse_links_from_file(filename):
        content = file.read()

        # Use a simple regex to extract wiki links
        import re
        pattern = r'\[\[([^\]]+)\]\]'
        matches = re.findall(pattern, content)
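This hunk drops the function-local import re, now redundant with the module-level import added above. For readers of the raw diff, a self-contained sketch of what parse_links_from_file plausibly does end to end; the file handling, pipe-link trimming, and /w/ URL scheme are assumptions, while only the regex and the (title, url) tuple shape are confirmed by the diff:

import re
from urllib.parse import urljoin

BASE_URL = 'https://minecraft.wiki'

def parse_links_from_file(filename):
    # Read the saved page text from disk.
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract [[...]] wiki links.
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    links = []
    for match in matches:
        # For [[Target|Label]] piped links, keep only the target title (assumption).
        title = match.split('|')[0].strip()
        # The /w/<Title> URL scheme is an assumption about the wiki's layout.
        links.append((title, urljoin(BASE_URL, '/w/' + title.replace(' ', '_'))))
    return links

The (title, url) tuples match the for new_title, new_url in new_links unpacking in the next hunk.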
@@ -154,8 +154,17 @@ def bfs_scrape(start_title):
                    if new_title not in VISITED_PAGES:
                        queue.append((new_title, new_url))
        else:
            # Update the timestamp to current time even if no new content was fetched
            os.utime(filename, None)
            print(f"No content found for {title}. Skipping further processing.")

            # Reparse links from the existing file if it exists
            if os.path.exists(filename):
                try:
                    new_links = parse_links_from_file(filename)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                except Exception as e:
                    print(f"Error parsing links from {filename}: {e}")

        try:
            page.goto(url)
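The added else branch is the error handling named in the commit message: when a fetch yields no content, it refreshes the cached file's timestamp and falls back to re-queueing links parsed from the copy on disk. A condensed sketch of that recovery path (the function name requeue_from_cache is hypothetical; the queue and VISITED_PAGES types are assumed):

import os
from collections import deque

VISITED_PAGES = set()   # titles already scraped (assumed to be a set)
queue = deque()         # BFS frontier of (title, url) pairs (assumed deque)

def requeue_from_cache(filename, title):
    # Hypothetical standalone version of the new else branch.
    print(f"No content found for {title}. Skipping further processing.")
    if os.path.exists(filename):
        # Touch the file so its mtime reflects this re-check.
        os.utime(filename, None)
        try:
            # parse_links_from_file as sketched after the previous hunk.
            for new_title, new_url in parse_links_from_file(filename):
                if new_title not in VISITED_PAGES:
                    queue.append((new_title, new_url))
        except Exception as e:
            print(f"Error parsing links from {filename}: {e}")

One design note: the sketch guards os.utime behind the os.path.exists check, whereas the hunk as shown calls os.utime(filename, None) before that check, which would raise FileNotFoundError if the file is missing.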