some additional error handling

Ryan Voots 2024-12-19 16:51:24 -05:00
parent 59d9cb7a57
commit 11137f9f3c

@@ -4,6 +4,7 @@ import random
 from playwright.sync_api import sync_playwright
 from urllib.parse import urljoin
 import datetime
+import re
 
 # Configuration
 BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
@@ -88,7 +89,6 @@ def parse_links_from_file(filename):
         content = file.read()
 
     # Use a simple regex to extract wiki links
-    import re
     pattern = r'\[\[([^\]]+)\]\]'
     matches = re.findall(pattern, content)
 
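For context, a minimal sketch of how parse_links_from_file might read after this commit, now relying on the module-level import re added in the first hunk. Only the regex and the re.findall call come from the diff; the file handling, the use of urljoin with BASE_URL, the '/w/' path, and the returned (title, url) pairs are assumptions inferred from how bfs_scrape consumes the result.

import re
from urllib.parse import urljoin

BASE_URL = 'https://minecraft.wiki'  # taken from the configuration hunk above

def parse_links_from_file(filename):
    # Read the cached wiki page from disk (assumed encoding and error handling).
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract wiki links of the form [[Page title]].
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    # Hypothetical URL construction; the real script may build links differently.
    return [(title, urljoin(BASE_URL, '/w/' + title.replace(' ', '_'))) for title in matches]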
@@ -154,8 +154,17 @@ def bfs_scrape(start_title):
                 if new_title not in VISITED_PAGES:
                     queue.append((new_title, new_url))
         else:
             # Update the timestamp to current time even if no new content was fetched
             os.utime(filename, None)
             print(f"No content found for {title}. Skipping further processing.")
+            # Reparse links from the existing file if it exists
+            if os.path.exists(filename):
+                try:
+                    new_links = parse_links_from_file(filename)
+                    for new_title, new_url in new_links:
+                        if new_title not in VISITED_PAGES:
+                            queue.append((new_title, new_url))
+                except Exception as e:
+                    print(f"Error parsing links from {filename}: {e}")
         try:
             page.goto(url)
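Taken on its own, the new else branch behaves roughly like the self-contained sketch below: when a fetch yields no content, the cached file's timestamp is refreshed with os.utime so it is not treated as stale, and its links are re-parsed so the crawl can still enqueue pages reachable from it, with any parse failure reported instead of aborting the crawl. The handle_empty_fetch name is hypothetical, and queue, VISITED_PAGES, and the parse_links_from_file stub are stand-ins for the real objects defined elsewhere in the script.

import os
import re
from collections import deque

VISITED_PAGES = set()   # stand-in for the script's visited-page bookkeeping
queue = deque()         # stand-in for the BFS queue of (title, url) pairs

def parse_links_from_file(filename):
    # Stand-in for the real parser defined earlier in the script (see the
    # hunk around line 88): extracts [[wiki link]] titles with a regex.
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
    return [(title, title) for title in re.findall(r'\[\[([^\]]+)\]\]', content)]

def handle_empty_fetch(title, filename):
    # Mirrors the new else branch: keep the cached file "fresh" even though
    # nothing new was downloaded for this page.
    os.utime(filename, None)
    print(f"No content found for {title}. Skipping further processing.")

    # Reparse links from the existing file, if any, so already-cached pages
    # can still feed the queue; report parse failures instead of crashing.
    if os.path.exists(filename):
        try:
            for new_title, new_url in parse_links_from_file(filename):
                if new_title not in VISITED_PAGES:
                    queue.append((new_title, new_url))
        except Exception as e:
            print(f"Error parsing links from {filename}: {e}")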