Check update times

parent 00a27f7300
commit 751c2044ba

1 changed file with 18 additions and 1 deletion

backup.py (18 additions, 1 deletion)
@@ -3,12 +3,14 @@ import time
 import random
 from playwright.sync_api import sync_playwright
 from urllib.parse import urljoin
+import datetime
 
 # Configuration
 BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
 DIRECTORY = 'wiki_backup'  # Directory to save the pages
 MAX_RETRIES = 5  # Maximum number of retries for each request
 VISITED_PAGES = set()  # To keep track of visited pages
+ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds
 
 def create_directory(directory):
     if not os.path.exists(directory):
@@ -67,6 +69,9 @@ def save_page_content(directory, title, content):
     filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
     with open(filename, 'w', encoding='utf-8') as file:
         file.write(content)
+
+    # Update the timestamp to current time
+    os.utime(filename, None)
 
 def get_links_from_page(page, base_url):
     links = set()
@@ -95,6 +100,15 @@ def bfs_scrape(start_title):
         if title in VISITED_PAGES:
             continue
 
+        # Check if the file already exists and was last modified more than a week ago
+        filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
+        if os.path.exists(filename):
+            last_modified_time = os.path.getmtime(filename)
+            current_time = time.time()
+            if current_time - last_modified_time < ONE_WEEK_SECONDS:
+                print(f"Skipping {title} as it was recently checked.")
+                continue
+
         print(f"Processing {title}...")
         VISITED_PAGES.add(title)
 
@@ -103,6 +117,9 @@ def bfs_scrape(start_title):
             save_page_content(DIRECTORY, title, content)
             page_count += 1
             print(f"Saved {title}")
+        else:
+            # Update the timestamp to current time even if no new content was fetched
+            os.utime(filename, None)
 
         try:
             page.goto(url)
@@ -127,4 +144,4 @@ if __name__ == '__main__':
     start_title = 'Main_Page'  # Replace with the starting page title if different
     total_pages = bfs_scrape(start_title)
 
-    print(f"Successfully downloaded {total_pages} pages.")
+    print(f"Successfully downloaded or skipped {total_pages} pages.")
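
For context, the whole change reduces to a one-week mtime gate plus a timestamp "touch": skip a page whose saved file was modified within the last week, and reset the file's modification time whenever it is checked. A minimal standalone sketch of that pattern follows; the is_fresh and touch helper names are illustrative, not part of backup.py.

# Sketch of the freshness-check pattern this commit adds (assumed helpers,
# not the actual backup.py functions).
import os
import time

ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds

def is_fresh(path, max_age_seconds=ONE_WEEK_SECONDS):
    # True if the file exists and its mtime is within max_age_seconds of now.
    return os.path.exists(path) and (time.time() - os.path.getmtime(path)) < max_age_seconds

def touch(path):
    # Passing None to os.utime resets atime and mtime to the current time,
    # which is how the script records "checked recently" without rewriting content.
    os.utime(path, None)

if __name__ == '__main__':
    path = 'wiki_backup/Main_Page.txt'  # Hypothetical saved page
    if is_fresh(path):
        print(f"Skipping {path} as it was recently checked.")
    else:
        print(f"Processing {path}...")
        # ...fetch and save the page here, then mark it checked:
        # touch(path)

Using the filesystem mtime as the "last checked" record means no separate index file is needed, at the cost of the timestamp no longer reflecting when the content actually changed.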