Check update times

This commit is contained in:
Ryan Voots 2024-12-19 16:21:13 -05:00
parent 00a27f7300
commit 751c2044ba

View file

@ -3,12 +3,14 @@ import time
import random
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin
import datetime
# Configuration
BASE_URL = 'https://minecraft.wiki' # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup' # Directory to save the pages
MAX_RETRIES = 5 # Maximum number of retries for each request
VISITED_PAGES = set() # To keep track of visited pages
ONE_WEEK_SECONDS = 7 * 24 * 60 * 60 # One week in seconds
def create_directory(directory):
if not os.path.exists(directory):
@ -67,6 +69,9 @@ def save_page_content(directory, title, content):
filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
with open(filename, 'w', encoding='utf-8') as file:
file.write(content)
# Update the timestamp to current time
os.utime(filename, None)
def get_links_from_page(page, base_url):
links = set()
@ -95,6 +100,15 @@ def bfs_scrape(start_title):
if title in VISITED_PAGES:
continue
# Skip this page if its saved file already exists and was modified less than a week ago
filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
if os.path.exists(filename):
last_modified_time = os.path.getmtime(filename)
current_time = time.time()
if current_time - last_modified_time < ONE_WEEK_SECONDS:
print(f"Skipping {title} as it was recently checked.")
continue
print(f"Processing {title}...")
VISITED_PAGES.add(title)
@ -103,6 +117,9 @@ def bfs_scrape(start_title):
save_page_content(DIRECTORY, title, content)
page_count += 1
print(f"Saved {title}")
else:
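# Refresh the file's timestamp even though no new content was fetched.
# NOTE(review): os.utime raises FileNotFoundError if the page was never saved before; confirm the file exists on this path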
os.utime(filename, None)
try:
page.goto(url)
@ -127,4 +144,4 @@ if __name__ == '__main__':
start_title = 'Main_Page' # Replace with the starting page title if different
total_pages = bfs_scrape(start_title)
print(f"Successfully downloaded {total_pages} pages.")
print(f"Successfully downloaded or skipped {total_pages} pages.")