Check update times

parent 00a27f7300
commit 751c2044ba

1 changed file with 18 additions and 1 deletion

backup.py (18 additions, 1 deletion)
@@ -3,12 +3,14 @@ import time
 import random
 from playwright.sync_api import sync_playwright
 from urllib.parse import urljoin
+import datetime
 
 # Configuration
 BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
 DIRECTORY = 'wiki_backup'  # Directory to save the pages
 MAX_RETRIES = 5  # Maximum number of retries for each request
 VISITED_PAGES = set()  # To keep track of visited pages
+ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds
 
 def create_directory(directory):
     if not os.path.exists(directory):
@@ -67,6 +69,9 @@ def save_page_content(directory, title, content):
     filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
     with open(filename, 'w', encoding='utf-8') as file:
         file.write(content)
+
+    # Update the timestamp to current time
+    os.utime(filename, None)
 
 def get_links_from_page(page, base_url):
     links = set()
@@ -95,6 +100,15 @@ def bfs_scrape(start_title):
         if title in VISITED_PAGES:
             continue
 
+        # Check if the file already exists and was last modified more than a week ago
+        filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
+        if os.path.exists(filename):
+            last_modified_time = os.path.getmtime(filename)
+            current_time = time.time()
+            if current_time - last_modified_time < ONE_WEEK_SECONDS:
+                print(f"Skipping {title} as it was recently checked.")
+                continue
+
         print(f"Processing {title}...")
         VISITED_PAGES.add(title)
 
@@ -103,6 +117,9 @@ def bfs_scrape(start_title):
             save_page_content(DIRECTORY, title, content)
             page_count += 1
             print(f"Saved {title}")
+        else:
+            # Update the timestamp to current time even if no new content was fetched
+            os.utime(filename, None)
 
         try:
             page.goto(url)
@@ -127,4 +144,4 @@ if __name__ == '__main__':
     start_title = 'Main_Page'  # Replace with the starting page title if different
     total_pages = bfs_scrape(start_title)
 
-    print(f"Successfully downloaded {total_pages} pages.")
+    print(f"Successfully downloaded or skipped {total_pages} pages.")
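
For context, the whole change reduces to a one-week mtime gate plus a timestamp "touch": skip a page whose saved file was modified within the last week, and reset the file's modification time whenever it is checked. A minimal standalone sketch of that pattern follows; the is_fresh and touch helper names are illustrative, not part of backup.py.

# Sketch of the freshness-check pattern this commit adds (assumed helpers,
# not the actual backup.py functions).
import os
import time

ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds

def is_fresh(path, max_age_seconds=ONE_WEEK_SECONDS):
    # True if the file exists and its mtime is within max_age_seconds of now.
    return os.path.exists(path) and (time.time() - os.path.getmtime(path)) < max_age_seconds

def touch(path):
    # Passing None to os.utime resets atime and mtime to the current time,
    # which is how the script records "checked recently" without rewriting content.
    os.utime(path, None)

if __name__ == '__main__':
    path = 'wiki_backup/Main_Page.txt'  # Hypothetical saved page
    if is_fresh(path):
        print(f"Skipping {path} as it was recently checked.")
    else:
        print(f"Processing {path}...")
        # ...fetch and save the page here, then mark it checked:
        # touch(path)

Using the filesystem mtime as the "last checked" record means no separate index file is needed, at the cost of the timestamp no longer reflecting when the content actually changed.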