# minecraft.wiki-mirror/backup.py

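"""Back up the wikitext of a MediaWiki site (minecraft.wiki by default).

Starting from a seed page, the script crawls the wiki breadth-first with
Playwright (headless Chromium), opens each article's edit form, copies the raw
wikitext out of the edit textarea, and writes it to wiki_backup/<Title>.txt.
Files refreshed within the last week are skipped, but their saved wikitext is
still parsed for further links to follow.
"""
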
import os
import random
import re
import time
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

# Configuration
BASE_URL = 'https://minecraft.wiki' # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup' # Directory to save the pages
MAX_RETRIES = 5 # Maximum number of retries for each request
VISITED_PAGES = set() # To keep track of visited pages
ONE_WEEK_SECONDS = 7 * 24 * 60 * 60 # One week in seconds
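
# The constants above can also be overridden at run time; a minimal sketch using
# environment variables (the names WIKI_BASE_URL and WIKI_BACKUP_DIR are
# illustrative and are not defined anywhere else in this repo):
#
#   BASE_URL = os.environ.get('WIKI_BASE_URL', BASE_URL)
#   DIRECTORY = os.environ.get('WIKI_BACKUP_DIR', DIRECTORY)
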
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

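# A shorter equivalent on Python 3.2+ would be os.makedirs(directory, exist_ok=True).
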
def fetch_page_content(page, url):
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
            # Look for the edit links on the rendered page
            edit_links = page.query_selector_all('a[href*="?action=edit"]')
            if not edit_links:
                print(f"No edit links found for {url}")
                return None
            edit_source_link = None
            edit_wysiwyg_link = None
            for link in edit_links:
                href = link.get_attribute('href')
                if href and '?action=edit&mode=source' in href:
                    edit_source_link = urljoin(BASE_URL, href)
                elif href and '?action=edit' in href:
                    edit_wysiwyg_link = urljoin(BASE_URL, href)
            # Prioritize the edit source link
            if edit_source_link:
                edit_url = edit_source_link
            elif edit_wysiwyg_link:
                edit_url = edit_wysiwyg_link
            else:
                print(f"Could not find valid edit URL for {url}")
                return None
            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
            # Grab the textarea that holds the page's wikitext
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None
            content = textarea.input_value()
            return content
        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries
    return None

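# The two edit-link shapes fetch_page_content distinguishes, sketched with the
# default start page (the exact query-string layout on the live wiki may differ):
#
#   https://minecraft.wiki/w/Main_Page?action=edit&mode=source   -> preferred: raw wikitext ("source") editor
#   https://minecraft.wiki/w/Main_Page?action=edit                -> fallback: visual/WYSIWYG editor
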
def save_page_content(directory, title, content):
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)
    # Update the timestamp to current time
    os.utime(filename, None)

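# Filename sanitization example (hypothetical title):
#   "Tutorials/Shelters:Basics"  ->  wiki_backup/Tutorials_Shelters-Basics.txt
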
def get_links_from_page(page, base_url):
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]
            # Ignore pages with language codes
            if not re.match(r'^[a-z]{2}:', title):
                links.add((title, full_url))
    return links

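# Examples of the href filtering above (hypothetical links):
#   /w/Main_Page      -> kept as ("Main_Page", "https://minecraft.wiki/w/Main_Page")
#   /w/fr:Main_Page   -> dropped (two-letter language-code prefix)
#   /images/logo.png  -> dropped (does not start with /w/)
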
def parse_links_from_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
    # Use a simple regex to extract wiki links
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)
    links = set()
    for match in matches:
        if '|' in match:
            title = match.split('|')[0]
        else:
            title = match
        title = title.strip()
        # Ignore pages with language codes
        if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
            full_url = urljoin(BASE_URL, f'/w/{title}')
            links.add((title, full_url))
    return links

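# Examples of the wikitext parsing above (hypothetical wikitext):
#   [[Crafting Table]]           -> ("Crafting Table", "https://minecraft.wiki/w/Crafting Table")
#   [[Crafting Table|the table]] -> same as above; the piped display text is discarded
#   [[fr:Crafting Table]]        -> dropped (language-code prefix)
#   [[#History]]                 -> dropped (section link within the same page)
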
def bfs_scrape(start_title):
    create_directory(DIRECTORY)
    with sync_playwright() as p:
        try:
            browser = p.chromium.launch(headless=True)  # Set to False if you want to see the browser
            page = browser.new_page()
            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]
            page_count = 0
            while queue:
                title, url = queue.pop(0)
                if title in VISITED_PAGES:
                    continue
                # Mark the title as visited up front so that pages skipped below
                # cannot be re-queued endlessly by pages that link back to them.
                VISITED_PAGES.add(title)
                # Skip the page if its file already exists and was saved within the last week
                filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
                if os.path.exists(filename):
                    last_modified_time = os.path.getmtime(filename)
                    current_time = time.time()
                    if current_time - last_modified_time < ONE_WEEK_SECONDS:
                        print(f"Skipping {title} as it was recently checked.")
                        # Reparse links from the existing file
                        new_links = parse_links_from_file(filename)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))
                        continue
                print(f"Processing {title}...")
                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")
                    # At this point the browser is still on the edit view, so the
                    # rendered article links are not on screen; harvest wiki links
                    # from the wikitext that was just saved instead.
                    new_links = parse_links_from_file(filename)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                else:
                    print(f"No content found for {title}. Skipping further processing.")
                    # Reparse links from the existing file if it exists
                    if os.path.exists(filename):
                        try:
                            new_links = parse_links_from_file(filename)
                            for new_title, new_url in new_links:
                                if new_title not in VISITED_PAGES:
                                    queue.append((new_title, new_url))
                        except Exception as e:
                            print(f"Error parsing links from {filename}: {e}")
                    try:
                        page.goto(url)
                        time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
                        # Get all links from the rendered page
                        new_links = get_links_from_page(page, BASE_URL)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))
                    except Exception as e:
                        print(f"Error fetching links from {url}: {e}")
                        continue
        finally:
            browser.close()
    return page_count

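# Design note: the crawl is breadth-first because queue.pop(0) always takes the
# oldest discovered title. For very large crawls, a collections.deque with
# popleft() would avoid the O(n) cost of popping from the front of a list
# (a possible optimization, not applied here).
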
if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)
    print(f"Downloaded {total_pages} pages (recently backed-up pages were skipped).")
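
# How to run (assumes the Playwright Python package and a Chromium build are
# installed; exact package versions are not pinned in this repo):
#
#   pip install playwright
#   playwright install chromium
#   python backup.py
#
# Saved pages land in wiki_backup/, e.g. wiki_backup/Main_Page.txt.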