# minecraft.wiki-mirror/backup.py
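"""Back up wiki pages by crawling https://minecraft.wiki with Playwright.

Starting from Main_Page, the script breadth-first crawls internal /w/ links,
pulls each page's raw wikitext out of the edit form's textarea (wpTextbox1),
and writes it to a .txt file in the wiki_backup directory.

Requires the playwright package and a Chromium build, e.g.:
    pip install playwright
    playwright install chromium
"""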
import os
import time
import random
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin

# Configuration
BASE_URL = 'https://minecraft.wiki' # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup' # Directory to save the pages
MAX_RETRIES = 5 # Maximum number of retries for each request
VISITED_PAGES = set() # To keep track of visited pages


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def fetch_page_content(page, url):
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # Look for the link to the page's edit view
            edit_link = page.query_selector('a[href*="?action=edit"]')
            if not edit_link:
                print(f"Edit link not found for {url}")
                return None
            edit_href = edit_link.get_attribute('href')
            if not edit_href:
                print(f"Could not get edit URL for {url}")
                return None

            # Construct the full edit URL and open it
            edit_url = urljoin(BASE_URL, edit_href)
            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # The edit form's textarea holds the raw wikitext
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None
            content = textarea.input_value()
            return content
        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries
    return None


def save_page_content(directory, title, content):
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)


def get_links_from_page(page, base_url):
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]
            links.add((title, full_url))
    return links
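

# Breadth-first crawl: pop a title off the queue, save its wikitext, collect
# the page's /w/ links, and enqueue any titles that have not been visited yet.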
def bfs_scrape(start_title):
    create_directory(DIRECTORY)
    with sync_playwright() as p:
        # Launch before the try block so the finally clause never calls
        # close() on a browser that failed to start
        browser = p.chromium.launch(headless=True)  # Set to False if you want to see the browser
        try:
            page = browser.new_page()
            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]
            page_count = 0
            while queue:
                title, url = queue.pop(0)
                if title in VISITED_PAGES:
                    continue
                print(f"Processing {title}...")
                VISITED_PAGES.add(title)

                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")

                try:
                    page.goto(url)
                    time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
                    # Get all links from the current page
                    new_links = get_links_from_page(page, BASE_URL)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                except Exception as e:
                    print(f"Error fetching links from {url}: {e}")
                    continue
        finally:
            browser.close()
    return page_count


if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)
    print(f"Successfully downloaded {total_pages} pages.")