# minecraft.wiki-mirror/backup.py
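"""Back up wiki pages by crawling https://minecraft.wiki with Playwright.

Starting from Main_Page, the script breadth-first crawls internal /w/ links,
pulls each page's raw wikitext out of the edit form's textarea (wpTextbox1),
and writes it to a .txt file in the wiki_backup directory.

Requires the playwright package and a Chromium build, e.g.:
    pip install playwright
    playwright install chromium
"""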
import os
import time
import random
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin

# Configuration
BASE_URL = 'https://minecraft.wiki' # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup' # Directory to save the pages
MAX_RETRIES = 5 # Maximum number of retries for each request
VISITED_PAGES = set() # To keep track of visited pages


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def fetch_page_content(page, url):
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # Look for the link to the page's edit view
            edit_link = page.query_selector('a[href*="?action=edit"]')
            if not edit_link:
                print(f"Edit link not found for {url}")
                return None
            edit_href = edit_link.get_attribute('href')
            if not edit_href:
                print(f"Could not get edit URL for {url}")
                return None

            # Construct the full edit URL and open it
            edit_url = urljoin(BASE_URL, edit_href)
            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # The edit form's textarea holds the raw wikitext
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None
            content = textarea.input_value()
            return content
        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries
    return None


def save_page_content(directory, title, content):
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)


def get_links_from_page(page, base_url):
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]
            links.add((title, full_url))
    return links
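

# Breadth-first crawl: pop a title off the queue, save its wikitext, collect
# the page's /w/ links, and enqueue any titles that have not been visited yet.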
def bfs_scrape(start_title):
    create_directory(DIRECTORY)
    with sync_playwright() as p:
        # Launch before the try block so the finally clause never calls
        # close() on a browser that failed to start
        browser = p.chromium.launch(headless=True)  # Set to False if you want to see the browser
        try:
            page = browser.new_page()
            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]
            page_count = 0
            while queue:
                title, url = queue.pop(0)
                if title in VISITED_PAGES:
                    continue
                print(f"Processing {title}...")
                VISITED_PAGES.add(title)

                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")

                try:
                    page.goto(url)
                    time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
                    # Get all links from the current page
                    new_links = get_links_from_page(page, BASE_URL)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                except Exception as e:
                    print(f"Error fetching links from {url}: {e}")
                    continue
        finally:
            browser.close()
    return page_count


if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)
    print(f"Successfully downloaded {total_pages} pages.")