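"""Back up a MediaWiki-based wiki as raw wikitext using Playwright.

Starting from a seed page, the script crawls the wiki breadth-first:
it opens each article, follows its edit link, saves the wikitext from
the edit textarea into DIRECTORY, and queues the internal links it
finds. Pages whose local copy is less than a week old are skipped and
only re-parsed for links.

Requires the Playwright package and a Chromium install
(`pip install playwright` followed by `playwright install chromium`).
"""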
import os
import random
import re
import time
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

# Configuration
BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup'  # Directory to save the pages
MAX_RETRIES = 5  # Maximum number of retries for each request
VISITED_PAGES = set()  # To keep track of visited pages
ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds


def create_directory(directory):
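    """Create the backup directory if it does not already exist."""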
    if not os.path.exists(directory):
        os.makedirs(directory)


def fetch_page_content(page, url):
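    """Open url, follow its edit link, and return the page's raw wikitext.

    Prefers the source editor over the WYSIWYG editor. Returns None if no
    edit link or edit textarea is found, or if all retries fail.
    """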
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # Collect the edit links on the page
            edit_links = page.query_selector_all('a[href*="?action=edit"]')
            if not edit_links:
                print(f"No edit links found for {url}")
                return None

            edit_source_link = None
            edit_wysiwyg_link = None

            for link in edit_links:
                href = link.get_attribute('href')
                if href and '?action=edit&mode=source' in href:
                    edit_source_link = urljoin(BASE_URL, href)
                elif href and '?action=edit' in href:
                    edit_wysiwyg_link = urljoin(BASE_URL, href)

            # Prioritize the source editor over the WYSIWYG editor
            if edit_source_link:
                edit_url = edit_source_link
            elif edit_wysiwyg_link:
                edit_url = edit_wysiwyg_link
            else:
                print(f"Could not find valid edit URL for {url}")
                return None

            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # The raw wikitext lives in the edit form's textarea
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None

            return textarea.input_value()

        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries

    return None  # All retries failed


def save_page_content(directory, title, content):
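    """Write the wikitext to '<directory>/<sanitized title>.txt' and refresh its mtime."""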
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

    # Update the timestamp to current time
    os.utime(filename, None)


def get_links_from_page(page, base_url):
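    """Return (title, url) pairs for internal '/w/' links on the current page,
    skipping titles with two-letter language prefixes.
    """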
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]

            # Ignore pages with language codes
            if not re.match(r'^[a-z]{2}:', title):
                links.add((title, full_url))

    return links


def parse_links_from_file(filename):
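    """Extract [[wiki link]] targets from a saved wikitext file.

    Returns a set of (title, url) pairs, skipping anchors, leading-colon
    links, and language-prefixed titles.
    """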
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract [[wiki links]]
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    links = set()
    for match in matches:
        # A link may carry display text after a pipe: [[Title|shown text]]
        if '|' in match:
            title = match.split('|')[0]
        else:
            title = match

        # Normalize to the underscore form used in URLs and filenames
        title = title.strip().replace(' ', '_')
        # Ignore anchors, leading-colon links, and language-code prefixes
        if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
            full_url = urljoin(BASE_URL, f'/w/{title}')
            links.add((title, full_url))

    return links


def bfs_scrape(start_title):
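    """Crawl the wiki breadth-first starting from start_title.

    Saves each page's wikitext into DIRECTORY, skips local copies newer
    than one week (re-parsing them for links only), and returns the
    number of pages saved during this run.
    """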
    create_directory(DIRECTORY)

    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)  # Set to False if you want to see the browser
            page = browser.new_page()

            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]
            page_count = 0

            while queue:
                title, url = queue.pop(0)

                if title in VISITED_PAGES:
                    continue

                # Skip pages whose local copy was refreshed less than a week ago
                filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
                if os.path.exists(filename):
                    last_modified_time = os.path.getmtime(filename)
                    if time.time() - last_modified_time < ONE_WEEK_SECONDS:
                        print(f"Skipping {title} as it was recently checked.")
                        VISITED_PAGES.add(title)  # Avoid re-reading this file on later hits

                        # Re-parse links from the existing file so the crawl can continue
                        new_links = parse_links_from_file(filename)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))

                        continue

                print(f"Processing {title}...")
                VISITED_PAGES.add(title)

                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")

                    # Parse links from the wikitext we just saved (the browser is
                    # still on the edit view here, not the rendered article)
                    new_links = parse_links_from_file(filename)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                else:
                    print(f"No content found for {title}. Skipping further processing.")

                    # Re-parse links from the existing file if it exists
                    if os.path.exists(filename):
                        try:
                            new_links = parse_links_from_file(filename)
                            for new_title, new_url in new_links:
                                if new_title not in VISITED_PAGES:
                                    queue.append((new_title, new_url))
                        except Exception as e:
                            print(f"Error parsing links from {filename}: {e}")

                    try:
                        page.goto(url)
                        time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

                        # Get all links from the rendered page
                        new_links = get_links_from_page(page, BASE_URL)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))

                    except Exception as e:
                        print(f"Error fetching links from {url}: {e}")
                        continue

        finally:
            if browser:
                browser.close()

    return page_count


if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)

    print(f"Saved {total_pages} pages this run.")