import os
import re
import time
import random
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

# Configuration
BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup'            # Directory to save the pages
MAX_RETRIES = 5                      # Maximum number of retries for each request
VISITED_PAGES = set()                # To keep track of visited pages
ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def fetch_page_content(page, url):
    """Open a page's edit view and return its raw wikitext, or None on failure."""
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # Look for edit links on the page
            edit_links = page.query_selector_all('a[href*="?action=edit"]')
            if not edit_links:
                print(f"No edit links found for {url}")
                return None

            edit_source_link = None
            edit_wysiwyg_link = None
            for link in edit_links:
                href = link.get_attribute('href')
                if href and '?action=edit&mode=source' in href:
                    edit_source_link = urljoin(BASE_URL, href)
                elif href and '?action=edit' in href:
                    edit_wysiwyg_link = urljoin(BASE_URL, href)

            # Prioritize the source-mode edit link over the WYSIWYG one
            if edit_source_link:
                edit_url = edit_source_link
            elif edit_wysiwyg_link:
                edit_url = edit_wysiwyg_link
            else:
                print(f"Could not find valid edit URL for {url}")
                return None

            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # The edit form's textarea holds the raw wikitext
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None

            return textarea.input_value()
        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries
    return None


def save_page_content(directory, title, content):
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)
    # Update the timestamp to the current time
    os.utime(filename, None)


def get_links_from_page(page, base_url):
    """Collect (title, url) pairs for all internal /w/ links on the current page."""
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]
            # Ignore pages with language codes
            if not re.match(r'^[a-z]{2}:', title):
                links.add((title, full_url))
    return links


def parse_links_from_file(filename):
    """Extract (title, url) pairs from [[wikilinks]] in previously saved wikitext."""
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract wiki links
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    links = set()
    for match in matches:
        title = match.split('|')[0] if '|' in match else match
        title = title.strip()
        # Ignore anchors, leading-colon links, and pages with language codes
        if title and not title.startswith((':', '#')) and not re.match(r'^[a-z]{2}:', title):
            full_url = urljoin(BASE_URL, f'/w/{title}')
            links.add((title, full_url))
    return links


def bfs_scrape(start_title):
    """Breadth-first crawl of the wiki, saving each page's wikitext to DIRECTORY."""
    create_directory(DIRECTORY)
    page_count = 0

    with sync_playwright() as p:
        # Set headless=False if you want to see the browser
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]

            while queue:
                title, url = queue.pop(0)
                if title in VISITED_PAGES:
                    continue
                # Mark as visited right away so recently checked pages cannot be re-enqueued endlessly
                VISITED_PAGES.add(title)

                # Skip the download if the file already exists and was saved within the last week
                filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
                if os.path.exists(filename):
                    last_modified_time = os.path.getmtime(filename)
                    if time.time() - last_modified_time < ONE_WEEK_SECONDS:
                        print(f"Skipping {title} as it was recently checked.")
                        # Reparse links from the existing file so the crawl still expands
                        for new_title, new_url in parse_links_from_file(filename):
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))
                        continue

                print(f"Processing {title}...")
                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")

                    # fetch_page_content leaves the browser on the edit view, so return
                    # to the article itself before collecting its links
                    page.goto(url)
                    time.sleep(random.uniform(1, 3))
                    for new_title, new_url in get_links_from_page(page, BASE_URL):
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                else:
                    print(f"No content found for {title}. Skipping further processing.")

                    # Reparse links from the existing file if it exists
                    if os.path.exists(filename):
                        try:
                            for new_title, new_url in parse_links_from_file(filename):
                                if new_title not in VISITED_PAGES:
                                    queue.append((new_title, new_url))
                        except Exception as e:
                            print(f"Error parsing links from {filename}: {e}")

                    # Fall back to whatever links are visible on the page itself
                    try:
                        page.goto(url)
                        time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior
                        for new_title, new_url in get_links_from_page(page, BASE_URL):
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))
                    except Exception as e:
                        print(f"Error fetching links from {url}: {e}")
        finally:
            browser.close()

    return page_count


if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)
    print(f"Successfully downloaded or skipped {total_pages} pages.")