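"""Back up a MediaWiki-based wiki as raw wikitext using Playwright.

Starting from a seed page, the script crawls the wiki breadth-first:
it opens each article, follows its edit link, saves the wikitext from
the edit textarea into DIRECTORY, and queues the internal links it
finds. Pages whose local copy is less than a week old are skipped and
only re-parsed for links.

Requires the Playwright package and a Chromium install
(`pip install playwright` followed by `playwright install chromium`).
"""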
import os
import random
import re
import time
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

# Configuration
BASE_URL = 'https://minecraft.wiki'  # Replace with your wiki's base URL
DIRECTORY = 'wiki_backup'  # Directory to save the pages
MAX_RETRIES = 5  # Maximum number of retries for each request
VISITED_PAGES = set()  # To keep track of visited pages
ONE_WEEK_SECONDS = 7 * 24 * 60 * 60  # One week in seconds


def create_directory(directory):
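    """Create the backup directory if it does not already exist."""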
    if not os.path.exists(directory):
        os.makedirs(directory)


def fetch_page_content(page, url):
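    """Open url, follow its edit link, and return the page's raw wikitext.

    Prefers the source editor over the WYSIWYG editor. Returns None if no
    edit link or edit textarea is found, or if all retries fail.
    """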
    for attempt in range(MAX_RETRIES):
        try:
            page.goto(url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # Collect the edit links on the page
            edit_links = page.query_selector_all('a[href*="?action=edit"]')
            if not edit_links:
                print(f"No edit links found for {url}")
                return None

            edit_source_link = None
            edit_wysiwyg_link = None

            for link in edit_links:
                href = link.get_attribute('href')
                if href and '?action=edit&mode=source' in href:
                    edit_source_link = urljoin(BASE_URL, href)
                elif href and '?action=edit' in href:
                    edit_wysiwyg_link = urljoin(BASE_URL, href)

            # Prioritize the source editor over the WYSIWYG editor
            if edit_source_link:
                edit_url = edit_source_link
            elif edit_wysiwyg_link:
                edit_url = edit_wysiwyg_link
            else:
                print(f"Could not find valid edit URL for {url}")
                return None

            page.goto(edit_url)
            time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

            # The raw wikitext lives in the edit form's textarea
            textarea = page.query_selector('textarea#wpTextbox1')
            if not textarea:
                print(f"Textarea not found for {url}")
                return None

            return textarea.input_value()

        except Exception as e:
            print(f"Error fetching content for {url} (Attempt {attempt + 1}): {e}")
            time.sleep(random.uniform(2, 5))  # Random delay between retries

    return None  # All retries failed


def save_page_content(directory, title, content):
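    """Write the wikitext to '<directory>/<sanitized title>.txt' and refresh its mtime."""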
    # Replace problematic characters in the filename
    filename = os.path.join(directory, f"{title.replace('/', '_').replace(':', '-')}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

    # Update the timestamp to current time
    os.utime(filename, None)


def get_links_from_page(page, base_url):
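    """Return (title, url) pairs for internal '/w/' links on the current page,
    skipping titles with two-letter language prefixes.
    """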
    links = set()
    for link in page.query_selector_all('a'):
        href = link.get_attribute('href')
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]

            # Ignore pages with language codes
            if not re.match(r'^[a-z]{2}:', title):
                links.add((title, full_url))

    return links


def parse_links_from_file(filename):
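    """Extract [[wiki link]] targets from a saved wikitext file.

    Returns a set of (title, url) pairs, skipping anchors, leading-colon
    links, and language-prefixed titles.
    """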
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Use a simple regex to extract [[wiki links]]
    pattern = r'\[\[([^\]]+)\]\]'
    matches = re.findall(pattern, content)

    links = set()
    for match in matches:
        # A link may carry display text after a pipe: [[Title|shown text]]
        if '|' in match:
            title = match.split('|')[0]
        else:
            title = match

        # Normalize to the underscore form used in URLs and filenames
        title = title.strip().replace(' ', '_')
        # Ignore anchors, leading-colon links, and language-code prefixes
        if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
            full_url = urljoin(BASE_URL, f'/w/{title}')
            links.add((title, full_url))

    return links


def bfs_scrape(start_title):
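    """Crawl the wiki breadth-first starting from start_title.

    Saves each page's wikitext into DIRECTORY, skips local copies newer
    than one week (re-parsing them for links only), and returns the
    number of pages saved during this run.
    """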
    create_directory(DIRECTORY)

    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)  # Set to False if you want to see the browser
            page = browser.new_page()

            queue = [(start_title, f"{BASE_URL}/w/{start_title}")]
            page_count = 0

            while queue:
                title, url = queue.pop(0)

                if title in VISITED_PAGES:
                    continue

                # Skip pages whose local copy was refreshed less than a week ago
                filename = os.path.join(DIRECTORY, f"{title.replace('/', '_').replace(':', '-')}.txt")
                if os.path.exists(filename):
                    last_modified_time = os.path.getmtime(filename)
                    if time.time() - last_modified_time < ONE_WEEK_SECONDS:
                        print(f"Skipping {title} as it was recently checked.")
                        VISITED_PAGES.add(title)  # Avoid re-reading this file on later hits

                        # Re-parse links from the existing file so the crawl can continue
                        new_links = parse_links_from_file(filename)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))

                        continue

                print(f"Processing {title}...")
                VISITED_PAGES.add(title)

                content = fetch_page_content(page, url)
                if content:
                    save_page_content(DIRECTORY, title, content)
                    page_count += 1
                    print(f"Saved {title}")

                    # Parse links from the wikitext we just saved (the browser is
                    # still on the edit view here, not the rendered article)
                    new_links = parse_links_from_file(filename)
                    for new_title, new_url in new_links:
                        if new_title not in VISITED_PAGES:
                            queue.append((new_title, new_url))
                else:
                    print(f"No content found for {title}. Skipping further processing.")

                    # Re-parse links from the existing file if it exists
                    if os.path.exists(filename):
                        try:
                            new_links = parse_links_from_file(filename)
                            for new_title, new_url in new_links:
                                if new_title not in VISITED_PAGES:
                                    queue.append((new_title, new_url))
                        except Exception as e:
                            print(f"Error parsing links from {filename}: {e}")

                    try:
                        page.goto(url)
                        time.sleep(random.uniform(1, 3))  # Random delay to mimic human behavior

                        # Get all links from the rendered page
                        new_links = get_links_from_page(page, BASE_URL)
                        for new_title, new_url in new_links:
                            if new_title not in VISITED_PAGES:
                                queue.append((new_title, new_url))

                    except Exception as e:
                        print(f"Error fetching links from {url}: {e}")
                        continue

        finally:
            if browser:
                browser.close()

    return page_count


if __name__ == '__main__':
    start_title = 'Main_Page'  # Replace with the starting page title if different
    total_pages = bfs_scrape(start_title)

    print(f"Saved {total_pages} pages this run.")