diff --git a/backup.py b/backup.py index 02d1b31..3b7d8f3 100644 --- a/backup.py +++ b/backup.py @@ -3,7 +3,6 @@ import time import random from playwright.sync_api import sync_playwright from urllib.parse import urljoin -import datetime import re # Configuration @@ -81,7 +80,11 @@ def get_links_from_page(page, base_url): if href and href.startswith('/w/'): full_url = urljoin(base_url, href) title = href.split('/w/')[-1] - links.add((title, full_url)) + + # Ignore pages with language codes + if not re.match(r'^[a-z]{2}:', title): + links.add((title, full_url)) + return links def parse_links_from_file(filename): @@ -100,7 +103,8 @@ def parse_links_from_file(filename): title = match title = title.strip() - if title and not title.startswith(':') and not title.startswith('#'): + # Ignore pages with language codes + if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title): full_url = urljoin(BASE_URL, f'/w/{title}') links.add((title, full_url))