ignore non-english pages

2024-12-22 05:27:26 -05:00 · 2024-12-22 05:27:26 -05:00 · cbb55555db
commit cbb55555db
parent 68c2fee367
1 changed file with 7 additions and 3 deletions
--- a/backup.py
+++ b/backup.py
@@ -3,7 +3,6 @@ import time
 import random
 from playwright.sync_api import sync_playwright
 from urllib.parse import urljoin
-import datetime
 import re

 # Configuration
@@ -81,7 +80,11 @@ def get_links_from_page(page, base_url):
        if href and href.startswith('/w/'):
            full_url = urljoin(base_url, href)
            title = href.split('/w/')[-1]
-            links.add((title, full_url))
+            
+            # Ignore pages with language codes
+            if not re.match(r'^[a-z]{2}:', title):
+                links.add((title, full_url))
+    
    return links

 def parse_links_from_file(filename):
@@ -100,7 +103,8 @@ def parse_links_from_file(filename):
            title = match
        
        title = title.strip()
-        if title and not title.startswith(':') and not title.startswith('#'):
+        # Ignore pages with language codes
+        if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
            full_url = urljoin(BASE_URL, f'/w/{title}')
            links.add((title, full_url))