Ignore non-English pages
This commit is contained in:
parent
68c2fee367
commit
cbb55555db
1 changed file with 7 additions and 3 deletions
10
backup.py
10
backup.py
|
@@ -3,7 +3,6 @@ import time
|
|||
import random
|
||||
from playwright.sync_api import sync_playwright
|
||||
from urllib.parse import urljoin
|
||||
import datetime
|
||||
import re
|
||||
|
||||
# Configuration
|
||||
|
@@ -81,7 +80,11 @@ def get_links_from_page(page, base_url):
|
|||
if href and href.startswith('/w/'):
|
||||
full_url = urljoin(base_url, href)
|
||||
title = href.split('/w/')[-1]
|
||||
links.add((title, full_url))
|
||||
|
||||
# Ignore pages with language codes
|
||||
if not re.match(r'^[a-z]{2}:', title):
|
||||
links.add((title, full_url))
|
||||
|
||||
return links
|
||||
|
||||
def parse_links_from_file(filename):
|
||||
|
@@ -100,7 +103,8 @@ def parse_links_from_file(filename):
|
|||
title = match
|
||||
|
||||
title = title.strip()
|
||||
if title and not title.startswith(':') and not title.startswith('#'):
|
||||
# Ignore pages with language codes
|
||||
if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
|
||||
full_url = urljoin(BASE_URL, f'/w/{title}')
|
||||
links.add((title, full_url))
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue