Ignore non-English pages
This commit is contained in:
parent
68c2fee367
commit
cbb55555db
1 changed file with 7 additions and 3 deletions
10
backup.py
10
backup.py
|
@@ -3,7 +3,6 @@ import time
|
||||||
import random
|
import random
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
import datetime
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
|
@@ -81,7 +80,11 @@ def get_links_from_page(page, base_url):
|
||||||
if href and href.startswith('/w/'):
|
if href and href.startswith('/w/'):
|
||||||
full_url = urljoin(base_url, href)
|
full_url = urljoin(base_url, href)
|
||||||
title = href.split('/w/')[-1]
|
title = href.split('/w/')[-1]
|
||||||
links.add((title, full_url))
|
|
||||||
|
# Ignore pages with language codes
|
||||||
|
if not re.match(r'^[a-z]{2}:', title):
|
||||||
|
links.add((title, full_url))
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def parse_links_from_file(filename):
|
def parse_links_from_file(filename):
|
||||||
|
@@ -100,7 +103,8 @@ def parse_links_from_file(filename):
|
||||||
title = match
|
title = match
|
||||||
|
|
||||||
title = title.strip()
|
title = title.strip()
|
||||||
if title and not title.startswith(':') and not title.startswith('#'):
|
# Ignore pages with language codes
|
||||||
|
if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
|
||||||
full_url = urljoin(BASE_URL, f'/w/{title}')
|
full_url = urljoin(BASE_URL, f'/w/{title}')
|
||||||
links.add((title, full_url))
|
links.add((title, full_url))
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue