Ignore non-English pages

This commit is contained in:
Ryan Voots 2024-12-22 05:27:26 -05:00
parent 68c2fee367
commit cbb55555db

View file

@@ -3,7 +3,6 @@ import time
import random
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin
import datetime
import re
# Configuration
@@ -81,7 +80,11 @@ def get_links_from_page(page, base_url):
if href and href.startswith('/w/'):
full_url = urljoin(base_url, href)
title = href.split('/w/')[-1]
links.add((title, full_url))
# Ignore pages with language codes
if not re.match(r'^[a-z]{2}:', title):
links.add((title, full_url))
return links
def parse_links_from_file(filename):
@@ -100,7 +103,8 @@ def parse_links_from_file(filename):
title = match
title = title.strip()
if title and not title.startswith(':') and not title.startswith('#'):
# Ignore pages with language codes
if title and not title.startswith(':') and not title.startswith('#') and not re.match(r'^[a-z]{2}:', title):
full_url = urljoin(BASE_URL, f'/w/{title}')
links.add((title, full_url))