Download fetched web search results in parallel

This commit is contained in:
oobabooga 2025-05-28 20:34:14 -07:00
parent 7080a02252
commit 75d6cfd14d

View file

@@ -1,3 +1,5 @@
import concurrent.futures
from concurrent.futures import as_completed
from datetime import datetime
import requests
@@ -5,7 +7,6 @@ from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from modules.logging_colors import logger
from modules.text_generation import generate_reply
def get_current_timestamp():
@@ -40,27 +41,50 @@ def download_web_page(url, timeout=5):
return f"[Error downloading content from {url}: {str(e)}]"
def perform_web_search(query, num_pages=3, max_workers=5):
    """Perform a DuckDuckGo web search and download each result's content in parallel.

    Args:
        query: Search query string passed to DuckDuckGo.
        num_pages: Maximum number of search results to retrieve.
        max_workers: Size of the thread pool used for parallel downloads.

    Returns:
        A list of dicts with 'title', 'url' and 'content' keys, in the same
        order as the search results. Entries whose download raised are kept
        with an empty 'content'. Returns [] if the search itself fails.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=num_pages))

        # Prepare download tasks: (url, title, original position)
        download_tasks = []
        for i, result in enumerate(results):
            url = result.get('href', '')
            title = result.get('title', f'Search Result {i+1}')
            download_tasks.append((url, title, i))

        # Pre-allocate so results land in search order even though
        # downloads complete out of order.
        search_results = [None] * len(download_tasks)

        # Download pages in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all download tasks, mapping each future back to its task
            future_to_task = {
                executor.submit(download_web_page, url): (url, title, index)
                for url, title, index in download_tasks
            }

            # Collect results as they complete
            for future in as_completed(future_to_task):
                url, title, index = future_to_task[future]
                try:
                    content = future.result()
                    search_results[index] = {
                        'title': title,
                        'url': url,
                        'content': content
                    }
                except Exception as e:
                    logger.error(f"Error downloading {url}: {e}")
                    # Include failed downloads with empty content
                    search_results[index] = {
                        'title': title,
                        'url': url,
                        'content': ''
                    }

        return search_results

    except Exception as e:
        logger.error(f"Error performing web search: {e}")
        return []