import asyncio

import aiohttp


async def run_scraping(url, timestamps, scrape_function, concurrency, user_agent):
    """
    Scrape the archived snapshots of `url` at the given timestamps concurrently.
    The `concurrency` parameter caps the number of simultaneous connections to the web archive.
    """
    # Use a semaphore to limit the number of concurrent connections to the Internet Archive
    sem = asyncio.Semaphore(concurrency)
    # Use one session for all requests to benefit from connection pooling
    async with aiohttp.ClientSession(headers={'User-Agent': user_agent}) as session:
        # Create one scraping coroutine per archived snapshot
        coroutines = [scrape_archive(session, url, timestamp, scrape_function, sem) for timestamp in timestamps]
        # Run the coroutines and gather their (timestamp, result) pairs
        results = await asyncio.gather(*coroutines)
        # Collect the successful results into a dictionary keyed by timestamp, dropping failed scrapes
        return {timestamp: result for timestamp, result in results if result is not None}
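

# `scrape_archive` is defined elsewhere in the article. As a reference for the
# shape assumed above (it must return a (timestamp, result) pair, with None for
# failures), a minimal sketch might look like this; the Wayback Machine URL
# format and the error handling are assumptions, not the article's actual code:
async def scrape_archive(session, url, timestamp, scrape_function, sem):
    """Fetch one archived snapshot and apply scrape_function to its HTML."""
    archive_url = f'https://web.archive.org/web/{timestamp}/{url}'
    async with sem:  # hold a semaphore slot only while the request is in flight
        try:
            async with session.get(archive_url) as response:
                html = await response.text()
        except aiohttp.ClientError:
            # A failed request yields None, which run_scraping filters out
            return timestamp, None
    return timestamp, scrape_function(html)


# Example invocation (hypothetical URL, timestamps, and parser):
# results = asyncio.run(run_scraping(
#     url='example.com',
#     timestamps=['20200101000000', '20210101000000'],
#     scrape_function=lambda html: len(html),
#     concurrency=5,
#     user_agent='my-scraper/0.1',
# ))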