wayback.py source code

python

Project: waybackscraper  Author: abrenaut
import asyncio

import aiohttp


async def run_scraping(url, timestamps, scrape_function, concurrency, user_agent):
    """
    Run the scraping function asynchronously on the given archives.
    The concurrency parameter limits the number of concurrent connections to the web archive.
    """
    # Use a semaphore to limit the number of concurrent connections to the internet archive
    sem = asyncio.Semaphore(concurrency)

    # Use one session to benefit from connection pooling
    async with aiohttp.ClientSession(headers={'User-Agent': user_agent}) as session:
        # Create a scraping coroutine for each archive timestamp
        coroutines = [scrape_archive(session, url, timestamp, scrape_function, sem) for timestamp in timestamps]

        # Wait for the coroutines to finish and gather the results
        results = await asyncio.gather(*coroutines)

    # Compile the valid scraping results into a dictionary keyed by timestamp
    return {timestamp: result for timestamp, result in results if result is not None}
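The semaphore-plus-gather pattern used above can be illustrated in isolation. The sketch below is hypothetical and self-contained (asyncio only, no aiohttp, with a sleep standing in for the HTTP request); it shows how the semaphore caps the number of coroutines running at once while `gather` still collects every result, and how `None` results are filtered out just as in `run_scraping`:

```python
import asyncio


async def fetch_one(timestamp, sem, peak):
    # Acquire the semaphore before doing any "work"; at most
    # `concurrency` coroutines execute this block concurrently.
    async with sem:
        peak['current'] += 1
        peak['max'] = max(peak['max'], peak['current'])
        await asyncio.sleep(0.01)  # stand-in for the HTTP request
        peak['current'] -= 1
        # Return None for odd timestamps to simulate failed scrapes
        result = None if timestamp % 2 else f'page-{timestamp}'
        return timestamp, result


async def run_demo(timestamps, concurrency):
    sem = asyncio.Semaphore(concurrency)
    peak = {'current': 0, 'max': 0}
    coroutines = [fetch_one(t, sem, peak) for t in timestamps]
    results = await asyncio.gather(*coroutines)
    # Same filtering as run_scraping: keep only non-None results
    return {t: r for t, r in results if r is not None}, peak['max']


results, peak = asyncio.run(run_demo(range(10), concurrency=3))
```

With ten timestamps and a concurrency of 3, the observed peak never exceeds 3, and only the even ("successful") timestamps survive the final dictionary comprehension.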