scraper.py source code

python

Project: pylongecity  Author: cyrbon
import concurrent.futures
import re
from typing import List

import requests
from pyquery import PyQuery as pq

# `memory`, `scrape_posts`, and `Post` are defined elsewhere in the
# pylongecity source (`memory` is presumably a joblib.Memory cache).

def scrape_all_posts_unflat(url: str, verbose: bool, cache: bool) -> List[List['Post']]:
    unflat_posts = []

    fget = requests.get if not cache else memory.cache(requests.get)
    page = fget(url).text  # The first page gets downloaded twice: here, and again in the loop below.
    # ^ We could scrape this page now and drop its URL from the list, but KISS.
    n_of_pages = pq(page).find('.pagejump > a').eq(0).text().strip().split(' ')[-1]  # Gets '10' from 'Page 1 of 10'

    # If there is only one page, there is no page jumper and n_of_pages is ''
    if n_of_pages == '':
        urls = [url]
    else:
        url_prefix_match = re.match('(.*)(page-[0-9]+)', url)
        url_prefix = url if url_prefix_match is None else url_prefix_match.group(1)
        if not url_prefix.endswith('/'):
            url_prefix += '/'
        urls = [(url_prefix + 'page-' + str(n + 1)) for n in range(int(n_of_pages))]

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        fscrape = scrape_posts if not cache else memory.cache(scrape_posts, ignore=['verbose'])
        futures = [executor.submit(fscrape, url, verbose) for url in urls]
        results, _ = concurrent.futures.wait(futures)
        for result in results:
            unflat_posts.append(result.result())
    return unflat_posts
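
A minimal usage sketch (not part of the original file): the thread URL is a made-up placeholder, and the flattening step is an assumption about how a caller would consume the nested result.

python
# Hypothetical usage of scrape_all_posts_unflat; the URL is illustrative only.
thread_url = 'https://www.longecity.org/forum/topic/12345-example-thread/'

unflat = scrape_all_posts_unflat(thread_url, verbose=True, cache=False)
# Flatten List[List[Post]] into a single List[Post].
posts = [post for page_posts in unflat for post in page_posts]

Note that concurrent.futures.wait returns an unordered set of completed futures, so the per-page lists in the result are not guaranteed to follow page order.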