search_result_with_lock.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:web_develop 作者: dongweiming 项目源码 文件源码
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)

    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass

        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # ???????, ??????
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print 'retry too much!'
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    articles = results.find_all(
        'div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    if page_container and u'???' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号