# search_result_with_lock.py
# -*- coding: utf-8 -*-
from gevent import sleep, GreenletExit
from requests.exceptions import ConnectionError, Timeout
from bs4 import BeautifulSoup
from mongoengine import DoesNotExist

# fetch, put_new_page, save_article, the Proxy model, SEARCH_URL and
# SEARCH_TEXT are defined elsewhere in this project.


def save_search_result(page, queue, retry=0):
    # Grab a random proxy from the pool for this request.
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            # Too many failures: hand the page back to the queue for
            # another worker and terminate this greenlet.
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            # This proxy looks dead or blocked; drop it from the pool
            # before retrying with a fresh one.
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # No results container: the proxy has most likely been blocked,
        # so back off briefly and retry with another proxy.
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print('retry too much!')
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    # Match result blocks whose class attribute contains 'wx-rb'; the
    # `x and` guard skips divs that carry no class attribute at all.
    articles = results.find_all(
        'div', lambda x: x and 'wx-rb' in x)
    for article in articles:
        save_article(article)
    page_container = soup.find(id='pagebar_container')
    # u'下一页' is the "next page" link label on the result page.
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for next_page in range(current_page + 1, last_page + 1):
            put_new_page(next_page, queue)
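
fetch is defined elsewhere in the project; here is a minimal sketch of what it might look like, assuming a plain requests GET routed through the given HTTP proxy. The timeout value and the proxies dict format are illustrative assumptions, not the project's actual implementation.

import requests

def fetch(url, proxy=None):
    # Hypothetical helper: route the request through the proxy and let
    # Timeout/ConnectionError propagate to the caller's except clause.
    proxies = {'http': 'http://{}'.format(proxy)} if proxy else None
    return requests.get(url, proxies=proxies, timeout=10)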
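
As a usage sketch: page numbers flow through a shared gevent queue, so a driver can seed page 1, spawn a worker per page, and pick up any follow-up pages the workers enqueue. Everything here except save_search_result is an assumption about the surrounding project; the pool size, the queue timeout, and this put_new_page are illustrative.

import gevent
from gevent.pool import Pool
from gevent.queue import Queue, Empty

def put_new_page(page, queue):
    # Hypothetical helper: hand a page number to the shared queue.
    queue.put(page)

def main():
    queue = Queue()
    pool = Pool(5)
    put_new_page(1, queue)
    while True:
        try:
            # Wait briefly so running workers can enqueue new pages.
            page = queue.get(timeout=10)
        except Empty:
            break
        pool.spawn(save_search_result, page, queue)
    pool.join()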