def getsearchresult_url(ip, url):
    """Scrape one search-result page and record the distinct site roots it lists.

    The page at ``url`` is fetched with ``gethtml`` and parsed with
    ``etree.HTML``.  Every child of the element whose id is ``b_results`` that
    carries an ``h2/a`` link with both an ``href`` and a text node is treated
    as one search hit.  Hits whose host is ``ip.chinaz.com`` are ignored.  Each
    remaining hit is reduced to ``scheme://netloc/``; roots not yet present in
    the module-level ``check`` collection are appended to it, and a
    ``{'url': root, 'title': title}`` dict is appended to the module-level
    ``end`` collection.  Both ``check`` and ``end`` are defined outside this
    function and are mutated in place.

    Args:
        ip: Not used by this function; kept so existing callers keep working.
        url: Address of the result page to fetch.

    Returns:
        ``'error'`` if the page could not be fetched, has no ``b_results``
        element, or any exception was raised while processing it.
        ``(0, next_url, end)`` if the page has a ``sb_pagN`` link, where
        ``next_url`` is that link's href prefixed with ``https://www.bing.com``.
        ``(1, None, end)`` if there is no such link.
    """
    try:
        html = gethtml(url)
        if not html:
            print('??????url:%s' % url)
            return 'error'
        # gethtml reports its own failures with the same 'error' sentinel.
        if html == 'error':
            return 'error'

        containers = etree.HTML(html).xpath('//*[@id="b_results"]')
        if not containers:
            return 'error'
        results = containers[0]

        # Phase 1: collect every usable hit before touching shared state, so
        # an exception raised while parsing leaves `end`/`check` unmodified.
        hits = []
        for item in results:
            hrefs = item.xpath('.//h2/a/@href')
            titles = item.xpath('.//h2/a/text()')
            if not (hrefs and titles):
                continue
            parsed = urlparse(hrefs[0])
            if parsed.netloc == 'ip.chinaz.com':
                continue
            hits.append((parsed, titles[0]))

        # Phase 2: record each site root once, keyed on scheme + host.
        for parsed, title in hits:
            site_root = parsed.scheme + '://' + parsed.netloc + '/'
            if site_root not in check:
                end.append({'url': site_root, 'title': title})
                check.append(site_root)

        next_page = results.xpath('.//*[@class="sb_pagN"]/@href')
        if next_page:
            return 0, 'https://www.bing.com' + next_page[0], end
        return 1, None, end
    except Exception as e:
        # Deliberate best-effort: report the problem and hand the caller the
        # same 'error' sentinel used for fetch failures.
        print(e)
        return 'error'
评论列表
文章目录