def process(keyword, page):
    """Scrape one page of Google search results for *keyword*.

    Parameters
    ----------
    keyword : str
        Search query; percent-encoded before being placed in the URL.
    page : int
        Zero-based result-page index; 100 results are requested per page,
        so the `start` offset is ``page * 100``.

    Returns
    -------
    dict
        ``{"urlinfos": [...]}`` where each entry is a dict with keys
        ``url``, ``title`` and ``info`` (snippet text). Result blocks that
        fail to parse are logged and skipped, not fatal.
    """
    # Local imports keep this block self-contained; html.unescape replaces
    # the deprecated HTMLParser().unescape (removed in Python 3.9).
    import html
    from urllib.parse import quote_plus

    # Percent-encode the query so spaces/special characters don't break the URL.
    url = ('https://www.google.com/search?q=%s&start=%s&num=100'
           % (quote_plus(keyword), page * 100))
    urlinfos = []
    # Fetch the SERP HTML. Use a distinct name so the `page` parameter
    # is not shadowed (the original rebound `page` here).
    serp_html = ct.crawlerTool.getPage(url)
    # Each organic result sits in a <div class="g"> container.
    segments = ct.crawlerTool.getXpath('//div[@class="g"]', serp_html)
    for segment in segments:
        try:
            urlinfo = {}
            # First <h3><a> inside the result block: link target and title.
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            # Snippet lives in <div class="s">; unescape HTML entities
            # (&amp; etc.) left over after text extraction.
            urlinfo['info'] = html.unescape(
                ct.crawlerTool.extractorText(
                    ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            urlinfos.append(urlinfo)
        except Exception:  # narrow from bare except: don't trap SystemExit/KeyboardInterrupt
            # A result block missing url/title/snippet is skipped, not fatal.
            print('error')
            traceback.print_exc()
    return {"urlinfos": urlinfos}
# Comment list (leftover blog-template footer text, kept as a comment so the file parses)
# Article table of contents