google.py source code

python

Project: searchForAll    Author: MemoryAndDream
import traceback
# HTMLParser().unescape() is the Python 2 API; on Python 3 use html.unescape() instead.
from HTMLParser import HTMLParser
# Assumption: ct is the project's crawlerTool module; the exact import path depends on the project layout.
import crawlerTool as ct


def process(keyword, page):
    # Request up to 100 results per query; `page` selects the result offset.
    url = 'https://www.google.com/search?q=%s&start=%s&num=100' % (keyword, page * 100)
    urlinfos = []
    # Expected shape of each entry (the title/info text in the original comment was garbled):
    # urlinfo1 = {"url": "http://www.baidu.com/link?url=966OdUyxuwFJoAYx_XGYq7_FiVLcej4qEA3Q84e-lLAtLPRGGHA6tsNFNsTN9zka&wd=&eqid=a64931cc000026c3000000035994fd9e",
    #             "title": "...", "info": "..."}
    html = ct.crawlerTool.getPage(url)
    # Each Google result is wrapped in a <div class="g"> block.
    segments = ct.crawlerTool.getXpath('//div[@class="g"]', html)
    for segment in segments:
        try:
            urlinfo = {}
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            # The snippet under <div class="s"> may contain HTML entities, so unescape it.
            urlinfo['info'] = HTMLParser().unescape(
                ct.crawlerTool.extractorText(ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            urlinfos.append(urlinfo)
        except Exception:
            print('error')
            traceback.print_exc()
    return {"urlinfos": urlinfos}
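
A minimal usage sketch, assuming the function above is saved as google.py in the project and crawlerTool is importable; the keyword and the printing loop here are illustrative only, not part of the original file:

# hypothetical caller for process(); page 0 requests the first 100 results
from google import process

if __name__ == '__main__':
    result = process('python crawler', 0)
    for item in result['urlinfos']:
        print(item['title'], item['url'])
        print(item['info'])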