scraping.py 文件源码

python
阅读 18 收藏 0 点赞 0 评论 0

项目:amazon 作者: parul1931 项目源码 文件源码
def parse_captcha(link, status, max_attempts=10):
    """Fetch *link* and, while a captcha page is served, try to solve it.

    Each attempt downloads the page with a random User-agent and looks for
    the ``div`` with class ``a-row a-text-center`` that wraps the captcha
    image.  If it is present, the image is downloaded, sharpened, run through
    OCR, and the recognised text is POSTed back to *link* as
    ``field-keywords``; the page is then fetched again to see whether the
    captcha is gone.

    Args:
        link: URL of the page to fetch (and to post the captcha answer to).
        status: Only ``0`` triggers any work; any other value returns
            immediately.
        max_attempts: Upper bound on fetch/solve rounds.  The original
            implementation recursed without limit, which ends in a
            recursion-depth crash when the captcha keeps coming back.

    Returns:
        None.  The function gives no indication of whether the captcha was
        solved; callers must re-request the page themselves.

    Raises:
        urllib2.URLError: (and subclasses) if fetching or posting fails.
    """
    if status != 0:
        return

    for _ in range(max_attempts):
        opener = urllib2.build_opener()
        # A fresh random User-agent is picked on every attempt.
        header = ua.random
        print("\n header :  %s" % (header,))
        print("\n link :  %s" % (link,))
        opener.addheaders = [('User-agent', header)]
        response = opener.open(link)
        try:
            data = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(data, 'html.parser')
        div1 = soup.find("div", {"class": "a-row a-text-center"})
        if div1 is None:
            # No captcha container on the page: nothing left to solve.
            return

        img = div1.find("img")
        image_url = img.get("src") if img is not None else None
        if not image_url:
            # Container present but no usable image; OCR is impossible.
            return

        print("\n captcha..")
        print("image :  %s" % (image_url,))
        image = Image.open(StringIO(requests.get(image_url).content))
        # Image.filter returns a new image; keep it so OCR sees the
        # sharpened version rather than the untouched original.
        image = image.filter(ImageFilter.SHARPEN)
        captcha = pytesseract.image_to_string(image)
        print("captcha :  %s" % (captcha,))

        post_data = urllib.urlencode({'field-keywords': captcha})
        req = urllib2.Request(link, post_data, {'User-agent': header})
        resp = urllib2.urlopen(req)
        try:
            # Drain the reply so the submission completes; its content is
            # not inspected, the next loop iteration re-fetches the page.
            resp.read()
        finally:
            resp.close()
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号