# cell_phones_and_accessories.py -- source file, Python code snippet

def parse_captcha(self, link, status):
        print "\n\n status in captcha : ", status
        print "\n link in captcha : ", link
        try:
            if status == 0:
                #proxies = ['http://43.242.104.43', 'http://115.113.43.215', 'http://115.113.43.215']
                #proxy = random.choice(proxies)
                #proxy = urllib2.ProxyHandler({'http': 'http://115.113.43.215'})
                opener = urllib2.build_opener()
                header = ua.random
                print "\n header : ", header
                print "\n link : ", link
                opener.addheaders = [('User-agent', header)]
                data = opener.open(link).read()

                soup = BeautifulSoup(data, 'html.parser')
                div1 = soup.find("div", {"class": "a-row a-text-center"})
                if div1 is not None:
                    img = div1.find("img")
                    image = img["src"]
                    print "\n captcha.."
                    print "image : ", image
                    image = Image.open(StringIO(requests.get(image).content))
                    image.filter(ImageFilter.SHARPEN)
                    captcha = pytesseract.image_to_string(image)
                    print "captcha : ", captcha
                    values = {'field-keywords' : captcha}
                    data = urllib.urlencode(values)
                    req = urllib2.Request(link, data, {'User-agent': header})
                    resp = urllib2.urlopen(req)
                    the_page = resp.read()
                    self.parse_captcha(link, status)
                else:
                    status = 1
                    return
        except Exception as e:
            print "\n Exception : ", e