def parse_captcha(self, link, status):
    """Fetch ``link`` and, if it serves a CAPTCHA page, OCR and submit the answer.

    The method only acts when ``status == 0``; for any other value it prints
    the two debug lines and returns.  The page is downloaded with a random
    User-Agent and searched for a ``div`` whose class is
    ``"a-row a-text-center"``.  The presence of that div is treated as
    "this is a CAPTCHA page": its ``img`` is downloaded, sharpened, read with
    pytesseract, and the recognised text is POSTed back to ``link`` as the
    ``field-keywords`` form field.  The method then calls itself again to
    re-check the page.

    Args:
        link: URL to fetch, and the URL the CAPTCHA answer is POSTed to.
        status: ``0`` to run the check; any other value skips it.

    Returns:
        None in every case.  Nothing is reported back to the caller; the
        outcome is only visible through the printed debug output.

    Raises:
        Nothing.  Every ``Exception`` (network errors, a missing ``img`` tag,
        OCR failures, ...) is caught, printed and swallowed.
    """
    # Single-argument print(...) with %-formatting emits the same text as the
    # print statement under Python 2 and also parses under Python 3.
    print("\n\n status in captcha :  %s" % (status,))
    print("\n link in captcha :  %s" % (link,))
    try:
        if status != 0:
            return

        opener = urllib2.build_opener()
        header = ua.random
        print("\n header :  %s" % (header,))
        print("\n link :  %s" % (link,))
        opener.addheaders = [('User-agent', header)]
        data = opener.open(link).read()
        soup = BeautifulSoup(data, 'html.parser')

        div1 = soup.find("div", {"class": "a-row a-text-center"})
        if div1 is None:
            # No CAPTCHA container on the page: nothing to solve.
            # (The original set the local ``status = 1`` here, which had no
            # effect outside this call and has been dropped.)
            return

        image_url = div1.find("img")["src"]
        print("\n captcha..")
        print("image :  %s" % (image_url,))
        image = Image.open(StringIO(requests.get(image_url).content))
        # Image.filter() returns a new image rather than modifying in place;
        # keep the result so the OCR actually sees the sharpened picture.
        image = image.filter(ImageFilter.SHARPEN)
        captcha = pytesseract.image_to_string(image)
        print("captcha :  %s" % (captcha,))

        # Supplying a data payload makes urllib2 issue a POST.
        form = urllib.urlencode({'field-keywords': captcha})
        req = urllib2.Request(link, form, {'User-agent': header})
        # The response body is read but not inspected.
        urllib2.urlopen(req).read()

        # NOTE(review): ``status`` is passed through unchanged, so this keeps
        # retrying until a fetched page has no CAPTCHA div or an exception
        # (including hitting the recursion limit) is raised and caught below.
        # Consider capping the number of attempts.
        self.parse_captcha(link, status)
    except Exception as e:
        print("\n Exception :  %s" % (e,))
评论列表
文章目录