def parse_captcha(link, status):
    """Fetch ``link`` and, if a CAPTCHA page comes back, try to solve it by OCR.

    When ``status`` equals 0 the page at ``link`` is downloaded with a
    User-Agent taken from ``ua.random``.  If the HTML contains a
    ``<div class="a-row a-text-center">`` holding an ``<img>``, that image is
    downloaded, sharpened, passed to ``pytesseract.image_to_string`` and the
    recognised text is POSTed back to ``link`` as the ``field-keywords`` form
    field.  The function then calls itself to fetch ``link`` again, repeating
    until that div no longer appears.

    Args:
        link: URL to fetch and, when a CAPTCHA is found, to POST the answer to.
        status: Work is done only when this equals 0; any other value makes
            the call a no-op.

    Returns:
        None in every case.  ``status`` is an ordinary argument, so nothing
        is reported back to the caller through it.
    """
    if status != 0:
        return

    opener = urllib2.build_opener()
    header = ua.random
    # Single-argument print(...) emits the same text as the old
    # ``print a, b`` statement and also parses as a function call.
    print("%s %s" % ("\n header : ", header))
    print("%s %s" % ("\n link : ", link))
    opener.addheaders = [('User-agent', header)]
    response = opener.open(link)
    try:
        data = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(data, 'html.parser')
    div1 = soup.find("div", {"class": "a-row a-text-center"})
    if div1 is None:
        # No CAPTCHA container on the page: nothing to solve.
        return

    img = div1.find("img")
    if img is None:
        # The container exists but holds no image, so there is nothing to
        # OCR; indexing None would otherwise raise TypeError.
        return

    image = img["src"]
    print("\n captcha..")
    print("%s %s" % ("image : ", image))
    image = Image.open(StringIO(requests.get(image).content))
    # Image.filter() returns a new image instead of modifying in place, so
    # the result has to be kept for the sharpening to reach the OCR step.
    image = image.filter(ImageFilter.SHARPEN)
    captcha = pytesseract.image_to_string(image)
    print("%s %s" % ("captcha : ", captcha))

    values = {'field-keywords': captcha}
    data = urllib.urlencode(values)
    req = urllib2.Request(link, data, {'User-agent': header})
    resp = urllib2.urlopen(req)
    try:
        # The answer page itself is not inspected; it is only drained.
        resp.read()
    finally:
        resp.close()

    # NOTE(review): the retry is unbounded. If OCR never produces an accepted
    # answer this keeps requesting ``link`` until Python's recursion limit is
    # hit. Consider a maximum attempt count.
    parse_captcha(link, status)
评论列表
文章目录