def doc_exalead(domain, user_agents, prox, q):
    """Search Exalead for documents (xls/doc/pdf/ppt) hosted on *domain*.

    Pages through the first 80 Exalead web results (10 per page), collects
    the href of each result item, and finally puts the full list of
    URL-unquoted document links onto *q*.

    Args:
        domain: Target domain, injected into the ``site:`` search operator.
        user_agents: Sequence of User-Agent strings; one is picked at random
            per request to vary the client fingerprint.
        prox: Truthy to route HTTP traffic through a local intercepting
            proxy at 127.0.0.1:8080 (e.g. Burp); falsy for direct requests.
        q: Queue-like object (``put`` method) receiving the result list.
    """
    document_list = []
    info('Exalead Document Search Started')
    # Build the proxy mapping once, outside the loop. requests treats
    # proxies=None the same as omitting the argument, so one get() call
    # covers both the proxied and direct cases.
    proxies = {'http': 'http://127.0.0.1:8080'} if prox else None
    for start in range(0, 80, 10):
        # Rotate the User-Agent per page request.
        headers = {"Connection": "close",
                   "User-Agent": random.choice(user_agents),
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'en-US,en;q=0.5',
                   'Accept-Encoding': 'gzip, deflate'}
        link = 'http://www.exalead.com/search/web/results/?search_language=&q=(filetype:xls+OR+filetype:doc+OR++filetype:pdf+OR+filetype:ppt)+site:{}&search_language=&elements_per_page=10&start_index={}'.format(domain, start)
        try:
            response = requests.get(link, headers=headers, proxies=proxies, verify=False)
            soup = BeautifulSoup(response.text, "lxml")
            # Exalead serves a captcha form when it rate-limits; no point
            # requesting further pages once that happens.
            if soup.find('label', {'class': 'control-label', 'for': 'id_captcha'}):
                info("So you don't like spinach?")
                info("Captchas are preventing some document searches.")
                break
            for result in soup.findAll('li', {'class': 'media'}):
                document = urllib2.unquote(result.find('a', href=True)['href'])
                document_list.append(document)
        except Exception:
            # Best-effort search: log and move on to the next result page.
            info('An Unhandled Exception Has Occured, Please Check The Log For Details: ' + INFO_LOG_FILE)
            continue
        time.sleep(10)  # throttle between pages to avoid tripping the captcha
    info('Exalead Document Search Finished')
    info('Potential Exalead Documents Found: {}'.format(len(document_list)))
    q.put(document_list)
# NOTE(review): the two lines below are scrape residue from the page this
# file was copied from ("comment list" / "article table of contents" in
# Chinese) — as bare expressions they would raise NameError at import time,
# so they are commented out rather than deleted:
# 评论列表
# 文章目录