def doc_bing(domain, user_agents, prox, q):
    """Search Bing for Office/PDF documents hosted on *domain*.

    Pages through Bing results (``first`` = 1, 11, ... 291), collects the
    URL of every organic result (``li.b_algo``), percent-decodes it, and
    finally puts the accumulated list on *q*.

    Args:
        domain: Target domain used in the ``site:`` filter.
        user_agents: Sequence of User-Agent strings to rotate through.
        prox: When truthy, route requests through http://127.0.0.1:8080
            (e.g. a local intercepting proxy such as Burp).
        q: Queue-like object; receives the list of document URLs.
    """
    document_list = []
    info('Bing Document Search Started')
    # Bing paginates via the 'first' parameter; stepping by 10 walks 30 pages.
    for start in range(1, 300, 10):
        ua = random.choice(user_agents)
        headers = {
            "Connection": "close",
            "User-Agent": ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
        }
        payload = {'q': 'filetype:(doc dot docx docm dotx dotm docb xls xlt xlm xlsx xlsm xltx xltm xlsb xla xlam xll xlw ppt pot pps pptx pptm potx potm ppam ppsx ppsm sldx sldm pub pdf) site:{}'.format(domain), 'first': start}
        link = 'http://www.bing.com/search'
        try:
            if prox:
                proxy = {'http': 'http://127.0.0.1:8080'}
                response = requests.get(link, headers=headers, proxies=proxy, params=payload, verify=False)
            else:
                response = requests.get(link, headers=headers, params=payload, verify=False)
            soup = BeautifulSoup(response.text, "lxml")
            for div in soup.findAll('li', {'class': 'b_algo'}):
                h2 = div.find('h2')
                if h2 is None:
                    # Result block without a title heading - skip it rather
                    # than crash into the broad except below.
                    continue
                anchor = h2.find('a', href=True)
                if anchor is None:
                    continue
                document_list.append(urllib2.unquote(anchor['href']))
        except requests.exceptions.ChunkedEncodingError:
            # Bing occasionally truncates the chunked response; move on to
            # the next results page.
            continue
        except Exception:
            traceback.print_exc()
            continue
    info('Bing Document Search Finished')
    q.put(document_list)