def doc_exalead(domain, user_agents, prox, q):
    """Search Exalead for documents (xls/doc/pdf/ppt) hosted on *domain*.

    Pages through the first 80 Exalead web results (10 per page), collects
    the href of each result item, and finally puts the full list of
    URL-unquoted document links onto *q*.

    Args:
        domain: Target domain, injected into the ``site:`` search operator.
        user_agents: Sequence of User-Agent strings; one is picked at random
            per request to vary the client fingerprint.
        prox: Truthy to route HTTP traffic through a local intercepting
            proxy at 127.0.0.1:8080 (e.g. Burp); falsy for direct requests.
        q: Queue-like object (``put`` method) receiving the result list.
    """
    document_list = []
    info('Exalead Document Search Started')
    # Build the proxy mapping once, outside the loop. requests treats
    # proxies=None the same as omitting the argument, so one get() call
    # covers both the proxied and direct cases.
    proxies = {'http': 'http://127.0.0.1:8080'} if prox else None
    for start in range(0, 80, 10):
        # Rotate the User-Agent per page request.
        headers = {"Connection": "close",
                   "User-Agent": random.choice(user_agents),
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'en-US,en;q=0.5',
                   'Accept-Encoding': 'gzip, deflate'}
        link = 'http://www.exalead.com/search/web/results/?search_language=&q=(filetype:xls+OR+filetype:doc+OR++filetype:pdf+OR+filetype:ppt)+site:{}&search_language=&elements_per_page=10&start_index={}'.format(domain, start)
        try:
            response = requests.get(link, headers=headers, proxies=proxies, verify=False)
            soup = BeautifulSoup(response.text, "lxml")
            # Exalead serves a captcha form when it rate-limits; no point
            # requesting further pages once that happens.
            if soup.find('label', {'class': 'control-label', 'for': 'id_captcha'}):
                info("So you don't like spinach?")
                info("Captchas are preventing some document searches.")
                break
            for result in soup.findAll('li', {'class': 'media'}):
                document = urllib2.unquote(result.find('a', href=True)['href'])
                document_list.append(document)
        except Exception:
            # Best-effort search: log and move on to the next result page.
            info('An Unhandled Exception Has Occured, Please Check The Log For Details: ' + INFO_LOG_FILE)
            continue
        time.sleep(10)  # throttle between pages to avoid tripping the captcha
    info('Exalead Document Search Finished')
    info('Potential Exalead Documents Found: {}'.format(len(document_list)))
    q.put(document_list)
# NOTE(review): the two lines below are scrape residue from the page this
# file was copied from ("comment list" / "article table of contents" in
# Chinese) — as bare expressions they would raise NameError at import time,
# so they are commented out rather than deleted:
# 评论列表
# 文章目录