import re
import urllib2

import requests
from bs4 import BeautifulSoup


def SearchGoogle(num, target, language):
    # Collect domains and subdomains of target from up to num Google result pages
    start_page = 0
    user_agent = {'User-agent': 'Mozilla/5.0'}
    raw_links = []
    # Split the target into domain and extension, e.g. "example.es" -> "example" / "es"
    domain = target.split(".")[0]
    extension = target.split(".")[1]
    print "\nLooking for domains and subdomains of target", target
    for start in range(start_page, (start_page + num)):
        # Google dork: any subdomain of the target, excluding the www host itself
        search_url = ("https://www.google.com/search?q=(site:*." + target +
                      "+OR+site:*" + target + "+OR+site:" + domain + "*." + extension +
                      ")+-site:www." + target + "&lr=lang_" + language +
                      "&filter=&num=100&start=" + str(start * 100))
        try:
            response = requests.get(search_url, headers=user_agent)
        # ConnectTimeout must be caught before its parent class RequestException
        except requests.exceptions.ConnectTimeout:
            print "\nError: timeout connecting to", target
            continue
        except requests.exceptions.RequestException:
            print "\nError connecting to server!"
            continue
        try:
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")
            if response.text.find("Our systems have detected unusual traffic") != -1:
                print "CAPTCHA detected!!! Maybe try from another IP..."
                return True
            # Extract result URLs through a regular expression on the href attribute
            raw_links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
            for link in raw_links:
                # Skip Google cache links
                if link["href"].find("webcache.googleusercontent.com") == -1:
                    nlink = link["href"].replace("/url?q=", "")
                    # Strip Google tracking parameters and URL-decode
                    nlink = re.sub(r'&sa=.*', "", nlink)
                    nlink = urllib2.unquote(nlink).decode('utf8')
                    # Keep only the host part of the link
                    nlink_clean = nlink.split("//")[-1].split("/")[0]
                    url_google.append(nlink_clean)  # url_google: global result list
        except Exception as e:
            print e
        # Verify whether this page returned any results; if not, stop paginating
        if len(raw_links) < 2:
            print "No more results!!!"
            return True
    return False
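
A minimal usage sketch for driving this function. The url_google global list, the target "example.es", and the page count are assumptions for illustration, not part of the original listing:

# Hypothetical driver for SearchGoogle; url_google is the global
# list the function appends discovered hosts to.
url_google = []

if __name__ == "__main__":
    # Assumed example arguments: 3 result pages, Spanish-language target
    stopped = SearchGoogle(3, "example.es", "es")
    # Print the deduplicated hosts that were collected
    for host in sorted(set(url_google)):
        print host
    if stopped:
        print "Search stopped early (CAPTCHA hit or no more results)."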