import re
import urllib2

import requests
from bs4 import BeautifulSoup


def SearchGoogle(num, target, language):
    # Collect domains and subdomains of target from up to num Google result pages
    start_page = 0
    user_agent = {'User-agent': 'Mozilla/5.0'}
    raw_links = []
    # Split the target into domain and extension, e.g. "example.es" -> "example" / "es"
    domain = target.split(".")[0]
    extension = target.split(".")[1]
    print "\nLooking for domains and subdomains of target", target
    for start in range(start_page, (start_page + num)):
        # Google dork: any subdomain of the target, excluding the www host itself
        search_url = ("https://www.google.com/search?q=(site:*." + target +
                      "+OR+site:*" + target + "+OR+site:" + domain + "*." + extension +
                      ")+-site:www." + target + "&lr=lang_" + language +
                      "&filter=&num=100&start=" + str(start * 100))
        try:
            response = requests.get(search_url, headers=user_agent)
        # ConnectTimeout must be caught before its parent class RequestException
        except requests.exceptions.ConnectTimeout:
            print "\nError: timeout connecting to", target
            continue
        except requests.exceptions.RequestException:
            print "\nError connecting to server!"
            continue
        try:
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")
            if response.text.find("Our systems have detected unusual traffic") != -1:
                print "CAPTCHA detected!!! Maybe try from another IP..."
                return True
            # Extract result URLs through a regular expression on the href attribute
            raw_links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
            for link in raw_links:
                # Skip Google cache links
                if link["href"].find("webcache.googleusercontent.com") == -1:
                    nlink = link["href"].replace("/url?q=", "")
                    # Strip Google tracking parameters and URL-decode
                    nlink = re.sub(r'&sa=.*', "", nlink)
                    nlink = urllib2.unquote(nlink).decode('utf8')
                    # Keep only the host part of the link
                    nlink_clean = nlink.split("//")[-1].split("/")[0]
                    url_google.append(nlink_clean)  # url_google: global result list
        except Exception as e:
            print e
        # Verify whether this page returned any results; if not, stop paginating
        if len(raw_links) < 2:
            print "No more results!!!"
            return True
    return False
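
A minimal usage sketch for driving this function. The url_google global list, the target "example.es", and the page count are assumptions for illustration, not part of the original listing:

# Hypothetical driver for SearchGoogle; url_google is the global
# list the function appends discovered hosts to.
url_google = []

if __name__ == "__main__":
    # Assumed example arguments: 3 result pages, Spanish-language target
    stopped = SearchGoogle(3, "example.es", "es")
    # Print the deduplicated hosts that were collected
    for host in sorted(set(url_google)):
        print host
    if stopped:
        print "Search stopped early (CAPTCHA hit or no more results)."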