def search(inurl, maxc):
urls = []
for site in sitearray:
page = 0
try:
while page < int(maxc):
jar = cookielib.FileCookieJar("cookies")
query = inurl+"+site:"+site
results_web = 'http://www.search-results.com/web?q='+query+'&hl=en&page='+repr(page)+'&src=hmp'
request_web =urllib2.Request(results_web)
agent = random.choice(header)
request_web.add_header('User-Agent', agent)
opener_web = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
text = opener_web.open(request_web).read()
stringreg = re.compile('(?<=href=")(.*?)(?=")')
names = stringreg.findall(text)
page += 1
for name in names:
if name not in urls:
if re.search(r'\(',name) or re.search("<", name) or re.search("\A/", name) or re.search("\A(http://)\d", name):
pass
elif re.search("google",name) or re.search("youtube", name) or re.search("phpbuddy", name) or re.search("iranhack",name) or re.search("phpbuilder",name) or re.search("codingforums", name) or re.search("phpfreaks", name) or re.search("%", name) or re.search("facebook", name) or re.search("twitter", name):
pass
else:
urls.append(name)
percent = int((1.0*page/int(maxc))*100)
urls_len = len(urls)
sys.stdout.write("\rSite: %s | Collected urls: %s | Percent Done: %s | Current page no.: %s <> " % (site,repr(urls_len),repr(percent),repr(page)))
sys.stdout.flush()
except(KeyboardInterrupt):
pass
tmplist = []
print "\n\n[+] URLS (Sin Clasificar): ",len(urls)
for url in urls:
try:
host = url.split("/",3)
domain = host[2]
if domain not in tmplist and "=" in url:
finallist.append(url)
tmplist.append(domain)
except:
pass
print "[+] URLS (Clasificada) : ",len(finallist)
return finallist
# 评论列表 / 文章目录 — stray scraped-page footer text ("comment list" /
# "article index"), not code; commented out so the module parses.