def cxeSearch(go_inurl,go_site,go_cxe,go_ftype,maxc):
uRLS = []
counter = 0
while counter < int(maxc):
jar = cookielib.FileCookieJar("cookies")
query = 'q='+go_inurl+'+'+go_site+'+'+go_ftype
results_web = 'http://www.google.com/cse?'+go_cxe+'&'+query+'&num='+str(gnum)+'&hl=en&lr=&ie=UTF-8&start=' + repr(counter) + '&sa=N'
request_web = urllib2.Request(results_web)
agent = random.choice(header)
request_web.add_header('User-Agent', agent)
opener_web = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
text = opener_web.open(request_web).read()
strreg = re.compile('(?<=href=")(.*?)(?=")')
names = strreg.findall(text)
counter += 100
for name in names:
if name not in uRLS:
if re.search(r'\(', name) or re.search("<", name) or re.search("\A/", name) or re.search("\A(http://)\d", name):
pass
elif re.search("google", name) or re.search("youtube", name) or re.search(".gov", name) or re.search("%", name):
pass
else:
uRLS.append(name)
tmpList = []; finalList = []
print "[+] URLS (unsorted) :", len(uRLS)
for entry in uRLS:
try:
t2host = entry.split("/",3)
domain = t2host[2]
if domain not in tmpList and "=" in entry:
finalList.append(entry)
tmpList.append(domain)
except:
pass
print "[+] URLS (sorted) :", len(finalList)
return finalList
评论列表
文章目录