def geturls(query, num):
    """Scrape Google result pages for *query* and return the hosts found.

    Result pages are requested ten hits at a time, starting at offset 10
    and stopping before offset ``num``. Each page is printed, stripped of
    markup with ``StripTags`` and scanned for host-like strings. Matches
    containing ``www`` are trimmed to start at ``www`` and collected unless
    they contain ``google`` or were already collected. The final list is
    printed one entry per line before it is returned.

    Args:
        query: Search terms. Concatenated into the URL as-is, so the caller
            must pass an already URL-encoded string.
        num: Exclusive upper bound for the ``start`` offset; any value
            accepted by ``int()``.

    Returns:
        List of unique matches, in the order they were first seen.

    Raises:
        ValueError: If ``num`` is a string that ``int()`` cannot parse.
    """
    print("[+] getting urls")
    limit = int(num)
    host_pattern = re.compile(r'\w+\.[\w\.\-/]*\.\w+')

    # build_opener() expects handler objects, not the target URL; passing the
    # URL string is rejected with TypeError on Python 2.7. One opener is
    # enough for every page, so it is created once outside the loop.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]

    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` keeps discovery order
    # NOTE(review): paging starts at offset 10, so the first result page
    # (start=0) is never requested -- confirm this is intended.
    for start in range(10, limit, 10):
        url = ('http://www.google.com/search?hl=en&q=' + query +
               '&hl=en&lr=&start=' + str(start) + '&sa=N')
        response = opener.open(url)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        print(data)  # raw page dump, kept from the original for debugging
        for host in host_pattern.findall(StripTags(data)):
            # NOTE(review): nesting was reconstructed from a flattened paste;
            # confirm that matches without 'www' are meant to be skipped.
            www_at = host.find('www')
            if www_at == -1:
                continue
            host = host[www_at:]
            if 'google' in host or host in seen:
                continue
            seen.add(host)
            urls.append(host)

    for url in urls:
        print(url)
    return urls
评论列表
文章目录