import urllib2
from BeautifulSoup import BeautifulSoup, SoupStrainer


# `domain`, `is_html`, and `verify` are expected to be defined elsewhere in the script.
def crawler(urls, max_urls):
    crawled = set()        # pages successfully fetched and parsed
    queued = set(urls)     # everything ever added to the frontier, for de-duplication
    pairs = []             # returned unmodified; reserved for (page, link) pairs
    while urls and len(crawled) < max_urls:
        page = urls.pop(0)
        # Skip non-HTML resources and pages that were already visited.
        if is_html(page) and page not in crawled:
            try:
                print(page)
                # Parse only the <a> tags to keep the work minimal.
                links = BeautifulSoup(urllib2.urlopen(page, timeout=5).read(),
                                      parseOnlyThese=SoupStrainer('a'))
                for link in links:
                    url = domain + link['href']
                    full_url = 'http://' + url
                    # Queue only links that pass verification and have not been seen before.
                    if verify(url) and full_url not in queued:
                        urls.append(full_url)
                        queued.add(full_url)
                crawled.add(page)
            except Exception:
                # Ignore pages that fail to download or parse and move on.
                continue
    return crawled, pairs
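
Below is a minimal usage sketch. The names `domain`, `is_html`, and `verify` are not defined in the snippet above, so the stubs here are placeholders for illustration only; substitute the real helpers from the surrounding code.

# Hypothetical stand-ins for the helpers the crawler expects:
domain = 'example.com'              # placeholder for the assumed site to crawl

def is_html(url):
    # Placeholder: a real check might inspect the URL suffix or Content-Type header.
    return True

def verify(url):
    # Placeholder: only follow links that stay on the assumed domain.
    return url.startswith(domain)

if __name__ == '__main__':
    crawled, pairs = crawler(['http://example.com/'], max_urls=50)
    print(crawled)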