def run(self):
    # Create an SSL context that skips certificate verification
    # so that we can also crawl https sites
    myssl = ssl.create_default_context()
    myssl.check_hostname = False
    myssl.verify_mode = ssl.CERT_NONE
    # process all the links in our queue
    while True:
        self.urlLock.acquire()
        print("Queue Size: {}".format(self.linksToCrawl.qsize()))
        link = self.linksToCrawl.get()
        self.urlLock.release()
        # have we reached the end of our queue?
        if link is None:
            # mark the sentinel as processed so queue.join() can return
            self.linksToCrawl.task_done()
            break
        # Have we visited this link already? If so, skip it rather
        # than breaking, which would kill this worker thread
        if link in self.haveVisited:
            print("Already Visited: {}".format(link))
            self.linksToCrawl.task_done()
            continue
        try:
            link = urljoin(self.baseUrl, link)
            req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            response = urlopen(req, context=myssl)
            print("Url {} Crawled with Status: {}".format(response.geturl(), response.getcode()))
            soup = BeautifulSoup(response.read(), "html.parser")
            for atag in soup.find_all('a'):
                href = atag.get('href')
                # resolve the href so we can check which site it points at;
                # atag.get('href') can also return None, which we must skip
                if href is not None and href not in self.haveVisited \
                        and urlparse(urljoin(self.baseUrl, href)).netloc == 'tutorialedge.net':
                    self.linksToCrawl.put(href)
                else:
                    print("{} already visited or not part of website".format(href))
            print("Adding {} to crawled list".format(link))
            self.haveVisited.append(link)
        except URLError as e:
            print("URL {} threw this error when trying to parse: {}".format(link, e.reason))
            self.errorLinks.append(link)
        finally:
            self.linksToCrawl.task_done()
webCrawler.py source code
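This run() method assumes a surrounding threading-based crawler class that owns the shared queue, lock, and bookkeeping lists. The original class definition and imports are not shown on this page, so the following is a minimal sketch of what they might look like: the Crawler class name, its constructor signature, and the driver code are assumptions made for illustration, while the attribute names (baseUrl, linksToCrawl, haveVisited, errorLinks, urlLock) mirror those used in run().

import ssl
import threading
from queue import Queue
from urllib.error import URLError
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


class Crawler(threading.Thread):
    # Hypothetical wrapper class; the real one in webCrawler.py is not shown
    def __init__(self, base_url, links_to_crawl, have_visited, error_links, url_lock):
        super().__init__()
        self.baseUrl = base_url
        self.linksToCrawl = links_to_crawl  # Queue of links waiting to be fetched
        self.haveVisited = have_visited     # shared list of already-crawled links
        self.errorLinks = error_links       # shared list of links that failed
        self.urlLock = url_lock             # serialises access to the queue

    # the run() method shown above goes here


if __name__ == "__main__":
    base_url = "https://tutorialedge.net"
    queue = Queue()
    queue.put(base_url)
    lock = threading.Lock()
    visited, errors = [], []

    # start a small pool of worker threads sharing the same queue
    workers = [Crawler(base_url, queue, visited, errors, lock) for _ in range(4)]
    for w in workers:
        w.start()

    queue.join()          # block until task_done() has been called for every link
    for _ in workers:
        queue.put(None)   # one None sentinel per worker ends its while-loop
    for w in workers:
        w.join()

Because every worker calls task_done() exactly once per item it takes from the queue, queue.join() in the driver only returns once the crawl has drained, after which one None sentinel per worker cleanly shuts the pool down.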