webCrawler.py source code


Project: Learning-Concurrency-in-Python    Author: PacktPublishing    Language: Python
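This snippet is the run() method of a worker thread in a simple concurrent web crawler: each worker pulls a URL from a shared queue, fetches it (ignoring HTTPS certificate errors), extracts the links on the page with BeautifulSoup, and feeds new ones back into the queue until a None sentinel tells it to stop.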
# Imports this method relies on (module level in the full file);
# BeautifulSoup comes from the third-party bs4 package.
import ssl
from urllib.error import URLError
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

def run(self):
    # Create an SSL context that skips certificate validation
    # so that we can also crawl https sites
    myssl = ssl.create_default_context()
    myssl.check_hostname = False
    myssl.verify_mode = ssl.CERT_NONE

    # Process all the links in our queue
    while True:
        self.urlLock.acquire()
        print("Queue Size: {}".format(self.linksToCrawl.qsize()))
        link = self.linksToCrawl.get()
        self.urlLock.release()

        # A None sentinel marks the end of our queue
        if link is None:
            break

        # Have we visited this link already? Skip it and move on.
        # (The original used `break` here, which would shut the whole
        # worker down on the first duplicate link; `continue` with a
        # matching task_done() keeps the queue accounting balanced.)
        if link in self.haveVisited:
            print("Already Visited: {}".format(link))
            self.linksToCrawl.task_done()
            continue

        try:
            link = urljoin(self.baseUrl, link)
            req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            response = urlopen(req, context=myssl)

            print("Url {} Crawled with Status: {}".format(
                response.geturl(), response.getcode()))

            soup = BeautifulSoup(response.read(), "html.parser")

            # Queue every link found on this page, as long as the
            # page itself belongs to the target site
            for atag in soup.find_all('a'):
                href = atag.get('href')
                if (href not in self.haveVisited
                        and urlparse(link).netloc == 'tutorialedge.net'):
                    self.linksToCrawl.put(href)
                else:
                    print("{} already visited or not part of website".format(href))

            print("Adding {} to crawled list".format(link))
            self.haveVisited.append(link)

        except URLError as e:
            print("URL {} threw this error when trying to parse: {}".format(
                link, e.reason))
            self.errorLinks.append(link)
        finally:
            self.linksToCrawl.task_done()
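The method above belongs to a larger threading.Thread subclass (not shown on this page) that owns baseUrl, urlLock, the linksToCrawl queue, and the haveVisited / errorLinks lists. As a minimal sketch of the queue protocol run() depends on, the following hypothetical driver shows how such workers are typically coordinated: task_done() is called once per get() so that join() can detect when the crawl is finished, and None sentinels shut the workers down. The Worker class, seed URL, and NUM_WORKERS below are illustrative placeholders, not part of the original project.

import threading
from queue import Queue

NUM_WORKERS = 2  # placeholder worker count

class Worker(threading.Thread):
    # Simplified stand-in for the crawler class implied by run() above
    def __init__(self, links_to_crawl):
        super().__init__()
        self.linksToCrawl = links_to_crawl

    def run(self):
        while True:
            link = self.linksToCrawl.get()
            if link is None:
                break  # sentinel: shut this worker down
            print("processing {}".format(link))
            self.linksToCrawl.task_done()

links = Queue()
links.put("https://tutorialedge.net")  # seed the crawl

workers = [Worker(links) for _ in range(NUM_WORKERS)]
for w in workers:
    w.start()

links.join()  # returns once every queued link has been marked task_done()

for _ in range(NUM_WORKERS):  # one sentinel per worker
    links.put(None)
for w in workers:
    w.join()

Note that Queue.get() is already thread-safe, so the urlLock around it in the original run() is arguably redundant; it is the queue's own task_done()/join() accounting that actually tracks outstanding work.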