DriverService.py 文件源码

python
阅读 19 收藏 0 点赞 0 评论 0

项目:ugc.aggregator 作者: Dreamcatcher-GIS 项目源码 文件源码
def crawlWeiboContent(self, userID, weiboID, pageHandler = None, threshold=0.2):
        """Open one Weibo post and pass each page of it to ``pageHandler``.

        The post is loaded from ``http://weibo.com/<userID>/<weiboID>`` in
        ``self.driver``. The method then keeps pressing END on the page body
        and, every ``threshold`` seconds, checks the page source for a
        pagination marker. When the marker is present the current page
        source is handed to ``pageHandler`` and the marker link is clicked
        to advance to the next page.

        Args:
            userID: String inserted into the URL as the first path segment.
            weiboID: String inserted into the URL as the second path segment.
            pageHandler: Callable invoked as
                ``pageHandler(page_source, page_number, userID, weiboID)``.
                Defaults to ``self.pageHandler_comment``.
            threshold: Minimum number of seconds between two inspections of
                the page source.

        Returns:
            None. Results are delivered through ``pageHandler``.
        """
        if pageHandler is None:
            pageHandler = self.pageHandler_comment
        # Load the post in the browser and give it a moment to render.
        url = "http://weibo.com/"+userID +"/"+weiboID
        print(url)
        self.driver.get(url)
        time.sleep(1)
        # Keep re-reading the page source until the comment count can be
        # extracted from it. The page count is derived as count // 20 + 1
        # (presumably 20 comments per page -- TODO confirm).
        # Only Exception is caught so Ctrl-C / SystemExit can still end this
        # otherwise unbounded retry loop.
        while True:
            try:
                totalPageNum = self.__getCommentNum(self.driver.page_source)//20+1
                break
            except Exception:
                continue

        thresholdTime = time.time()+threshold
        currentPageNum = 1
        # Counts consecutive inspections in which the marker was missing.
        loopNum = 0
        # True once the current page has been passed to pageHandler, so a
        # failed click followed by a retry does not handle the page twice.
        ifHandle = False
        # NOTE(review): if the marker never shows up (e.g. totalPageNum is 1
        # and there is no pagination link), the last-page branch below is
        # never reached, because it is only executed after a successful
        # click; the loop then does not terminate on its own -- confirm.
        while currentPageNum<=totalPageNum:
            # Scroll to the bottom of the page.
            self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            if time.time()>thresholdTime:
                # NOTE(review): the literal "???" here and in the click below
                # looks like a mis-encoded non-ASCII string in this copy;
                # presumably the text of the "next page" link -- confirm
                # against the original file before relying on it.
                if u"???" in self.driver.page_source:
                    # Handle the page only once, even if the click is retried.
                    if not ifHandle:
                        pageHandler(self.driver.page_source,currentPageNum,userID,weiboID)
                        ifHandle = True

                    try:
                        self.driver.find_element_by_partial_link_text("???").click()
                        time.sleep(1)
                        loopNum = 0
                        ifHandle = False
                        currentPageNum = currentPageNum+1
                    except Exception:
                        # Click failed: wait another threshold period and
                        # try again on the same page.
                        thresholdTime = time.time()+threshold
                        continue
                else:
                    # Marker not present yet: re-arm the timer and count the miss.
                    thresholdTime = time.time()+threshold
                    loopNum = loopNum + 1
                    continue
                # Reached only right after a successful click. If that click
                # landed on the last page, handle it now and stop.
                if currentPageNum==totalPageNum:
                    if not ifHandle:
                        pageHandler(self.driver.page_source,currentPageNum,userID,weiboID)
                        break

            # More than 20 consecutive misses: reload the page and restart
            # the page counter from 1.
            if loopNum>20:
                loopNum = 0
                self.driver.refresh()
                currentPageNum = 1

    # ?????
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号