sinaCrawlforADSL.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:SinaSpider 作者: szcf-weiya 项目源码 文件源码
def getWeiboContent(self):
        """Fetch ``self.URL`` and return the script tag holding the weibo feed.

        The page is requested through ``self.session`` with ``self.myheader``
        as headers, parsed with BeautifulSoup/lxml, and its ``<script>`` tags
        are scanned for the marker ``"pid":"pl_weibo_direct"``.

        Returns:
            ``False`` if the request raised, the status code was not 200,
            looking up the public IP raised, the body could not be read
            completely, or the page contained no ``<script>`` tag at all.
            ``""`` if script tags exist but none carries the marker.
            Otherwise ``str()`` of the first script tag carrying the marker.
        """
        try:
            req = self.session.get(self.URL, headers=self.myheader)
            if req.status_code != 200:
                print('This session not work with code 200.')
                return False
            print('This session work.')
            print('The current Ip is ' + self.getPublicIp())
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit still propagate and the crawler can be stopped.
            print('This session not work.')
            return False

        try:
            page = req.content
        except httplib.IncompleteRead:
            print('Incompleted!')
            return False

        # NOTE: an alternative fetch path through phantomjs was sketched here
        # in the past (run ``phantomjs request.js <URL> <headers>`` and take
        # the output after '\nbegin\nStatus: success\n'); it is not in use.
        soupPage = BeautifulSoup(page, 'lxml')
        scripts = soupPage.find_all('script')
        if not scripts:
            print('you may need to input an access code')
            return False

        # Compile once; the same pattern is tested against every script tag.
        marker = re.compile(r"\"pid\":\"pl_weibo_direct\"")
        for script in scripts:
            text = str(script)
            if marker.search(text) is not None:
                return text
        # Script tags were present but none of them carried the marker.
        return ""
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号