DL_ptt_crawer.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:PTTChatBot_DL2017 作者: thisray 项目源码 文件源码
def parse_article(self, url):
        """Download one PTT article page and parse it into a dict.

        The page is fetched through ``self.session`` and parsed with
        BeautifulSoup using the "lxml" parser.

        Args:
            url: Address of the article page to fetch.

        Returns:
            A dict that, on success, holds the keys "Author", "Board",
            "Title", "Date", "Content", "IP", "Responses" (a list of dicts
            with "Content", "Vote" and "User"), "UpVote", "DownVote" and
            "NoVote". If parsing fails part-way, the error and the URL are
            printed and the dict is returned with only the keys that were
            filled in before the failure.
        """
        # NOTE(review): verify=False turns off TLS certificate verification
        # for this request. Left unchanged to preserve behavior; confirm it
        # is still required.
        raw = self.session.get(url, verify=False)
        soup = BeautifulSoup(raw.text, "lxml")
        article = {}
        try:
            # The header fields are read by position from a single query:
            # author, board, title, date.
            meta_values = soup.select(".article-meta-value")
            article["Author"] = meta_values[0].contents[0].split(" ")[0]
            article["Board"] = meta_values[1].contents[0]
            article["Title"] = meta_values[2].contents[0]
            article["Date"] = meta_values[3].contents[0]

            # Only the first non-newline bare text node directly under
            # #main-content is kept as the body (the loop stops after it).
            # The exact-type check is deliberate: isinstance() would also
            # accept NavigableString subclasses such as comments.
            content = ""
            for node in soup.select("#main-content")[0]:
                if type(node) is NavigableString and node != '\n':
                    content += node
                    break
            article["Content"] = content

            # Footer prefix that introduces the poster's origin line.
            # Written as unicode escapes (U+203B, space, U+767C U+4FE1
            # U+7AD9, colon) because the literal had been reduced to '?'
            # placeholders by a lossy re-encoding, which made it an invalid
            # regex. NOTE(review): restored from the known PTT page format;
            # confirm against the upstream repository.
            ip_marker = u'\u203b \u767c\u4fe1\u7ad9:'

            # deal different ip type
            ip_temp = 'NA'
            try:
                ip_line = soup.find(string=re.compile(ip_marker))
                ip_temp = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip_line).group()
            except Exception:
                # Expected failures: no text node contains the marker
                # (ip_line is None -> TypeError) or the line holds no
                # dotted-number run (search returns None -> AttributeError).
                # Fall back to scanning the ".f2" footer spans.
                try:
                    for f2_tag in soup.select('.f2'):
                        if ip_marker in f2_tag.contents[0]:
                            # The address is the last whitespace-separated
                            # token of the text following the span.
                            ip_temp = f2_tag.next_sibling.split()[-1]
                            break
                except Exception:
                    # Best effort: an unexpected footer layout yields 'NA'.
                    ip_temp = 'NA'
            article["IP"] = ip_temp

            # First character of the push tag decides the vote kind.
            # U+63A8 is the upvote mark and U+5653 the downvote mark; both
            # had been collapsed to the same '?' literal, which made the
            # downvote branch unreachable. Anything else counts as neutral.
            upvote_mark = u"\u63a8"
            downvote_mark = u"\u5653"

            upvote = 0
            downvote = 0
            novote = 0
            response_list = []

            for response_struct in soup.select(".push"):
                # Skip ".push" elements that are warning boxes, not comments.
                if "warning-box" not in response_struct['class']:
                    response_dic = {}
                    # [1:] drops the first character of the comment text.
                    response_dic["Content"] = response_struct.select(".push-content")[0].contents[0][1:]
                    response_dic["Vote"] = response_struct.select(".push-tag")[0].contents[0][0]
                    response_dic["User"] = response_struct.select(".push-userid")[0].contents[0]
                    response_list.append(response_dic)
                    if response_dic["Vote"] == upvote_mark:
                        upvote += 1
                    elif response_dic["Vote"] == downvote_mark:
                        downvote += 1
                    else:
                        novote += 1

            article["Responses"] = response_list
            article["UpVote"] = upvote
            article["DownVote"] = downvote
            article["NoVote"] = novote
        except Exception as e:
            # Parsing is best effort: report the problem and hand back
            # whatever was collected so far instead of raising.
            print(e)
            print(u"error in: %s " % url)

        return article
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号