def parse_article(self, url):
    """Download one article page and parse it into a plain dict.

    The page is fetched through ``self.session`` and parsed with
    BeautifulSoup using the ``lxml`` parser.  The CSS selectors and text
    markers used here look like the PTT web layout (NOTE(review): inferred
    from the markers, confirm against the caller).

    Args:
        url: Address of the article page.

    Returns:
        A dict that, when parsing succeeds completely, holds the keys
        ``Author``, ``Board``, ``Title``, ``Date``, ``Content``, ``IP``,
        ``Responses`` (list of dicts with ``Content``, ``Vote``, ``User``),
        ``UpVote``, ``DownVote`` and ``NoVote``.  Parsing is best-effort:
        if any step raises, the error is printed and the dict is returned
        with only the keys filled in before the failure.

    Raises:
        Whatever ``self.session.get`` raises; the download itself is not
        guarded by the parsing ``try`` block.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept for backward compatibility; confirm it is still required.
    raw = self.session.get(url, verify=False)
    soup = BeautifulSoup(raw.text, "lxml")

    # The non-ASCII markers are spelled as escapes so that a lossy
    # re-encoding of this file cannot silently turn them into "?" again
    # (which made the IP lookup always fail and made the down-vote branch
    # unreachable).
    ip_marker = u"\u203b \u767c\u4fe1\u7ad9:"  # "※ 發信站:" (posting-site footer)
    upvote_tag = u"\u63a8"                      # "推"
    downvote_tag = u"\u5653"                    # "噓"
    # Require at least one digit per octet; the old "[0-9]*" also accepted
    # a bare "..." as an address.
    ip_pattern = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+")

    article = {}
    try:
        # Header fields appear in a fixed order: author, board, title, date.
        meta = soup.select(".article-meta-value")
        # The author field may carry extra text after a space; keep the
        # first token only.
        article["Author"] = meta[0].contents[0].split(" ")[0]
        article["Board"] = meta[1].contents[0]
        article["Title"] = meta[2].contents[0]
        article["Date"] = meta[3].contents[0]

        # The body is the first direct text child of #main-content that is
        # not a lone newline.  The exact type check is intentional:
        # isinstance() would also accept Comment and other NavigableString
        # subclasses.
        content = ""
        for node in soup.select("#main-content")[0]:
            if type(node) is NavigableString and node != '\n':
                content += node
                break
        article["Content"] = content

        # Two footer layouts are handled.  First try: the address sits in
        # the same text node as the marker.
        ip_temp = 'NA'
        marker_node = soup.find(string=re.compile(re.escape(ip_marker)))
        ip_match = None
        if marker_node is not None:
            ip_match = ip_pattern.search(marker_node)
        if ip_match is not None:
            ip_temp = ip_match.group()
        else:
            # Second try: the marker is inside an ".f2" element and the
            # address is the last token of the text that follows it.
            try:
                for f2_tag in soup.select('.f2'):
                    if ip_marker in f2_tag.contents[0]:
                        ip_temp = f2_tag.next_sibling.split()[-1]
                        break
            except (AttributeError, IndexError, TypeError):
                # Unexpected footer structure: report the IP as unknown
                # rather than aborting the whole article.
                ip_temp = 'NA'
        article["IP"] = ip_temp

        upvote = 0
        downvote = 0
        novote = 0
        response_list = []
        for push in soup.select(".push"):
            # Elements that also carry "warning-box" are skipped; only the
            # remaining ".push" elements are parsed as responses.
            if "warning-box" in push['class']:
                continue
            response_dic = {}
            # [1:] drops the first character of the response text.
            response_dic["Content"] = push.select(".push-content")[0].contents[0][1:]
            # Only the first character of the tag identifies the vote kind.
            response_dic["Vote"] = push.select(".push-tag")[0].contents[0][0]
            response_dic["User"] = push.select(".push-userid")[0].contents[0]
            response_list.append(response_dic)

            if response_dic["Vote"] == upvote_tag:
                upvote += 1
            elif response_dic["Vote"] == downvote_tag:
                downvote += 1
            else:
                novote += 1

        article["Responses"] = response_list
        article["UpVote"] = upvote
        article["DownVote"] = downvote
        article["NoVote"] = novote
    except Exception as e:
        # Best-effort parsing: report the problem and return what we have.
        print(e)
        print(u"error in: %s " % url)

    return article
评论列表
文章目录