def get_wx_article_lists(article_html,id_index):
    """Parse one WeChat-article result page into a list of article dicts.

    Parameters:
        article_html: raw HTML of a result page containing a
            ``<ul class="article-ul">`` with one ``<li>`` per article.
        id_index: identifier of the account/page being scraped; stored on
            every article dict and passed to ``print_pass_a_article`` when
            a field cannot be extracted.

    Returns:
        list of dicts with keys 'time', 'readnum', 'likenum', 'title',
        'content', 'url', 'imglink', 'id'. Returns early (with the
        articles collected so far) once an article's publish time is at or
        before the module-level ``last_time`` — presumably the page lists
        articles newest-first, so everything after it is older too
        (assumption; confirm against the caller).

    NOTE(review): relies on module-level names ``html`` (lxml.html),
    ``time``, ``re``, ``last_time``, ``print_pass_a_article`` and
    ``get_img_link``, none of which are visible in this chunk. The
    ``.encode('utf-8')`` followed by a str ``split`` implies Python 2.
    """
    # global article_flag
    # (original comment was mojibake — likely "parse the article list")
    wx_article_list = []
    html_tree = html.document_fromstring(article_html)
    # Each <li> under the article list is one article entry.
    html_nodes = html_tree.xpath('//ul[@class="article-ul"]//li')
    for html_node in html_nodes:
        # Accumulate the fields of a single article into one dict.
        wx_article_object = {}
        html_node_children = html_node.getchildren()
        # children[1] is the text half ("wx-ft" div); children[0] is the
        # image half. (Original mojibake comment noted that find() is used
        # to locate the div.)
        div_wx_ft_children = html_node_children[1].find('div[@class="wx-ft"]').getchildren()
        # Publish-time text; the '?' split separator is a mojibake of the
        # original (Chinese) delimiter that precedes the timestamp — do not
        # "fix" it without the original source encoding.
        pub_time = div_wx_ft_children[1].text_content().strip()
        pub_time = pub_time.encode('utf-8').split('?')
        if len(pub_time) < 2:
            # Delimiter missing: report the bad field. This article then
            # gets no 'time' key (the else branch is skipped).
            print_pass_a_article(id_index,'time')
        else:
            # Timestamp like "2016-01-01 12:00:00" -> epoch seconds.
            pub_time = int(time.mktime(time.strptime(pub_time[1],'%Y-%m-%d %H:%M:%S')))
            # Stop crawling once we reach an article at or before the last
            # crawl time — assumes newest-first ordering; TODO confirm.
            if pub_time <= last_time:
                # article_flag = False
                # print 'out of the time and return'
                return wx_article_list
            wx_article_object['time'] = str(pub_time)
        # Read count and like count live in one text node, whitespace-
        # separated: first token is reads, last token is likes.
        readnum_and_likenum = re.split(r'\s',div_wx_ft_children[2].text_content().strip())
        length = len(readnum_and_likenum)
        if length < 2:
            print_pass_a_article(id_index,'readnum_and_likenum')
        # str(int(...)) round-trips to validate the scraped value is numeric
        # (raises ValueError on garbage — no handling here; NOTE(review)).
        readnum = str(readnum_and_likenum[0]).strip()
        wx_article_object['readnum'] = str(int(readnum))
        likenum = str(readnum_and_likenum[length-1]).strip()
        wx_article_object['likenum'] = str(int(likenum))
        # Title is the <a> inside the <h4>; content is the sibling node
        # immediately after the <h4>.
        div_wx_ft_h4 = html_node_children[1].find('h4')
        title = div_wx_ft_h4.find('a').text_content()
        if not title:
            print_pass_a_article(id_index,'title')
        wx_article_object['title'] = title
        content = div_wx_ft_h4.getnext().text_content()
        if not content:
            print_pass_a_article(id_index,'content')
        wx_article_object['content'] = content
        # Article URL and the cover image's data-hash (original mojibake
        # comment: "url and img-data-hash").
        div_wx_img_a = html_node_children[0].find('a')
        url = div_wx_img_a.get('href')
        if not url:
            print_pass_a_article(id_index,'url')
        wx_article_object['url'] = url
        img_hash = div_wx_img_a.find('img').get('data-hash')
        if not img_hash:
            print_pass_a_article(id_index,'img-hash')
        # The image link is derived from the data-hash by an external helper.
        wx_article_object['imglink'] = get_img_link(img_hash)
        wx_article_object['id'] = str(int(id_index))
        wx_article_list.append(wx_article_object)
    return wx_article_list
# Trailing non-code residue from the source page, kept as a comment so the
# file stays valid Python: "评论列表" = comment list, "文章目录" = article table of contents.