def get_wx_article_lists(article_html,id_index):
    """Parse one WeChat-article result page into a list of article dicts.

    Parameters:
        article_html: raw HTML of a result page containing a
            ``<ul class="article-ul">`` with one ``<li>`` per article.
        id_index: identifier of the account/page being scraped; stored on
            every article dict and passed to ``print_pass_a_article`` when
            a field cannot be extracted.

    Returns:
        list of dicts with keys 'time', 'readnum', 'likenum', 'title',
        'content', 'url', 'imglink', 'id'. Returns early (with the
        articles collected so far) once an article's publish time is at or
        before the module-level ``last_time`` — presumably the page lists
        articles newest-first, so everything after it is older too
        (assumption; confirm against the caller).

    NOTE(review): relies on module-level names ``html`` (lxml.html),
    ``time``, ``re``, ``last_time``, ``print_pass_a_article`` and
    ``get_img_link``, none of which are visible in this chunk. The
    ``.encode('utf-8')`` followed by a str ``split`` implies Python 2.
    """
    # global article_flag
    # (original comment was mojibake — likely "parse the article list")
    wx_article_list = []
    html_tree = html.document_fromstring(article_html)
    # Each <li> under the article list is one article entry.
    html_nodes = html_tree.xpath('//ul[@class="article-ul"]//li')
    for html_node in html_nodes:
        # Accumulate the fields of a single article into one dict.
        wx_article_object = {}
        html_node_children = html_node.getchildren()
        # children[1] is the text half ("wx-ft" div); children[0] is the
        # image half. (Original mojibake comment noted that find() is used
        # to locate the div.)
        div_wx_ft_children = html_node_children[1].find('div[@class="wx-ft"]').getchildren()
        # Publish-time text; the '?' split separator is a mojibake of the
        # original (Chinese) delimiter that precedes the timestamp — do not
        # "fix" it without the original source encoding.
        pub_time = div_wx_ft_children[1].text_content().strip()
        pub_time = pub_time.encode('utf-8').split('?')
        if len(pub_time) < 2:
            # Delimiter missing: report the bad field. This article then
            # gets no 'time' key (the else branch is skipped).
            print_pass_a_article(id_index,'time')
        else:
            # Timestamp like "2016-01-01 12:00:00" -> epoch seconds.
            pub_time = int(time.mktime(time.strptime(pub_time[1],'%Y-%m-%d %H:%M:%S')))
            # Stop crawling once we reach an article at or before the last
            # crawl time — assumes newest-first ordering; TODO confirm.
            if pub_time <= last_time:
                # article_flag = False
                # print 'out of the time and return'
                return wx_article_list
            wx_article_object['time'] = str(pub_time)
        # Read count and like count live in one text node, whitespace-
        # separated: first token is reads, last token is likes.
        readnum_and_likenum = re.split(r'\s',div_wx_ft_children[2].text_content().strip())
        length = len(readnum_and_likenum)
        if length < 2:
            print_pass_a_article(id_index,'readnum_and_likenum')
        # str(int(...)) round-trips to validate the scraped value is numeric
        # (raises ValueError on garbage — no handling here; NOTE(review)).
        readnum = str(readnum_and_likenum[0]).strip()
        wx_article_object['readnum'] = str(int(readnum))
        likenum = str(readnum_and_likenum[length-1]).strip()
        wx_article_object['likenum'] = str(int(likenum))
        # Title is the <a> inside the <h4>; content is the sibling node
        # immediately after the <h4>.
        div_wx_ft_h4 = html_node_children[1].find('h4')
        title = div_wx_ft_h4.find('a').text_content()
        if not title:
            print_pass_a_article(id_index,'title')
        wx_article_object['title'] = title
        content = div_wx_ft_h4.getnext().text_content()
        if not content:
            print_pass_a_article(id_index,'content')
        wx_article_object['content'] = content
        # Article URL and the cover image's data-hash (original mojibake
        # comment: "url and img-data-hash").
        div_wx_img_a = html_node_children[0].find('a')
        url = div_wx_img_a.get('href')
        if not url:
            print_pass_a_article(id_index,'url')
        wx_article_object['url'] = url
        img_hash = div_wx_img_a.find('img').get('data-hash')
        if not img_hash:
            print_pass_a_article(id_index,'img-hash')
        # The image link is derived from the data-hash by an external helper.
        wx_article_object['imglink'] = get_img_link(img_hash)
        wx_article_object['id'] = str(int(id_index))
        wx_article_list.append(wx_article_object)
    return wx_article_list
# Trailing non-code residue from the source page, kept as a comment so the
# file stays valid Python: "评论列表" = comment list, "文章目录" = article table of contents.