parser.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:gushiwen-crawler 作者: shenyunhang 项目源码 文件源码
def run(self):
        while True:
            data = self._queue_data.get()
            self._index = data[0]
            html_contents = data[1]

            html_contents = re.sub('<br />', '\n', html_contents)
            only_main3 = SoupStrainer(class_="main3")
            soup_only_main3 = BeautifulSoup(
                html_contents, 'html.parser', parse_only=only_main3)

            # ?????
            if self._num_empty > 1000:
                break
            # ?????
            if soup_only_main3.get_text(strip=True) == self._delete:
                self._num_empty += 1
                continue
            else:
                self._num_empty = 0

            title_poetry = soup_only_main3.find(class_='son1').h1.string

            soup_only_main3.find(class_='son2').p.span.decompose()
            dynasty_poetry = soup_only_main3.find(class_='son2').p.string
            soup_only_main3.find(class_='son2').p.decompose()

            soup_only_main3.find(class_='son2').p.span.decompose()
            author_poetry = soup_only_main3.find(class_='son2').p.string
            soup_only_main3.find(class_='son2').p.decompose()

            soup_only_main3.find(class_='son2').p.decompose()
            soup_only_main3.find(class_='yizhu').decompose()
            content_poetry = soup_only_main3.find(
                class_='cont',id='cont').get_text()
            content_poetry = re.sub('[\n]+', '\n', content_poetry)
            content_poetry = content_poetry.strip('\n')

            path_html, path_txt = get_output_path(dynasty_poetry, self._index)
            file_html = open(path_html, 'w')
            file_html.writelines(data[1].encode('utf-8'))
            file_html.close()
            file_txt = open(path_txt, 'w')
            file_txt.writelines(title_poetry.encode('utf-8') + '\n')
            file_txt.writelines(dynasty_poetry.encode('utf-8') + '\n')
            file_txt.writelines(author_poetry.encode('utf-8') + '\n')
            file_txt.writelines(content_poetry.encode('utf-8') + '\n')
            file_txt.close()

            print '-----------------------------------------------------------'
            print 'Parser: ', self._index
            print '???', title_poetry
            print '???', dynasty_poetry
            print '???', author_poetry
            print '???\n', content_poetry

        print 'Parser finish'
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号