htmlParser.py 文件源码

python
阅读 24 收藏 0 点赞 0 评论 0

项目:pycrawler 作者: zyq001 项目源码 文件源码
def dealLocalFile():
    """Clean every HTML file under the current working directory.

    Walks the current working directory recursively. For each file whose
    name ends with ``html`` the document ``<body>`` is cleaned up:

    * HTML comments are removed;
    * ``<span>`` and ``<font>`` tags are unwrapped (their children stay);
    * the ``style`` attribute is dropped from every ``<p>``.

    The text of the first non-empty ``<p>`` is then used as the title: that
    paragraph is removed from the body and the remaining body markup is
    written to ``<title>.html``, resolved relative to the current working
    directory.

    Files that have no ``<body>`` element are skipped.

    NOTE: when no paragraph has any text the title stays empty and the
    output file is named ``.html``; the title is used verbatim as a file
    name and is not sanitized.
    """
    for root, _dirs, files in os.walk(os.getcwd()):
        for name in files:
            if not name.endswith('html'):
                continue

            path = os.path.join(root, name)
            # BeautifulSoup consumes the whole stream in its constructor, so
            # the handle can be released as soon as parsing is done.
            with open(path) as source:
                body = BeautifulSoup(source, 'html.parser').body
            if body is None:
                # Nothing to clean or export without a <body>.
                continue

            # Remove HTML comments.
            for comment in body.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()

            # Unwrap <span> tags, keeping their contents.
            for span in body.select("span"):
                span.unwrap()

            # Unwrap <font> tags, keeping their contents.
            for font in body.select("font"):
                font.unwrap()

            paragraphs = body.select("p")
            for paragraph in paragraphs:
                del paragraph['style']

            # The first paragraph with any text becomes the title and is
            # taken out of the exported body.
            title = ''
            for paragraph in paragraphs:
                text = paragraph.get_text()
                if text:
                    title = text
                    paragraph.extract()
                    break

            # The context manager closes the output even if the write fails.
            with open(title + ".html", "w") as output:
                output.write(str(body))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号