def dealLocalFile():
    """Clean every ``*html`` file found under the current working directory.

    Walks ``os.getcwd()`` recursively.  For each file whose name ends with
    ``'html'`` the document is parsed with Beautiful Soup, reduced to its
    ``<body>`` element, and then:

    * HTML comments are removed,
    * ``<span>`` and ``<font>`` tags are unwrapped (their children are kept),
    * the ``style`` attribute is deleted from every ``<p>``,
    * the first ``<p>`` with non-empty text is removed from the tree and its
      text is used as the title.

    The cleaned body is written to ``<title>.html``; the path is relative,
    so the file lands in the current working directory.  Files that have no
    ``<body>`` element are skipped.

    Input is read as bytes so that Beautiful Soup performs its own encoding
    detection, and the output is written as UTF-8 bytes.

    NOTE(review): the title text is used verbatim as the file name.  A title
    containing path separators or other characters the file system rejects
    will make ``open`` fail, and a document without any non-empty ``<p>``
    is written to ``.html``.  Confirm whether that is acceptable for the
    expected inputs.
    """
    rootDir = os.getcwd()
    for root, _dirs, files in os.walk(rootDir):
        for f in files:
            if not f.endswith('html'):
                continue

            path = os.path.join(root, f)
            # Binary mode lets the parser sniff the document encoding instead
            # of relying on the locale codec; `with` guarantees the handle is
            # closed even if parsing raises.
            with open(path, 'rb') as src:
                soup = BeautifulSoup(src, 'html.parser')

            soup = soup.body
            if soup is None:
                # No <body> element: there is nothing to clean or write.
                continue

            # Remove HTML comments.
            is_comment = lambda text: isinstance(text, Comment)
            for comment in soup.find_all(text=is_comment):
                comment.extract()

            # Unwrap <span> tags, keeping their contents.
            for span in soup.select("span"):
                span.unwrap()

            # Unwrap <font> tags, keeping their contents.
            for font in soup.select("font"):
                font.unwrap()

            # Drop inline styling from paragraphs (deleting a missing
            # attribute is a no-op in Beautiful Soup).
            for pp in soup.select("p"):
                del pp['style']

            # The first paragraph with any text becomes the title and is
            # removed from the body so it is not duplicated in the output.
            title = ''
            for p in soup.select('p'):
                text = p.get_text()
                if text:
                    title = text
                    p.extract()
                    break

            # Written un-prettified, as before (the previous prettify() call
            # discarded its result).  Encoding explicitly keeps the output
            # independent of the platform's default codec.
            with open(title + ".html", "wb") as fo:
                fo.write(soup.encode("utf-8"))
# (web page residue) Comment list
# (web page residue) Article table of contents