def getTextFromSoup(htmlsoup):
    """Convert a parsed HTML fragment to plain text.

    Each ``<img>`` tag is replaced by the URL of that image (its ``src``
    attribute prefixed with ``http://acm.hdu.edu.cn``), each ``<br>`` tag
    becomes a newline, and every remaining tag is removed.

    Args:
        htmlsoup: Object whose ``str()`` is the HTML markup and whose
            ``find_all('img')`` returns the image elements in document
            order (presumably a BeautifulSoup tag -- confirm against the
            caller).

    Returns:
        The markup as a string with all tags stripped.
    """
    text = str(htmlsoup)

    # Collect one replacement per image, in document order.  An image with
    # no ``src`` contributes an empty string so later images stay aligned
    # with their tags.
    img_urls = []
    for img in htmlsoup.find_all('img'):
        src = img.get('src')
        img_urls.append(('http://acm.hdu.edu.cn%s' % src) if src else '')

    # NOTE(review): the original replacement template was an empty string
    # (``r'' % url``), which raised TypeError; substituting the bare URL is
    # the assumed intent -- confirm the desired output format.
    # A callable replacement pairs the n-th tag with the n-th URL and keeps
    # backslashes in a URL from being read as regex escapes.
    remaining = iter(img_urls)
    text = re.sub(r'<img\b[^>]*>', lambda match: next(remaining, ''), text)

    # Accept <br>, <br/> and <br /> so self-closing line breaks become
    # newlines too.
    text = re.sub(r'<br\s*/?>', '\n', text)

    # ``[^>]`` also matches newlines, so tags spanning several lines are
    # stripped as well.
    return re.sub(r'<[^>]+>', '', text)
# (web page residue) Comment list
# (web page residue) Article table of contents