def walkListItems(sess, url):
    """Walk one board listing page and process every new matching entry.

    Fetches ``url`` through ``sess``, parses the response as HTML, and looks
    at the second ``<td>`` of each row under the element whose class is
    ``kboard-list``.  For each such cell the first link's target is resolved
    against ``url``, its ``pageid=<n>`` fragment is stripped, and the result
    is used as the de-duplication key in the module-level ``visited``
    mapping.  Entries that are new and whose whitespace-collapsed text passes
    the keyword filter are printed, recorded in ``visited`` and handed to
    ``walkPageItem``.

    Args:
        sess: Object exposing ``get(url=...)`` whose result has a ``.text``
            attribute (presumably a ``requests.Session`` -- confirm).
        url: Address of the listing page; also the base for relative links.

    Returns:
        None.  Ordinary errors are reported via traceback + ``print`` and
        swallowed so one failing page does not stop the caller.
        ``KeyboardInterrupt`` / ``SystemExit`` are allowed to propagate.
    """

    def replacewhite(text):
        # Collapse every run of spaces / CR / LF / tabs into a single space.
        return re.sub(r'(\ |\r|\n|\t)+', ' ', text)

    try:
        resp = sess.get(url=url)
        # `html` is presumably lxml.html (fromstring/xpath/text_content) -- confirm.
        root = html.fromstring(resp.text)
        cells = root.xpath(".//*[@class='kboard-list']//tr/td[2]")
        for td in cells:
            anchors = td.xpath(".//a")
            if not anchors:
                # A cell without a link must not abort the remaining rows.
                continue
            href = anchors[0].attrib.get('href')
            if href is None:
                continue
            href = urljoin(url, href)
            # Drop the pagination parameter so the same entry reached from
            # different list pages maps to a single key in `visited`.
            href = re.sub(r'pageid=\d+', '', href)
            if href in visited:
                continue
            text = replacewhite(td.text_content())
            # NOTE(review): '???' looks like a non-ASCII keyword that was lost
            # to an encoding problem -- confirm the intended filter string.
            if '???' not in text:
                continue
            print(text)
            visited[href] = text
            walkPageItem(sess, href, text)
    except Exception as ex:
        # Best-effort crawl: log and carry on.  Deliberately not BaseException,
        # so Ctrl-C and interpreter shutdown are not silently swallowed.
        traceback.print_exc()
        print(ex)
评论列表
文章目录