import requests
import bs4
from slugify import slugify  # assumes the python-slugify package
from urllib.parse import urlparse


def crawl(url):
    # Extract the host, e.g. "https://www.example.com/a" -> "example.com"
    netloc = urlparse(url).netloc
    domain = netloc[4:] if netloc.startswith("www.") else netloc

    html = requests.get(url).content
    soup = bs4.BeautifulSoup(html, "lxml")

    # Deduplicate links by href so each page is fetched only once
    links = {link['href']: link for link in soup.find_all('a', href=True)}
    for sub_url, link in links.items():
        # Fall back to the href when the anchor has no plain-text content
        page_name = link.get_text(strip=True) or sub_url
        # Only follow absolute links that point back to the same domain
        if domain in sub_url:
            try:
                page = requests.get(sub_url).content
                filename = slugify(page_name).lower() + '.html'
                with open(filename, 'wb') as f:
                    f.write(page)
            except (requests.RequestException, OSError):
                # Skip pages that fail to download or save
                pass
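
For reference, a minimal invocation might look like the following; the URL here is only a placeholder, and the saved HTML files land in the current working directory:

crawl("https://www.example.com/")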