def get_child_urls(main_page, max_child=20):
    """Collect absolute http(s) links from the anchors of an HTML page.

    Args:
        main_page (str): HTML markup to scan.
        max_child (int): Maximum number of URLs to return.

    Returns:
        list[str]: ``href`` values that start with ``"http"``, in document
        order, at most ``max_child`` of them. Empty when ``max_child`` is
        zero or negative.
    """
    from bs4 import BeautifulSoup, SoupStrainer

    # A non-positive limit can never yield a URL; returning early also avoids
    # the surprising "drop items from the end" effect of a negative slice.
    if max_child <= 0:
        return []

    # Only <a> elements are parsed, which keeps the tree small.
    soup = BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer("a"))

    children = []
    # find_all() yields Tag objects only. Iterating the soup directly may also
    # yield non-tag nodes (e.g. a doctype, depending on the bs4 version) that
    # have no has_attr() method. recursive=False restricts the search to the
    # top-level nodes, i.e. the same anchors a direct iteration would visit.
    for link in soup.find_all("a", href=True, recursive=False):
        href = link["href"]
        if href.startswith("http"):
            children.append(href)
            # Stop as soon as the limit is reached instead of collecting
            # everything and truncating afterwards.
            if len(children) == max_child:
                break
    return children
评论列表
文章目录