import re

from bs4 import BeautifulSoup, SoupStrainer


def extract_links(response_content, unique=False, blacklist_domains=None,
                  whitelist_domains=None, regex=None, zen_path=None,
                  blacklist_extensions=None, whitelist_extensions=None):
"""Extract links from a response content.
Args:
response_content (str): The HTML page received in a Response Object.
unique (bool): A parameter defining if the list can contain duplicates.
Defaults to False.
blacklist_domains (list): List of domains to exclude from the result.
whitelist_domains (list): List of domains to include from the result.
regex (list): A regular expression filter on the link.
Defaults to None.
zen_path (list): A selector to restrict the XPath to parse with bs4.
Returns:
links (list): A list of extracted and filtered links.
"""
    if set(whitelist_domains or []) & set(blacklist_domains or []) \
            or set(whitelist_extensions or []) & set(blacklist_extensions or []):
        raise LinkExtractorException(
            'whitelist and blacklist parameters cannot share common value(s).'
        )
    # Parse only the <a> tags and collect their href attributes.
    soup = BeautifulSoup(
        response_content, "html.parser", parse_only=SoupStrainer('a')
    )
    links = [a.get('href') for a in soup.find_all('a') if a.get('href')]
    if unique:
        # Deduplicate; note that set() does not preserve the original order.
        links = list(set(links))
    if regex:
        links = filter_links(links, regex)
    # re.escape prevents '.' and other regex metacharacters in domains and
    # extensions from being interpreted as wildcards.
    if whitelist_domains:
        for domn in whitelist_domains:
            links = filter_links(links, re.escape(domn), include=True)
    if blacklist_domains:
        for domn in blacklist_domains:
            links = filter_links(links, re.escape(domn), include=False)
    if whitelist_extensions:
        for ext in whitelist_extensions:
            links = filter_links(links, re.escape(ext), include=True)
    if blacklist_extensions:
        for ext in blacklist_extensions:
            links = filter_links(links, re.escape(ext), include=False)
return links
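

# `filter_links` and `LinkExtractorException` are referenced above but not
# defined in this excerpt. Below is a minimal sketch of what they could look
# like, assuming filter_links keeps links matching a regex fragment when
# include=True and drops them when include=False; the names and signatures
# are inferred from the calls above, not confirmed by the source.
class LinkExtractorException(Exception):
    pass


def filter_links(links, pattern, include=True):
    compiled = re.compile(pattern)
    if include:
        return [link for link in links if compiled.search(link)]
    return [link for link in links if not compiled.search(link)]


# Hypothetical usage: keep unique links on example.com, excluding PDFs.
if __name__ == '__main__':
    html = (
        '<a href="https://example.com/page">page</a>'
        '<a href="https://example.com/file.pdf">file</a>'
        '<a href="https://other.org/page">other</a>'
    )
    print(extract_links(
        html,
        unique=True,
        whitelist_domains=['example.com'],
        blacklist_extensions=['.pdf'],
    ))  # -> ['https://example.com/page']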