def discover_domains(subdomain_id, request_result_text):
    """Extract subdomains from anchor hrefs in an HTML page and import new ones.

    Args:
        subdomain_id: primary key of the Subdomain whose crawl produced the page.
        request_result_text: raw HTML text returned by the request.

    Side effects:
        Creates Subdomain records via ``import_subdomain()`` and writes to a
        per-subdomain log file.
    """
    # retrieve subdomain object
    subdomain = Subdomain.objects.get(id=subdomain_id)
    # Create and start logger
    logger = create_logger('discover_{0}.log'.format(subdomain.id))
    logger.info('discover {0} START'.format(subdomain.id))
    # keep a set of extracted subdomains to limit db queries
    # (set gives O(1) membership tests vs. the O(n) scan of a list)
    extracted_subdomains = set()
    # bs4 keyword is parse_only; parseOnlyThese is the deprecated BS3 spelling
    # todo use lxml to speed things up
    soup = BeautifulSoup(request_result_text, 'html.parser',
                         parse_only=SoupStrainer('a'))
    # todo this only saves 'href' attributes in 'a' elements,
    # can be missing valid entries (e.g. <link href>, <form action>)
    for link in soup:
        if link.has_attr('href'):
            href = link['href']
            extract_result = extract_subdomain(href)
            if extract_result not in extracted_subdomains:
                extracted_subdomains.add(extract_result)
                new_subdomain = import_subdomain(href,
                                                 discovered_by=subdomain)
                logger.info('discover found {0}'.format(new_subdomain))
    logger.info('discover {0} DONE'.format(subdomain_id))
    # release memory held by the parse tree before the task returns
    gc.collect()
# NOTE(review): stray page-scrape residue pasted after the function
# (original text: "评论列表" / "文章目录" — "comment list" / "article
# table of contents"); commented out because as bare expressions they
# would raise NameError at import time.