def prepare_links_for_insert(links, url, site):
    """Filter and normalise raw links so they can be inserted into MongoDB.

    Every truthy entry of ``links`` is parsed with ``urlparse`` and then:

    * dropped if it has no scheme, netloc, path or query (for example a
      bare ``#fragment``);
    * treated as external if it has a netloc that differs from ``site``,
      ignoring letter case and an optional ``www.`` prefix on either side.
      The netloc is appended to ``SpiderCommon._external_hosts`` and the
      link is skipped;
    * otherwise passed through ``SpiderCommon.clear_link``,
      ``SpiderCommon.build_path`` (together with ``url.path``) and
      ``SpiderCommon.clear_link`` again, and kept.

    For each kept link, every parent directory of its path is additionally
    emitted as a path-only ``ParseResult`` ending in ``/`` (``/a/b/c`` yields
    ``/a/b/``, ``/a/`` and ``/``). No de-duplication is done: links sharing a
    directory produce the same parent entry more than once.

    Args:
        links: Iterable of raw link strings; falsy entries are ignored.
        url: Object whose ``path`` attribute is handed to
            ``SpiderCommon.build_path`` (presumably the parsed URL of the
            page the links were found on -- confirm against the caller).
        site: Host string each link's netloc is compared against.

    Returns:
        A list with the kept links first, followed by the parent-directory
        ``ParseResult`` entries.
    """
    links_to_insert = []
    for raw_link in links:
        if not raw_link:
            continue

        link = urlparse(raw_link)

        # Nothing usable to crawl (e.g. "#" or "#anchor").
        if not (link.scheme or link.netloc or link.path or link.query):
            continue

        # Host names are case-insensitive, so compare lower-cased values;
        # otherwise "Example.com" would be reported as an external host when
        # crawling "example.com".
        host = link.netloc.lower()
        if host:
            # Lower-cased lazily so ``site`` is only touched when a link
            # actually carries a host, exactly as before.
            site_host = site.lower()
            is_external = (
                host != site_host
                and 'www.' + host != site_host
                and host != 'www.' + site_host
            )
            if is_external:
                SpiderCommon._external_hosts.append(link.netloc)
                continue

        link = SpiderCommon.clear_link(link)
        link = SpiderCommon.build_path(link, url.path)
        link = SpiderCommon.clear_link(link)

        links_to_insert.append(link)

    # Add every parent directory of each kept link as its own entry.
    separated_links = []
    for link in links_to_insert:
        segments = link.path.split("/")
        while len(segments) != 1:
            del segments[-1]
            separated_links.append(
                ParseResult(
                    scheme='',
                    netloc='',
                    path="/".join(segments) + '/',
                    params='',
                    query='',
                    fragment='',
                )
            )

    return links_to_insert + separated_links
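# Page navigation text captured with the snippet (not code): "Comment list"
# Page navigation text captured with the snippet (not code): "Table of contents"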