def parse(self, response):
    """
    Default parse method (rules are currently not used).

    The response is first rebuilt with the URL returned by
    ``HtmlParser.remove_url_parameter`` and its index level is obtained from
    ``self.determine_level``. What happens next depends on that level:

    * Levels 1-4: the response is handed to ``self.save_to_file_system`` and
      a ``Request`` (with this method as its callback) is yielded for every
      link returned by ``self.get_follow_links``. Nothing is yielded when
      that helper returns ``None``.
    * Level 5: a person profile is extracted from the page and yielded with
      its ``_id`` and ``url`` fields filled in, provided a non-empty id can
      be derived from the response URL.
    * Any other level: nothing is yielded.

    :param response: response object to process; it must provide ``url``
        and ``replace``.
    :return: generator yielding ``Request`` objects or a person profile.
    """
    response = response.replace(
        url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))

    if index_level in (1, 2, 3, 4):
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        person_profile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        # Check the raw id *before* decoding: unquote_plus() fails on None,
        # so a URL without an id must be skipped here rather than crash.
        # NOTE(review): assumes get_linkedin_id may return None - confirm.
        if linkedin_id:
            linkedin_id = UnicodeDammit(
                urllib.unquote_plus(linkedin_id)).markup
        # Re-check after decoding so an empty decoded id is skipped as well.
        if linkedin_id:
            person_profile['_id'] = linkedin_id
            person_profile['url'] = UnicodeDammit(response.url).markup
            yield person_profile
# Comment list
# Article table of contents