def process_item(self, item, spider):
    """Validate a URL item, resolve its site and store it (first phase).

    The raw URL is truncated to ``MAX_URL_LEN`` characters when it is
    longer, parsed, matched against ``self.site_tuples`` and finally handed
    to ``get_or_create_murl`` together with the spider's session and
    platform id.

    Args:
        item: Mapping-like item. ``item['raw']`` is read (and truncated in
            place when too long); ``item['site_id']`` is set on success.
        spider: The running spider; its ``session`` and ``platform_id``
            attributes are used for the database call.

    Returns:
        The same ``item`` object, with ``site_id`` filled in.

    Raises:
        DropItem: If the URL cannot be parsed or has no hostname, if
            ``belongs_to_site`` returns ``None`` for the hostname, or if
            the database call raises ``SQLAlchemyError`` (the session is
            rolled back before the item is dropped).
    """
    # Validate URL length: over-long URLs are truncated rather than dropped.
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        # Logged after truncation, so the message shows the shortened URL.
        logger.error('Raw URL too long, trucate it! %r', item['raw'])

    # Parse the raw URL; a missing hostname makes site matching impossible.
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalide URL')

    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        # Exceptions do not interpolate logging-style arguments, so the
        # message has to be formatted before DropItem is constructed.
        raise DropItem('Offsite domain: %s' % (item,))
    item['site_id'] = site_id

    # Insert the URL into the table.
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        # Roll back so the session stays usable for subsequent items.
        spider.session.rollback()
        raise DropItem('Fail to insert database of url: %s' % (item,))
    return item
评论列表
文章目录