def process_item(self, item, spider):
    """Drop ``Alert`` items whose URI matches one of the spider's whitelists.

    Items that are not ``Alert`` instances are returned untouched.

    Args:
        item: The scraped item. Only ``Alert`` instances (including
            subclasses) are inspected; the alert's ``'uri'`` key is read.
        spider: The running spider. Its ``custom_whitelist`` and
            ``alexa_whitelist`` attributes are consulted when truthy.
            Each ``custom_whitelist`` entry is indexed with ``[0]`` to get
            a substring to look for in the URI. ``alexa_whitelist`` entries
            are presumably plain domain strings such as ``"example.com"``
            -- TODO confirm against the code that loads the list.

    Returns:
        The unchanged ``item`` when no whitelist rule filters it out.

    Raises:
        DropItem: If the alert's URI is empty, if the URI contains a custom
            whitelist pattern, or if the alert's domain equals, or is a
            subdomain of, an Alexa whitelist entry.
    """
    if not isinstance(item, Alert):
        return item

    uri = item['uri']
    if not uri:
        raise DropItem("Not a valid alert URI: ", uri)

    if spider.custom_whitelist:
        for pattern in spider.custom_whitelist:
            # Only the first element of each entry is used as the substring.
            if pattern[0] in uri:
                raise DropItem("Whitelisted domain found in Alert: ", uri)

    if spider.alexa_whitelist:
        try:
            domain = get_tld(uri)
        except (TldIOError, TldDomainNotFound, TldBadUrl):
            # Best effort: an unparsable URI must not suppress the alert, so
            # log the problem and let the item through.
            log.msg("Error parsing TLD. Still allowing alert for " + uri, level=log.WARNING)
        else:
            for alexa_domain in spider.alexa_whitelist:
                # Match on a label boundary only. A bare endswith() would
                # let "evilexample.com" pass as whitelisted "example.com",
                # and an empty whitelist entry would match every domain.
                if domain == alexa_domain or domain.endswith("." + alexa_domain):
                    raise DropItem("Alert domain found in Alexa Whitelist: ", domain)

    return item
评论列表
文章目录