def __init__(self, *args, **kwargs):
    """Install the crawl rule and choose the start URLs.

    A single ``Rule`` is created that extracts links with
    ``self.get_link_extractor()``, passes them through
    ``self.limit_links``, uses ``self.parse_item`` as the callback and
    has ``follow=True``.

    Start URLs are read from the file named by the ``TARGET_SITES``
    setting (one URL per line) when that setting is set and points at an
    existing file; otherwise ``self.default_start_url`` is used.

    Args:
        *args: Forwarded unchanged to the parent initializer.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    # NOTE(review): the rule list is assigned before the parent
    # initializer runs, presumably because a CrawlSpider-style parent
    # reads ``self.rules`` there -- keep this ordering.
    self.rules = [
        Rule(
            self.get_link_extractor(),
            callback=self.parse_item,
            process_links=self.limit_links,
            follow=True,
        )
    ]
    super(WebSpider, self).__init__(*args, **kwargs)

    target_sites = settings.get('TARGET_SITES')
    if target_sites and os.path.isfile(target_sites):
        with open(target_sites) as target_sites_file:
            raw_lines = target_sites_file.read().splitlines()
        # Strip surrounding whitespace and drop blank or whitespace-only
        # lines so that no empty or padded URL ends up in start_urls.
        stripped = (line.strip() for line in raw_lines)
        self.start_urls = [url for url in stripped if url]
    else:
        default = self.default_start_url
        # default_start_url is defined outside this block; if it is a
        # single URL string, wrap it so iterating start_urls yields URLs
        # rather than individual characters. A list is left untouched.
        if isinstance(default, str):
            default = [default]
        self.start_urls = default
评论列表
文章目录