def process_item(self, item, spider):
    """Write each new, matching item to ``self.file`` as one JSON line.

    An item is written only when its ``link`` does not contain
    ``comment`` and matches the article pattern below (an ``http...``
    URL containing ``qq.com`` and ending in ``.htm``, ``.html``,
    ``.shtm`` or ``.shtml``). Links that have been written are recorded
    in ``self.seen`` so that a repeated link is rejected.

    Args:
        item: Mapping-like object with a ``link`` key; it must be
            convertible with ``dict()`` and JSON-serialisable.
        spider: Not used by this method.

    Returns:
        The same ``item`` object. Items whose link is filtered out are
        returned without being written or recorded.

    Raises:
        DropItem: If the link is already in ``self.seen``.
    """
    link = item['link']

    # Raw strings keep '\.' as a regex escape instead of an invalid
    # string-literal escape sequence; the pattern text is unchanged.
    # NOTE(review): the '.' in 'qq.com' is unescaped and therefore
    # matches any character -- confirm whether that is intended.
    if (re.match(r'.*comment.*', link)
            or not re.match(r'^http.*qq.com.*\.s?html?$', link)):
        # NOTE(review): filtered items are handed back rather than
        # returning None implicitly -- confirm this is the wanted
        # behaviour (the alternative is raising DropItem here).
        return item

    if link in self.seen:
        raise DropItem('Duplicate link %s' % link)
    self.seen.add(link)

    # ensure_ascii=False keeps non-ASCII text readable in the output.
    self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    return item
评论列表
文章目录