def process_item(self, item, spider):
    """Normalize the item's content and store the item as an article.

    When ``item['content']`` is an ``HtmlElement`` it is serialized with
    ``tostring`` into pretty-printed UTF-8 HTML and ``item['encoding']`` is
    set to ``'UTF-8'``. Content that is already ``str`` or ``bytes`` is left
    untouched. A ``dict`` copy of the item is then extended with ``lang``,
    ``spider``, ``source`` and ``category`` keys and passed to
    ``save_article`` unless ``is_exists_article`` reports it already exists.

    NOTE(review): ``HtmlElement`` / ``tostring`` presumably come from lxml;
    confirm against this file's imports.

    Args:
        item: Mapping-like item with a ``'content'`` entry, or ``None``.
        spider: Object providing ``_id`` and ``title`` attributes.

    Returns:
        The ``item`` that was passed in (its content possibly serialized in
        place), or ``None`` when ``item`` is ``None``.

    Raises:
        TypeError: If the content is neither ``str``, ``bytes`` nor
            ``HtmlElement``.
    """
    # Nothing to normalize or store; hand the value straight back.
    if item is None:
        return item

    doc = item['content']
    if not isinstance(doc, (str, bytes)):
        if not isinstance(doc, HtmlElement):
            # TypeError (still an Exception subclass) names the failure
            # precisely instead of raising the catch-all base class.
            raise TypeError(
                'Error in store pipeline unsupported doc type[{}]'.format(
                    doc.__class__.__name__))
        # Serialize the parsed tree so the stored content is plain bytes.
        item['content'] = tostring(
            doc,
            encoding='UTF-8',
            pretty_print=True,
            method='html',
        )
        item['encoding'] = 'UTF-8'

    # Work on a plain dict copy so the extra storage-only keys are not
    # added to the item that is returned to the caller.
    article = dict(item)
    article['lang'] = get_article_lang(item)
    article['spider'] = spider._id
    article['source'] = spider.title
    # The category is derived last, from the copy that already carries the
    # lang/spider/source keys set above.
    article['category'] = get_category(article)

    if not is_exists_article(article):
        save_article(article)

    return item
评论列表
文章目录