def process_item(self, item, spider):
    """Normalize an item's title and turn its content into a cleaned document.

    Steps, in order:
      1. Reformat ``item['title']`` with ``self.format_title``.
      2. Ensure ``item['content']`` is an ``HtmlElement``; ``str`` and
         ``bytes`` content is parsed using ``item['encoding']``.
      3. Apply the optional per-spider removal rules (by class name, then
         by xpath) when the spider defines them.
      4. Run ``self.clean_html`` with the spider's optional allowed classes
         and safe attributes.
      5. Run ``self.make_abs_link`` with ``item['link']``.

    Args:
        item: Mapping that provides ``title``, ``content`` and ``link``, plus
            ``encoding`` when ``content`` is not already an ``HtmlElement``.
        spider: Object whose optional attributes (named by the
            ``*_NAME`` constants on this class) customize the cleaning.

    Returns:
        The same ``item``, with ``title`` and ``content`` replaced.

    Raises:
        ContentException: If ``content`` is not an ``HtmlElement``, ``str``
            or ``bytes``.
    """
    item['title'] = self.format_title(item['title'])
    doc = item['content']

    if not isinstance(doc, HtmlElement):
        if not isinstance(doc, (str, bytes)):
            raise ContentException((
                'Error in content pipeline unsupported doc type[{}]'
            ).format(doc.__class__.__name__))
        encoding = item['encoding']
        # Only text needs encoding. Passing bytes to
        # ``bytearray(..., encoding=...)`` raises TypeError on Python 3, so
        # bytes content is handed to the parser unchanged.
        raw = doc if isinstance(doc, bytes) else doc.encode(encoding)
        doc = fromstring(raw, parser=HTMLParser(encoding=encoding))

    # Remove elements by class name for clean display (optional per spider).
    removed_classes = getattr(spider, self.REMOVED_CLASSES_NAME, None)
    if removed_classes is not None:
        doc = self.remove_element_with_class(doc, removed_classes)

    # Remove elements by xpath for clean display (optional per spider).
    removed_xpath_nodes = getattr(spider, self.REMOVED_XPATH_NODES_NAME, None)
    if removed_xpath_nodes is not None:
        doc = self.remove_element_with_xpath(doc, removed_xpath_nodes)

    # Both settings are optional; None is forwarded when the spider does not
    # define them.
    allow_classes = getattr(spider, self.ALLOW_CLASSES_NAME, None)
    safe_attrs = getattr(spider, self.SAFE_ATTRS_NAME, None)
    doc = self.clean_html(doc,
                          allow_classes=allow_classes,
                          safe_attrs=safe_attrs)

    doc = self.make_abs_link(doc, item['link'])
    item['content'] = doc
    return item
评论列表
文章目录