def process_item(self, item, spider):
    """Validate a scraped item, store it in MongoDB and mail a notification.

    An item is accepted only when all of the following hold:

    * ``title`` and ``content`` are not empty and every field present on
      the item has a truthy value;
    * the title contains none of the ``settings['EXCLUDE']`` keywords;
    * the title contains one of ``settings['KEYS']`` or the author equals
      one of ``settings['AUTHOR']``;
    * no document with the same title is found in ``self.db.items``.

    Args:
        item: Mapping-like scraped item; ``title``, ``content``, ``author``
            and ``href`` are read from it.
        spider: The spider that produced the item (not used here).

    Returns:
        The unchanged ``item`` once it has been inserted and mailed.

    Raises:
        DropItem: If any of the checks above rejects the item.
    """
    title = item['title']

    # Checked before the generic loop so these two fields keep their
    # specific messages.
    if title == '':
        raise DropItem("title is empty")
    if item['content'] == '':
        raise DropItem("content is empty")

    # Iterating the item yields field names (as for a dict), so the value
    # must be looked up explicitly; testing the name itself never rejects
    # anything.
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))

    # The exception has to be raised: merely constructing it would let the
    # item continue through the pipeline.
    if any(keyword in title for keyword in settings['EXCLUDE']):
        raise DropItem("title have invalid keywords")

    is_wanted = (
        any(key in title for key in settings['KEYS'])
        or any(author == item['author'] for author in settings['AUTHOR'])
    )
    if not is_wanted:
        raise DropItem("item do not have keywords")

    # Let MongoDB search for the title instead of fetching every document
    # and comparing in Python.
    # NOTE(review): duplicates are looked up in self.db.items while inserts
    # go to self.collection — confirm both refer to the same collection.
    if self.db.items.find_one({"title": title}, {"_id": 1}) is not None:
        raise DropItem("item exist!")

    # NOTE(review): Collection.insert is deprecated in PyMongo 3 and removed
    # in PyMongo 4; switch to insert_one once the minimum version is known.
    self.collection.insert(dict(item))
    send_mail(title, item['content'], item['href'])
    return item
# Comment list
# Table of contents