def process_item(self, item, spider):
    """Append a first-seen item to ``self.file`` as one JSON line.

    Items are de-duplicated on ``item['link']`` using the ``self.seen`` set.

    Args:
        item: Mapping-like item; must provide a ``'link'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` after it has been written out.

    Raises:
        DropItem: If the item's link is already in ``self.seen``.
    """
    link = item['link']
    if link in self.seen:
        raise DropItem('Duplicate link %s' % link)
    self.seen.add(link)
    # ensure_ascii=False keeps non-ASCII text unescaped in the output line.
    self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    return item
# python类DropItem()的实例源码 (source-code examples for the Python class DropItem())
def process_item(self, item, spider):
    """Insert the item's link into the ``sohuurl`` table unless it is a duplicate.

    A link counts as a duplicate when it is already a field of the
    ``redis_data_dict`` hash in ``redis_db``.

    Args:
        item: Mapping-like item; must provide a ``'link'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` once the insert has been committed.

    Raises:
        DropItem: If the link is already present in the redis hash.
    """
    if redis_db.hexists(redis_data_dict, item['link']):
        raise DropItem("Duplicate item found: %s" % item)

    add_url = """insert into sohuurl(url) VALUES (%s)"""
    data_url = (str(item['link']),)
    cur = self.conn.cursor()
    try:
        # Parameterised query: the driver does the quoting of the URL.
        cur.execute(add_url, data_url)
        self.conn.commit()
    finally:
        # Close the cursor even when execute/commit raises.
        cur.close()
    return item
def process_item(self, item, spider):
    """Write each not-yet-seen item to ``self.file`` as a JSON line.

    ``self.seen`` holds the links that were already written; an item whose
    ``'link'`` is in that set is rejected.

    Args:
        item: Mapping-like item; must provide a ``'link'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` after it has been written.

    Raises:
        DropItem: If the link was seen before.
    """
    url = item['link']
    if url not in self.seen:
        self.seen.add(url)
        record = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(record + '\n')
        return item
    raise DropItem('Duplicate Link %s' % url)
def process_item(self,jd_item,JDspider):
    """Serialise the item as JSON and append it to both output files.

    ``self.file1`` receives one JSON object per line; ``self.file2`` receives
    the same object followed by a comma before the newline.

    Args:
        jd_item: Item convertible with ``dict(...)``.
        JDspider: Not used by this function.

    Returns:
        The same ``jd_item``.
    """
    payload = json.dumps(dict(jd_item))
    self.file1.write(payload + "\n")
    self.file2.write(payload + ',' + '\n')
    return jd_item
def process_item(self, item, spider):
    """Store the item in ``self.coll`` after checking that no field is empty.

    Iterating ``item`` yields field names, so the check looks up each
    field's value; testing the name itself would never detect an empty field.

    Args:
        item: Mapping-like item to validate and store.
        spider: Passed through to the log call.

    Returns:
        The same ``item`` after it has been inserted.

    Raises:
        DropItem: If any field of the item has a falsy value.
    """
    for field in item:
        if not item[field]:
            raise DropItem('Missing {}!'.format(field))
    self.coll.insert(dict(item))
    log.msg('item added to mongodb database !',level=log.DEBUG,spider=spider)
    return item
def process_item(self, item, spider):
    """Store the item in ``self.collection`` once every field is non-empty.

    Iterating ``item`` yields field names, so each field's value is looked
    up for the emptiness check; testing the name itself would never fail.

    Args:
        item: Mapping-like item to validate and store.
        spider: Passed through to the log call.

    Returns:
        The same ``item`` after it has been inserted.

    Raises:
        DropItem: If any field of the item has a falsy value.
    """
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate that no field is empty, then insert into ``self.collection``.

    The loop variable is a field name (that is what iterating ``item``
    yields), so the value is fetched explicitly before it is tested.

    Args:
        item: Mapping-like item to validate and store.
        spider: Passed through to the log call.

    Returns:
        The same ``item`` after it has been inserted.

    Raises:
        DropItem: If any field of the item has a falsy value.
    """
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Let an item through only the first time its link is encountered.

    Args:
        item: Mapping-like item; must provide a ``'link'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` when its link was not in ``self.ids_seen``.

    Raises:
        DropItem: If the link is already in ``self.ids_seen``.
    """
    link = item['link']
    if link not in self.ids_seen:
        self.ids_seen.add(link)
        return item
    raise DropItem("Duplicate item found:%s" % item)
def process_item(self, item, spider):
    """Drop items whose title or abstract matches the unwanted-word pattern.

    The title is checked first, then the abstract; matching is
    case-insensitive.

    Args:
        item: Mapping-like item with ``'title'`` and ``'abstract'`` entries.
        spider: Not used by this function.

    Returns:
        The same ``item`` when neither field matches.

    Raises:
        DropItem: If the title or the abstract matches the pattern.
    """
    # NOTE(review): the '??' runs look like non-ASCII keywords lost to an
    # encoding problem. A bare '?' at the start of an alternative is normally
    # rejected by ``re`` ("nothing to repeat") -- restore the original words.
    unwanted = u'window|??|??|????'
    for field in ('title', 'abstract'):
        text = item[field]
        if re.search(unwanted, text, re.I):
            print("ignore this item")
            raise DropItem("Contains word that you don't want: %s" % text)
    return item
def process_item(self, item, spider):
    """Insert the item into the collection named after the item's class.

    Args:
        item: Item convertible with ``dict(...)``; its class name selects
            the collection in ``self.db``.
        spider: Not used by this function.

    Returns:
        The same ``item`` after a successful insert.

    Raises:
        DropItem: If the insert fails with ``DuplicateKeyError``.
    """
    collection_name = item.__class__.__name__
    try:
        self.db[collection_name].insert(dict(item))
    except DuplicateKeyError:
        # Raise rather than return: a returned DropItem instance would be
        # handed on as if it were the processed item.
        raise DropItem("Duplicate item found: %s" % item)
    return item
def process_item(self, item, spider):
    """Drop items not fitting parameters. Open in browser if specified. Return accepted items.

    Checks run in this order: skip list, minimum monthly discount, minimum
    weekly discount, forbidden regex, required regex. Each check is skipped
    when its configuration attribute is falsy.

    Args:
        item: Mapping-like item with at least ``'id'``; ``'url'`` and the
            fields named in ``self._fields_to_check`` are read when the
            corresponding options are set.
        spider: Not used by this function.

    Returns:
        The same ``item`` when it passes every configured check.

    Raises:
        DropItem: If the item fails any configured check.
    """
    if self._skip_list and str(item['id']) in self._skip_list:
        raise DropItem('Item in skip list: {}'.format(item['id']))

    if self._minimum_monthly_discount and 'monthly_discount' in item:
        if item['monthly_discount'] < self._minimum_monthly_discount:
            raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))

    if self._minimum_weekly_discount and 'weekly_discount' in item:
        # Compare against the weekly threshold, not the monthly one.
        if item['weekly_discount'] < self._minimum_weekly_discount:
            raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))

    # check regexes
    if self._cannot_have_regex:
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._cannot_have_regex.search(v):
                raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))

    if self._must_have_regex:
        # At least one checked field has to match the required pattern.
        has_must_haves = any(
            self._must_have_regex.search(str(item[f].encode('ASCII', 'replace')))
            for f in self._fields_to_check
        )
        if not has_must_haves:
            raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))

    # open in browser
    if self._web_browser:
        webbrowser.get(self._web_browser).open(item['url'])

    return item
def item_completed(self, results, item, info):
    """Reject an item for which no image result succeeded.

    The collected paths are only used for the emptiness check; they are not
    stored on the item.

    Args:
        results: Iterable of ``(ok, data)`` pairs; ``data['path']`` is read
            for every pair whose ``ok`` is truthy.
        item: The item the results belong to.
        info: Not used by this function.

    Returns:
        The same ``item`` when at least one result succeeded.

    Raises:
        DropItem: If no result succeeded.
    """
    stored = [entry['path'] for success, entry in results if success]
    if stored:
        return item
    raise DropItem("Item contains no images")
def item_completed(self, results, item, info):
    """Pass the item on only if at least one result succeeded.

    Args:
        results: Iterable of ``(ok, data)`` pairs; ``data['path']`` is read
            for each successful pair.
        item: The item the results belong to.
        info: Not used by this function.

    Returns:
        The same ``item``, unmodified.

    Raises:
        DropItem: If there is no successful result.
    """
    paths = []
    for succeeded, data in results:
        if succeeded:
            paths.append(data['path'])
    if len(paths) == 0:
        raise DropItem("Item contains no images")
    return item
def process_item(self, item, spider):
    """Validate the item and insert it into the collection it names.

    The target collection is ``self.db[item['item_name']]``. Iterating
    ``item`` yields field names, so each value is looked up before the
    emptiness test.

    Args:
        item: Mapping-like item with ``'item_name'`` and ``'good_name'``.
        spider: Not used by this function.

    Returns:
        The same ``item`` after a successful insert.

    Raises:
        DropItem: If any field is falsy, or if the insert raises
            ``pymongo.errors.WriteError`` or ``KeyError``.
    """
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    try:
        self.db[item['item_name']].insert(dict(item))
        logging.debug("add {}".format(item['item_name']))
    except (pymongo.errors.WriteError, KeyError):
        raise DropItem(
            "Duplicated comment Item: {}".format(item['good_name']))
    return item
def process_item(self, item, spider):
    """Let an item through only the first time its ``pid`` is encountered.

    Args:
        item: Mapping-like item; must provide a ``'pid'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` when its pid was not in ``self.ids_seen``.

    Raises:
        DropItem: If the pid is already in ``self.ids_seen``.
    """
    pid = item['pid']
    if pid not in self.ids_seen:
        self.ids_seen.add(pid)
        return item
    raise DropItem("Duplicate item found: %s" % item)
def item_completed(self, results, item, info):
    """Attach successful image paths to items of the ``'sisy'`` spider.

    Items from any other spider are returned untouched.

    Args:
        results: Iterable of ``(ok, data)`` pairs; ``data['path']`` is read
            for each successful pair.
        item: Mutable mapping-like item; gets an ``'image_paths'`` entry.
        info: Object whose ``spider.name`` identifies the running spider.

    Returns:
        The same ``item``.

    Raises:
        DropItem: If the spider is ``'sisy'`` and no result succeeded.
    """
    if info.spider.name != 'sisy':
        return item
    paths = [data['path'] for succeeded, data in results if succeeded]
    if not paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = paths
    return item
def process_item(self, item, spider):
    """Drop ``WeChat`` items that ``self.is_duplicate_wechat`` reports as duplicates.

    Every other item, including non-duplicate ``WeChat`` items and items of
    other types, is returned unchanged so that no path returns ``None``.

    Args:
        item: The item to check; ``item['article_addr']`` is read for the
            drop message.
        spider: Not used by this function.

    Returns:
        The same ``item`` when it is not a duplicate.

    Raises:
        DropItem: If the item is a ``WeChat`` instance flagged as duplicate.
    """
    # Duplicate check applies to WeChat items only.
    if isinstance(item, WeChat) and self.is_duplicate_wechat(item):
        # Raise rather than return, so the exception is not passed on as an item.
        raise DropItem("Duplicate news found: %s" % item['article_addr'])
    return item
def process_item(self, item, spider):
    """Drop items whose ``ip_port`` was already recorded today.

    The marker key is ``ip_port:<YYYYMMDD>:<ip_port>``. It is built once, so
    the existence check and the write always use the same key, even when the
    call straddles midnight.

    Args:
        item: Mapping-like item; must provide an ``'ip_port'`` entry.
        spider: Not used by this function.

    Returns:
        The same ``item`` after its marker key has been set.

    Raises:
        DropItem: If the marker key already exists.
    """
    key = 'ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port'])
    if Redis.exists(key):
        raise DropItem("Duplicate item found: %s" % item)
    Redis.set(key, 1)
    return item
# Check `url` against self.seen using the part of the URL that precedes each
# separator in self.valuedict; raise DropItem for a known prefix, otherwise
# record the prefix and write the URL to self.file.
# NOTE(review): indentation was lost in this copy, so it is unclear whether the
# last three statements belong to the `elif` branch or follow the loop --
# confirm against the original before re-indenting.
def __getValue(self,url):
# Set once a prefix has been recorded, so at most one prefix is added per call.
isHaveManyQueryInUrl = False
for value in self.valuedict:
# Split on the entry with trailing newlines stripped.
div_by_value = url.split(value.rstrip('\n'))
# Text before the first occurrence of the separator (the whole URL if absent).
mm = div_by_value[0]
if mm in self.seen:
raise DropItem('Duplicate link %s' % url)
# More than one piece means the separator actually occurs in the URL.
elif len(div_by_value) > 1 and not isHaveManyQueryInUrl:
self.seen.add(mm)
isHaveManyQueryInUrl = True
line = url+'\n'
# Python 2 print statement.
print url
self.file.write(line)
def process_item(self, item, spider):
    """Drop items whose stored response did not come back with status 200.

    Args:
        item: Mapping-like item with ``'spider_response'`` (an object with a
            ``status`` attribute) and ``'url'``.
        spider: Not used by this function.

    Returns:
        The same ``item`` when the response status is 200.

    Raises:
        DropItem: For any other status.
    """
    if item['spider_response'].status == 200:
        return item
    # Something went wrong with the request; stop processing this item.
    raise DropItem("%s: Non-200 response" % item['url'])