def process_item(self, item, spider):
if spider.name == 'RssCrawler':
# Search the CurrentVersion table for a version of the article
try:
self.cursor.execute(self.compare_versions, (item['url'],))
except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
pymysql.IntegrityError, TypeError) as error:
self.log.error("Something went wrong in rss query: %s", error)
# Save the result of the query. Must be done before the add,
# otherwise the result will be overwritten in the buffer
old_version = self.cursor.fetchone()
if old_version is not None:
# Compare the two download dates. index 3 of old_version
# corresponds to the download_date attribute in the DB
if (datetime.datetime.strptime(
item['download_date'], "%y-%m-%d %H:%M:%S") -
old_version[3]) \
< datetime.timedelta(hours=self.delta_time):
raise DropItem("Article in DB too recent. Not saving.")
return item
def process_item(self, item, spider):
keywords = spider.search_terms
title = item['title'].lower()
#####
    # We can pass in excluded words the same way as keywords later; commented out
    # for now (see the sketch after this snippet).
# excluded_words = ['asp.net', 'java', 'c#', 'web developer', 'c++',
# 'windows', 'qa', 'support', '.net', 'manager', 'sales',
# 'marketing', 'senior', 'snr', 'salesforce', 'crm']
#####
#####
# if any(keyword in title for keyword in excluded_words):
# raise DropItem("Job title contained excluded word")
#####
if any(keyword in title for keyword in keywords):
return item
else:
raise DropItem("Job title doesn't contain our search terms")
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
            raise DropItem('Missing {0}!'.format(data))
if valid:
self.collection.insert(dict(item))
        log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)
return item
# def testdb(self):
# # Connect to MongoHQ
# con = pymongo.Connection("paulo.mongohq.com",10042)
# db = con.mytest
# db.authenticate("root", "sa123")
# db.urllist.drop()
def process_item(self, item, spider):
if isinstance(item, SsptransparenciaBO):
key = 'bos'
_id = item['id']
elif isinstance(item, SsptransparenciaVitima):
key = 'vitimas'
_id = '%s::%s' % (item['bo_id'], item['count'])
elif isinstance(item, SsptransparenciaNatureza):
key = 'naturezas'
_id = '%s::%s' % (item['bo_id'], item['count'])
if _id in self.ids_seen[key]:
raise DropItem('Duplicate item found: %s' % item)
else:
self.ids_seen[key].add(_id)
return item
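The pipeline above indexes self.ids_seen by item type; a minimal sketch of how that structure could be initialized (the open_spider hook is an assumption, not shown in the original):

class DeduplicationPipeline(object):
    """Sketch: one set of already-seen ids per item type used above."""

    def open_spider(self, spider):
        self.ids_seen = {
            'bos': set(),
            'vitimas': set(),
            'naturezas': set(),
        }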
def process_item(self, item, domain):
now = arrow.now()
seen = self.check_seen_before(item)
if len(seen) > 0:
last_seen = max(seen)
time_limit = now.replace(**self.time_scale).timestamp
if last_seen < time_limit:
self.insert_item_price(item, now.timestamp)
raise DropItem("Already seen %s, %s" % (item['url'], arrow.get(last_seen).humanize()))
else:
self.insert_item_price(item, now.timestamp)
self.insert_item_main(item)
self.insert_item_tag_list(item)
self.insert_item_description(item)
self.conn.commit()
return item
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
try:
# key = {}
# key['sku_id'] = item['sku_id']
# self.db[item['item_name']].update(key, dict(item), upsert=True)
self.db[item['item_name']].insert(dict(item))
logging.debug("add {}".format(item['item_name']))
except (pymongo.errors.WriteError, KeyError) as err:
raise DropItem("Duplicated Item: {}".format(item['name']))
return item
def process_item(self, item, spider):
db_matches = db.session.query(DBMenuEntry).filter_by(
category=item['category'],
mensa=item['mensa'],
description=item['description'],
date_valid=item['date_valid'],
allergens=item['allergens'],
price=item['price']
).all()
if db_matches:
# If there is more than one matching entry in the database, we probably
# already saved a duplicate by accident. I really hope that doesn't happen.
assert(len(db_matches) == 1)
spider.crawler.stats.inc_value('items_already_in_db')
raise DropItem(
"Menu item already found in database.\n"
"Previously scraped on: {previous_scrape_time}".format(
previous_scrape_time=str(db_matches[0].time_scraped)))
else:
return item
def item_completed(self, results, item, info):
'''
:param results:
:param item:
:param info:
:return:
    item_completed() is called once all image requests for a single item have
    completed (either downloaded successfully or failed for some reason); see
    the results example after this snippet.
'''
spiderName = self.spiderinfo.spider.name
if spiderName == 'jiandan':
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
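For reference, the results argument of item_completed() is a list of (success, info) two-tuples; for successful downloads info is a dict with keys such as 'url', 'path' and 'checksum', which is why the comprehension above keeps x['path'] only when ok is true. An illustrative shape (values are made up):

results = [
    (True, {'url': 'http://example.com/a.jpg',
            'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',
            'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
    (False, Exception('download failed')),  # failures carry the error instead of a dict
]
image_paths = [x['path'] for ok, x in results if ok]  # -> ['full/0a79c461a4062ac383dc4fade7bc09f1.jpg']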
def process_item(self, item, spider):
if spider.name == 'RssCrawler':
# Search the CurrentVersion table for a version of the article
try:
self.cursor.execute(self.compare_versions, (item['url'],))
except mysql.connector.Error as error:
self.log.error("Something went wrong in rss query: %s", error)
# Save the result of the query. Must be done before the add,
# otherwise the result will be overwritten in the buffer
old_version = self.cursor.fetchone()
if old_version is not None:
# Compare the two download dates. index 3 of old_version
# corresponds to the download_date attribute in the DB
if (datetime.datetime.strptime(
item['download_date'], "%y-%m-%d %H:%M:%S") -
old_version[3]) \
< datetime.timedelta(hours=self.delta_time):
raise DropItem("Article in DB too recent. Not saving.")
return item
def process_item(self, item, spider):
def raise_if_missing(name, item):
if name not in item:
            raise DropItem('The required field "{}" is missing in: {}.'
                           .format(name, item))
# Required fields for all items
for required in ('id', 'title', 'link'):
raise_if_missing(required, item)
# Required fields for FeedEntryItems
if isinstance(item, FeedEntryItem):
for required in ('updated',):
raise_if_missing(required, item)
return item
def process_item(self, item, spider):
if not isinstance(item, ProxyItem):
return item
if not item.get('ip', None) or not item.get('port', None):
raise DropItem('Bad ProxyItem')
item.setdefault('addr', 'Unknown')
item.setdefault('mode', 'Unknown')
item.setdefault('protocol', 'http')
item.setdefault('validation_time', 'Unknown')
proxy = '{}://{}'.format(item['protocol'], item['proxy'])
if self.conn.sismember('rookie_proxies', proxy) or\
self.conn.sismember('available_proxies', proxy) or\
self.conn.sismember('lost_proxies', proxy) or\
self.conn.sismember('dead_proxies', proxy):
raise DropItem('Already in the waiting list')
key = 'proxy_info:'+item['proxy']
pipe = self.conn.pipeline(False)
pipe.sadd('rookie_proxies', proxy)
pipe.zadd('rookies_checking', item['proxy'], time.time())
pipe.hmset(key, dict(item))
pipe.hset(key, 'failed_times', 0)
pipe.execute()
return item
def process_item(self, item, spider):
title = item.get('title', 'title_not_set')
if title == 'title_not_set':
err_msg = 'Missing title in: %s' % item.get('url')
raise DropItem(err_msg)
raw_content = item.get('raw_content', 'raw_content_not_set')
if raw_content == 'raw_content_not_set':
err_msg = 'Missing raw_content in: %s' % item.get('url')
raise DropItem(err_msg)
published_at = item.get('published_at', 'published_at_not_set')
if published_at == 'published_at_not_set':
err_msg = 'Missing published_at in: %s' % item.get('url')
raise DropItem(err_msg)
# Pass item to the next pipeline, if any
return item
def process_item(self, item, spider):
try:
data = {
'url': item['url'],
'file_name': item['file_name'],
'media_type': item['media_type'],
'host': item['host'],
'file_dir': item['file_dir'],
'download': item['download'],
'extract': item['extract'],
'info': item['info'],
'stack': item['stack'],
'media_urls': item['media_urls'],
}
self.col.update({'url': item['url']}, data, upsert=True)
# self.col.update({'url': item['url']}, {'$set': {'info': item['info']}})
# self.col.insert(data)
    except Exception as err:
logging.error(str(err))
raise DropItem(str(err))
return item
def __insert_item(self, item=None):
item, self.items = self.items, item
item.pop('index', None)
try:
data = {
'url': item['url'],
'file_name': item['file_name'],
'media_type': item['media_type'],
'host': item['host'],
'file_dir': item['file_dir'],
'download': item['download'],
'extract': item['extract'],
'info': item['info'],
'stack': item['stack'],
'media_urls': item['media_urls'],
}
self.col.update({'url': item['url']}, data, upsert=True)
# self.col.insert(data)
    except Exception as err:
logging.error(str(err))
raise DropItem(str(err))
return item
def process_item(self, item, spider):
if self.site_item_exist(item):
self.MG_table.insert(dict(item))
logging.debug("Question added to MongoDB database!")
# log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
'''
        Scrapy provides 5 logging levels:
        CRITICAL - critical errors
        ERROR    - regular errors
        WARNING  - warning messages
        INFO     - informational messages
        DEBUG    - debugging messages (the default level)
        See the LOG_LEVEL example after this snippet.
'''
else:
raise DropItem("{} is exist".format(item['url']))
return item
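The log levels listed above are controlled through Scrapy's LOG_LEVEL setting, and module-level loggers come from the standard logging module; a small illustration:

# settings.py
LOG_LEVEL = 'INFO'   # suppress DEBUG output; INFO and above are still emitted

# inside a pipeline or spider module
import logging
logger = logging.getLogger(__name__)
logger.debug("Question added to MongoDB database!")  # hidden when LOG_LEVEL = 'INFO'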
def process_item(self, item, spider):
"""Main function that process URL item (first phase)."""
# validate URL length
if len(item['raw']) > MAX_URL_LEN:
item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncating it! %r', item['raw'])
# parse raw URL
purl = get_parsed_url(item['raw'])
if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
site_id = belongs_to_site(purl.hostname, self.site_tuples)
if site_id is None:
        raise DropItem('Offsite domain: %s' % item)
item['site_id'] = site_id
# insert URL into table
try:
get_or_create_murl(spider.session, item, spider.platform_id)
except SQLAlchemyError as e:
logger.error(e)
spider.session.rollback()
        raise DropItem('Failed to insert URL into database: %s' % item)
return item
def process_item(self, item, spider):
"""Check if we need to store the item and decide whether to notify.
"""
# check if already in the database
stored = self.jobs_collection.find_one({'url': item['url']})
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
if stored:
item = stored
item['times_seen'] += 1
self.jobs_collection.update(
{'_id': item['_id']}, dict(item), False)
else:
# if not (and if not already set), add date to item
if not item.get('date_added', False):
item['date_added'] = datetime.now().isoformat()
if not item.get('date_posted', False):
item['date_posted'] = datetime.now().isoformat()
item['times_seen'] = 0
self.jobs_collection.insert(item)
return item
def _convert(self, item, spider):
image_paths = [im['path'] for im in item['images']]
datapath = spider.crawler.settings['FILES_STORE']
image_files = [datapath + path for path in image_paths]
item['pdf_file'] = '%s.pdf' % item['id']
dest = '{root}/{spider}/{file}'.format(
root=datapath,
spider=item['spider'],
file=item['pdf_file'],
)
print "file:"+dest
# Use convert command from ImageMagick.
cmd = ['convert'] + image_files + [dest]
try:
# TODO: capture errors
subprocess.check_call(cmd, stdout=subprocess.PIPE)
except subprocess.CalledProcessError as detail:
        print(detail)
raise DropItem("failed to generate PDF")
return item
def process_item(self, item, spider):
str = ""
for e in item["bookinfo"]:
if re.search(r'^\s*$', e):
print "drop this element"
else:
str = str + e + ","
item["bookinfo"] = str[:-1]
if item['name']:
if item['author']:
return item
else:
raise DropItem("Missing name or author in %s" % item)
def process_item(self, item, spider):
    item_keywords = judge_key_words(item)  # check whether the item contains any keywords
    if item_keywords:  # keep the item only if it matched at least one keyword
item["keywords"] = item_keywords
return item
else:
logger = logging.getLogger(spider.name)
logger.info("No keyword in %s" % item["news_url"])
raise DropItem("No keyword in %s" % item["news_url"])
def process_item(self, item, spider):
"""check item weather in item_seen
"""
if item['hash'] in self.item_seen:
raise DropItem('Duplicate item found: %s' %item)
else:
self.item_seen.add(item['hash'])
return item
def process_item(self, item, spider):
"""return ip is duplicate or not
:item: crawl item including host port
:returns: return item or DropItem
"""
if 'ip' not in item:
        raise DropItem('Missing ip field')
port = item.get('port', 80)
host = '%s:%s' % (item['ip'], port)
if self.conn.sismember(settings.HOST_S, host) or self.dup_in_queue(host):
raise DropItem('%s, cause duplicate' % (host))
else:
return item
def process_item(self, item, spider):
"""save to redis and return item
:item: crawl item including host port
:returns: return item or DropItem
"""
if 'ip' not in item:
        raise DropItem('Missing ip field')
port = item.get('port', 80)
host = '%s:%s' % (item['ip'], port)
self.conn.sadd(self.host_s, host)
return item
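Both Redis-backed pipelines above assume a self.conn client plus set names such as settings.HOST_S / self.host_s; a minimal sketch of that setup (host, port, db and the set name are assumptions):

import redis

class RedisProxyPipeline(object):
    """Sketch: connection and key name used by the dedup/save pipelines above."""

    def open_spider(self, spider):
        self.conn = redis.StrictRedis(host='localhost', port=6379, db=0)
        self.host_s = 'available_hosts'  # assumed Redis set name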
def process_item(self, item, spider):
job_title_company = item['title'] + item['company']
if job_title_company in self.title_company:
raise DropItem("Duplicate item found: %s" % (item))
else:
self.title_company.add(job_title_company)
return item
def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        except Exception as err:
            log.msg("Failed to add question to MongoDB: %s" % err,
                    level=log.ERROR, spider=spider)
    return item
def process_item(self, item, spider):
print "------"
if item.keys() >= 5:
if item in self.has:
raise DropItem("Duplicate item found: %s" % item)
else:
self.has.add(item)
return item
# MongoDB
def process_item(self, item, spider):
if item['pid'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
        self.ids_seen.add(item['pid'])
return item
def process_item(self, item, spider):
if item['link'] in self.seen:
raise DropItem('Duplicate link %s' % item['link'])
self.seen.add(item['link'])
line = json.dumps(dict(item), ensure_ascii=False) + '\n'
self.file.write(line)
return item
def process_item(self, item, spider):
    if not re.match('.*comment.*', item['link']):
        if re.match(r'^http.*qq\.com.*\.s?html?$', item['link']):
if item['link'] in self.seen:
raise DropItem('Duplicate link %s' % item['link'])
self.seen.add(item['link'])
line = json.dumps(dict(item), ensure_ascii=False) + '\n'
self.file.write(line)
return item
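The two JSON Lines pipelines above write to self.file and dedupe through self.seen; a minimal sketch of the setup and teardown around them (the output file name is an assumption):

class JsonLinesDedupPipeline(object):
    """Sketch: open/close handling for the JSON Lines pipelines above."""

    def open_spider(self, spider):
        self.seen = set()
        self.file = open('items.jl', 'w', encoding='utf-8')  # illustrative file name

    def close_spider(self, spider):
        self.file.close()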