def process_item(self, item, spider):
    """Validate ``item`` and insert it into the MongoDB collection.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field in item:
        # Iterating the item yields field names, so the value has to be
        # looked up explicitly; testing the name never spots an empty field.
        if not item[field]:
            raise DropItem('Missing{0}!'.format(field))
    self.collection.insert(dict(item))
    log.msg('??????!', level=log.DEBUG, spider=spider)
    return item
# def testdb(self):
# # ???MongoHQ
# con = pymongo.Connection("paulo.mongohq.com",10042)
# db = con.mytest
# db.authenticate("root", "sa123")
# db.urllist.drop()
# python类DEBUG的实例源码 (example source code that uses the DEBUG log level)
def process_item(self, item, spider):
    """Validate ``item`` and store it as a new app record in MongoDB.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field, value in item.items():
        # Check the stored value: the field name itself is never empty, so
        # testing it (as the loop used to) could not detect missing data.
        if not value:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("new app added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Persist ``item`` in the collection that belongs to the spider.

    Items from ``baiduTopStockSpider`` replace an existing document with the
    same ``num`` and ``source`` or are inserted as new ones; items from
    ``xueqiuPostSpider`` are saved to the ``post`` collection. Items from
    any other spider are passed through untouched.

    Args:
        item: Scraped item, converted to a plain dict before storage.
        spider: Spider that yielded the item; its ``name`` picks the branch.

    Returns:
        The same ``item``, unchanged.
    """
    if spider.name == 'baiduTopStockSpider':
        collection = self.db[settings['stock']]
        doc = dict(item)
        # Only the first match is used, so ask for a single document
        # instead of loading every match into a list.
        existing = collection.find_one({'num': doc["num"], 'source': doc["source"]})
        if existing is not None:
            collection.update({'_id': existing['_id']}, doc)
        else:
            collection.insert(doc)
        log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
    elif spider.name == 'xueqiuPostSpider':
        collection = self.db['post']
        collection.save(dict(item))
        log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def _retry(self, request, reason, spider):
    """Build a retry copy of ``request`` or give up once the limit is hit.

    Args:
        request: The failed request; its ``meta['retry_times']`` counter
            (default 0) decides whether another attempt is allowed.
        reason: Failure description, included in the log message.
        spider: Spider the request belongs to; forwarded to the log call.

    Returns:
        A copy of ``request`` with the retry counter bumped, duplicate
        filtering disabled and ``meta['priority']`` lowered by 10, or
        ``None`` when ``self.max_retry_times`` has been exceeded.
    """
    attempt = request.meta.get('retry_times', 0) + 1

    if attempt > self.max_retry_times:
        log.msg(format="Gave up retrying %(request)s "
                       "(failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=attempt, reason=reason)
        return None

    log.msg(format="Retrying %(request)s "
                   "(failed %(retries)d times): %(reason)s",
            level=log.DEBUG, spider=spider, request=request,
            retries=attempt, reason=reason)
    again = request.copy()
    again.meta['retry_times'] = attempt
    again.dont_filter = True
    # our priority setup is different from super
    again.meta['priority'] -= 10
    return again
def process_item(self, item, spider):
    """Clean and store flight items coming from the 'Qua' or 'Ctrip' site.

    'Qua' items have their non-empty text fields passed through ``wash``
    and are validated before insertion; 'Ctrip' items are inserted as they
    are. Items from any other site are returned without being stored.

    Args:
        item: Scraped item; ``item['site']`` selects the handling.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item`` (with washed fields for 'Qua').

    Raises:
        DropItem: If the validation of a 'Qua' item fails.
    """
    site = item['site']
    if site not in ('Qua', 'Ctrip'):
        return item

    if site == 'Qua':
        for field in ('company', 'flight_time', 'airports', 'passtime', 'price'):
            if item[field]:
                item[field] = wash(item[field])
        # NOTE(review): if item is a mapping this inspects the field names,
        # not the values, so it only trips on an empty name - confirm
        # whether empty values were meant to be rejected here.
        if not all(item):
            raise DropItem("Missing data!")

    self.collection.insert(dict(item))
    log.msg("Question added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def parse(self, response):
    """Load title, price and review counters for one product page.

    The title comes from ``response``; the price and the review counters
    are read through two separate ``Browser`` sessions, the second one
    visiting the club.jd.com review page derived from the product URL.

    Args:
        response: Response of the product page.

    Returns:
        The item produced by the ``JDspiderLoader``.
    """
    loader = JDspiderLoader(response=response)
    loader.add_xpath('title', '//*[@id="name"]/h1/text()')

    with Browser() as browser:
        browser.visit(response.url)
        price = browser.find_by_id('jd-price')
        if price == []:
            # Fall back to the alternative price element.
            price = browser.find_by_xpath('//*[@id="price"]/strong')
        # self.log(price[0].value, level=log.DEBUG)
        # The first character of the price text is skipped.
        loader.add_value('price', price[0].value[1:])

    # Loader field -> position of its <li> in the review summary list.
    counters = (('shaitu', 5), ('haoping', 2), ('zhongping', 3), ('chaping', 4))
    with Browser() as browser:
        product_id = response.url.split('/')[-1].split('.')[0]
        browser.visit('http://club.jd.com/review/' + product_id + '-2-1.html')
        for field, position in counters:
            found = browser.find_by_xpath(
                '//*[@id="comments-list"]/div[1]/ul/li[%d]/a/em' % position)
            # The first and last character of the counter text are dropped.
            loader.add_value(field, found[0].value[1:-1])

    return loader.load_item()
def dropped(self, item, exception, response, spider):
    """Describe the DEBUG-level log entry for a dropped item.

    Args:
        item: The item that was dropped.
        exception: The exception that caused the drop.
        response: Unused here.
        spider: Unused here.

    Returns:
        A dict with the keys ``level``, ``msg`` and ``args``; ``args``
        carries the exception and the item.
    """
    details = dict(exception=exception, item=item)
    return dict(level=log.DEBUG, msg=logformatter.DROPPEDMSG, args=details)
def process_item(self, item, spider):
    """Validate ``item`` and try to insert it into MongoDB.

    A failed insert does not stop the pipeline: it is reported and the item
    is still returned.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log calls.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    # Parenthesised so the statement is valid on Python 2 and Python 3.
    print('--' * 40)
    for field in item:
        # Look at the value; the field name itself is never empty.
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    try:
        self.collection.insert(dict(item))
    except Exception as exc:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate, and the cause is logged rather than lost.
        print('ggggg' * 40)
        log.msg("Failed to add question to MongoDB: {0}".format(exc),
                level=log.ERROR, spider=spider)
    else:
        log.msg("Question added to MongoDB database!",
                level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate ``item`` and insert it into ``self.coll``.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    empty_fields = [name for name in item if not item[name]]
    if empty_fields:
        # Report the first empty field. The old loop tested the field
        # names themselves and therefore never rejected anything.
        raise DropItem('Missming{}!'.format(empty_fields[0]))
    self.coll.insert(dict(item))
    log.msg('item added to mongodb database !', level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate ``item`` and store it as an event in MongoDB.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field in item:
        # The loop variable is the field name; the emptiness test must be
        # applied to the value stored under it.
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate ``item`` and store it as an event in MongoDB.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field, value in item.items():
        # Test the value, not the field name, so empty fields are caught.
        if not value:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    return item
def process_request(self, request, spider):
    """Give ``request`` a randomly chosen default User-Agent header.

    The header is only set when the request does not carry one already
    (``setdefault``); the resulting header is logged at DEBUG level.

    Args:
        request: Outgoing request whose headers are updated in place.
        spider: Spider issuing the request; used for logging.
    """
    chosen = random.choice(self.user_agent_list)
    if not chosen:
        return
    request.headers.setdefault('User-Agent', chosen)
    message = u'User-Agent: {} {}'.format(request.headers.get('User-Agent'), request)
    spider.log(message, level=log.DEBUG)
def start_listening(self):
    """Open the TCP listener and log the address it is bound to.

    The listening port returned by ``listen_tcp`` is kept in ``self.port``.
    """
    self.port = listen_tcp(self.portrange, self.host, self)
    bound = self.port.getHost()
    address = dict(host=bound.host, port=bound.port)
    log.msg(format='Web service listening on %(host)s:%(port)d',
            level=log.DEBUG, **address)
def process_item(self, item, spider):
    """Validate ``item`` and store it as a question in MongoDB.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field in item:
        # Iteration yields field names; check the value stored under each.
        if not item[field]:
            raise DropItem('Missing{0}!'.format(field))
    self.collection.insert(dict(item))
    log.msg('question added to mongodb database!',
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate ``item`` and insert it into the MongoDB collection.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        ``None``.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for value in item.values():
        # Inspect the values; looping over the item itself only yields the
        # field names, which are never empty.
        if not value:
            raise DropItem("Missing data!")
    self.collection.insert(dict(item))
    log.msg("Question added to MongoDB database!",
            level=log.DEBUG, spider=spider)
    # NOTE(review): pipelines normally hand the item back; None is kept to
    # preserve the existing return value - confirm no later stage needs it.
    return None
def _download_request(self, request, spider):
    """Download a request URL using webdriver.

    When both the ``TAKE_SCREENSHOT`` and ``SCREENSHOT_LOCATION`` settings
    are set, a screenshot is saved under a random file name and its path
    is recorded in ``request.meta['screenshot']``. The request's User-Agent
    and Referer headers are copied into ``request.meta``.

    Args:
        request: Request to load; must expose ``manager.webdriver``.
        spider: Unused here.

    Returns:
        A ``WebdriverResponse`` for the loaded page.
    """
    log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
    driver = request.manager.webdriver
    driver.get(request.url)

    want_shot = getattr(settings, 'TAKE_SCREENSHOT', None)
    shot_prefix = getattr(settings, 'SCREENSHOT_LOCATION', None)
    if want_shot and shot_prefix:
        # The random number keeps file names apart without tracking state.
        shot_path = shot_prefix + str(randint(10000, 10000000)) + '.png'
        driver.save_screenshot(shot_path)
        request.meta['screenshot'] = shot_path

    for header in ('User-Agent', 'Referer'):
        request.meta[header] = request.headers.get(header)
    return WebdriverResponse(request.url, request.manager.webdriver)
def _do_action_request(self, request, spider):
    """Perform an action on a previously webdriver-loaded page.

    Args:
        request: Request carrying the ``actions`` to perform.
        spider: Unused here.

    Returns:
        A ``WebdriverResponse`` built from the request URL and its webdriver.
    """
    message = 'Running webdriver actions %s' % request.url
    log.msg(message, level=log.DEBUG)
    request.actions.perform()
    response = WebdriverResponse(request.url, request.manager.webdriver)
    return response
def process_item(self, item, spider):
    """Validate ``item`` and upsert it into MongoDB, keyed by its URL.

    Args:
        item: Scraped item, used as a mapping of field name to value;
            ``item['url']`` identifies the document to update.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If any field of ``item`` holds a falsy value.
    """
    for field in item:
        # Check the value behind each field name; the name is never empty.
        if not item[field]:
            raise DropItem("Missing data!")
    self.collection.update({'url': item['url']}, dict(item), upsert=True)
    log.msg("Question added to MongoDB !", level=log.DEBUG, spider=spider)
    return item
def debug(msg):
    """Log ``msg``, converted with ``str``, at DEBUG level."""
    text = str(msg)
    log.msg(text, level=log.DEBUG)
#??????
def process_item(self, item, spider):
    """Write ``item`` to MongoDB, upserting when a unique key is configured.

    When ``self.__get_uniq_key()`` returns a field name, the document whose
    value for that field matches the item's is replaced or created;
    otherwise the item is simply inserted.

    Args:
        item: Scraped item, converted to a plain dict before storage.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.
    """
    if self.__get_uniq_key() is not None:
        selector = {self.__get_uniq_key(): item[self.__get_uniq_key()]}
        self.collection.update(selector, dict(item), upsert=True)
    else:
        self.collection.insert(dict(item))

    target = (settings['MONGODB_DB'], settings['MONGODB_COLLECTION'])
    log.msg("Item wrote to MongoDB database %s/%s" % target,
            level=log.DEBUG, spider=spider)
    return item
def debug(msg):
    """Send the string form of ``msg`` to the log at DEBUG level."""
    log.msg(level=log.DEBUG, *[str(msg)])
def open_spider(self, spider):
    """Connect to MongoDB and cache the ``nid`` of every stored document.

    The connection, database and collection are taken from the
    ``MONGODB_*`` settings. The collected ids are kept in
    ``self.itemlist`` (a set).

    Args:
        spider: Spider being opened; forwarded to the log call.
    """
    self.connection = pymongo.MongoClient(
        settings['MONGODB_SERVER'],
        settings['MONGODB_PORT']
    )
    self.db = self.connection[settings['MONGODB_DB']]
    self.collection = self.db[settings['MONGODB_COLLECTION']]
    log.msg('Load nid from MongoDB database!',
            level=log.DEBUG, spider=spider)
    # Only ``nid`` is read, so project just that field instead of pulling
    # every complete document from the server.
    self.itemlist = set(
        doc['nid'] for doc in self.collection.find({}, {'nid': 1}))
def process_item(self, item, spider):
    """Insert ``item`` into MongoDB unless its ``nid`` is already known.

    Args:
        item: Scraped item; ``item['nid']`` is its identity.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If ``item['nid']`` is already in ``self.itemlist``.
    """
    nid = item['nid']
    if nid in self.itemlist:
        raise DropItem('Duplication data!')
    self.collection.insert(dict(item))
    # Remember the id so a second item with the same nid in this crawl is
    # rejected too; without this only ids loaded at start-up were caught.
    self.itemlist.add(nid)
    log.msg('Goods added to MongoDB database!',
            level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate and store ``StackOverflowItem`` instances in MongoDB.

    Items of any other type are passed through untouched.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If a ``StackOverflowItem`` has a field with a falsy value.
    """
    if not isinstance(item, StackOverflowItem):
        return item
    for field in item:
        # Test the value stored under the field, not the field name.
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Validate and store ``StackOverflowItemJobs`` instances in MongoDB.

    Items of any other type are passed through untouched.

    Args:
        item: Scraped item, used as a mapping of field name to value.
        spider: Spider that yielded the item; forwarded to the log call.

    Returns:
        The same ``item``, unchanged.

    Raises:
        DropItem: If a ``StackOverflowItemJobs`` has a field with a falsy
            value.
    """
    if not isinstance(item, StackOverflowItemJobs):
        return item
    for field, value in item.items():
        # The value decides whether data is missing; the name never does.
        if not value:
            raise DropItem("Missing {0}!".format(field))
    self.collection.insert(dict(item))
    log.msg("Jobs added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Store ``item`` in the MongoDB collection that matches its type.

    Storage is best-effort: a failed insert is logged at ERROR level and
    the item is still returned. Items of an unknown type are not stored.

    Args:
        item: Scraped item; its class selects the target collection.
        spider: Spider that yielded the item; forwarded to the log calls.

    Returns:
        The same ``item``, unchanged.
    """
    # Item class -> name of the attribute holding its collection. The order
    # mirrors the isinstance checks, so the first matching class wins.
    routes = (
        (TravelCrawlItem, 'spot_review'),
        (TravelnoteItem, 'note'),
        (TravelfoodItem, 'food_review'),
        (TravelhotelItem, 'hotel_review'),
    )
    for item_class, collection_attr in routes:
        if not isinstance(item, item_class):
            continue
        try:
            getattr(self, collection_attr).insert(dict(item))
        except Exception as exc:
            # Errors used to be swallowed silently; keep going, but leave a
            # trace so failed writes can be noticed.
            log.msg("Failed to add item to MongoDB (%s): %s" % (collection_attr, exc),
                    level=log.ERROR, spider=spider)
        else:
            log.msg("News added to MongoDB database!", level=log.DEBUG, spider=spider)
        break
    return item
def parse(self, response):
    """Queue the next listing page and the detail links found on this page.

    The "next" link, if present, is pushed to the ``myspider:58_urls``
    Redis list and recorded on the item; each table row with exactly one
    link that does not contain 'zhuan' is pushed to ``myspider:start_urls``.

    Args:
        response: Response of a listing page.

    Returns:
        The item produced by the ``Pbdnof58Loader``.
    """
    loader = Pbdnof58Loader(response=response)
    next_links = response.xpath('//a[contains(@class, "next")]/@href').extract()
    self.log(next_links, level=log.DEBUG)
    redis_client = Redis()

    if next_links != []:
        next_page = self.url + next_links[0]
        redis_client.lpush('myspider:58_urls', next_page)
        # NOTE(review): presumably a crude rate limit; it blocks the caller.
        sleep(1)
        loader.add_value('UrlofPage', next_page)

    for row in response.xpath('//table[contains(@class, "tbimg")]/tr'):
        hrefs = row.xpath('td[contains(@class, "t")]/a/@href').extract()
        if len(hrefs) != 1 or 'zhuan' in hrefs[0]:
            continue
        redis_client.lpush('myspider:start_urls', hrefs[0])

    return loader.load_item()
def __do__insert(self, conn, item, spider):
    """Insert one scraped row into the ``58pbdndb`` table.

    The values are passed as query parameters, not formatted into the SQL
    text. A ``MySQLdb.Error`` is logged through the spider and not
    re-raised.

    Args:
        conn: Object whose ``execute(sql, params)`` runs the statement.
        item: Mapping providing ``title``, ``area``, ``price``, ``quality``
            and ``time``.
        spider: Spider used to log database errors.
    """
    row = (item['title'], item['area'], item['price'], item['quality'], item['time'])
    try:
        conn.execute(
            "\ninsert into 58pbdndb set title = %s, area = %s, price = %s, "
            "quality = %s, time = %s\n",
            row)
    except MySQLdb.Error as e:
        # ``as`` replaces the Python-2-only ``except X, e`` spelling, which
        # is a syntax error on Python 3.
        spider.log("Mysql Error %d: %s" % (e.args[0], e.args[1]), level=log.DEBUG)
def file_path(self, request, response=None, info=None):
    """Build the relative storage path for a downloaded image.

    Args:
        request: Image request; ``request.meta['item']`` must provide the
            ``account`` and ``album_name`` values.
        response: Unused here.
        info: Unused here.

    Returns:
        A path of the form ``<account>/<album_name>/<image id>.jpg``.
    """
    item = request.meta['item']
    # The third-from-last URL segment is used as the image's file name.
    url_segments = request.url.split('/')
    image_guid = url_segments[-3]
    log.msg(image_guid, level=log.DEBUG)
    return u'{0[account]}/{0[album_name]}/{1}.jpg'.format(item, image_guid)
def process_item(self, item, spider):
    """Filter ``item`` by title, keywords and author, then store and mail it.

    An item is kept only if its title and content are non-empty, its title
    contains none of ``settings['EXCLUDE']``, it matches one of
    ``settings['KEYS']`` (in the title) or ``settings['AUTHOR']`` (exact
    author), and no stored document has the same title.

    Args:
        item: Scraped item providing ``title``, ``content`` and ``href``
            (and ``author`` when no keyword matches the title).
        spider: Unused here.

    Returns:
        The same ``item``, after it has been inserted and mailed.

    Raises:
        DropItem: If any of the checks above fails.
    """
    # NOTE(review): if item is a mapping this walks the field names, not
    # the values, so it only trips on an empty name. Left as is because the
    # required fields are checked explicitly below - confirm intent.
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    if item['title'] == '':
        raise DropItem("title is empty")
    if item['content'] == '':
        raise DropItem("content is empty")

    title = item['title']
    if any(keyword in title for keyword in settings['EXCLUDE']):
        # This exception used to be built but never raised, so excluded
        # items were skipped for storage yet still passed down the pipeline.
        raise DropItem("title have invalid keywords")

    wanted = (any(key in title for key in settings['KEYS'])
              or any(author == item['author'] for author in settings['AUTHOR']))
    if not wanted:
        raise DropItem("item do not have keywords")

    # Reject titles that are already stored.
    for info in self.db.items.find({}, {"title": 1}):
        if info["title"].encode("utf-8") == item["title"]:
            raise DropItem("item exist!")

    self.collection.insert(dict(item))
    send_mail(item['title'], item['content'], item['href'])
    return item