def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem('Missing {0}!'.format(data))
if valid:
self.collection.insert(dict(item))
log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)
return item
# def testdb(self):
# # Test the MongoHQ connection
# con = pymongo.Connection("paulo.mongohq.com",10042)
# db = con.mytest
# db.authenticate("root", "sa123")
# db.urllist.drop()
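All of the snippets on this page use the pre-1.0 scrapy.log.msg() API, which Scrapy 1.0 deprecated in favour of standard-library logging and the per-spider spider.logger. Below is a minimal sketch of the same pipeline on the newer logging API; the class name MongoPipeline and the MongoDB host, database and collection names are placeholder assumptions, not part of the original source. Note that iterating a Scrapy item yields its field names, so the values are checked explicitly here.

import logging

import pymongo
from scrapy.exceptions import DropItem

logger = logging.getLogger(__name__)

class MongoPipeline(object):
    # hypothetical pipeline; connection details below are placeholders
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        self.collection = client["scrapy_db"]["items"]

    def process_item(self, item, spider):
        # iterating an item yields field names, so check the values explicitly
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))
        # pymongo 3.x equivalent of the deprecated insert()
        self.collection.insert_one(dict(item))
        logger.debug("Item added to MongoDB database!")
        return item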
def process_item(self, item, spider):
if len(item['ip_port']):
a = Proxy(
ip_port=item['ip_port'],
type=item['type'],
level=item['level'],
location=item['location'],
speed=item['speed'],
lifetime=item['lifetime'],
lastcheck=item['lastcheck'],
rule_id=item['rule_id'],
source=item['source']
)
session = loadSession()
try:
session.merge(a)
session.commit()
except MySQLdb.IntegrityError as e:
log.msg("MySQL Error: %s" % str(e), _level=logging.WARNING)
return item
else:
log.msg("ip_port is invalid!",_level=logging.WARNING)
def process_item(self, item, spider):
#import pudb; pu.db
#val = "{0}\t{1}\t{2}\t{3}\t".format(item['appid'], item['title'], item['recommended'], item['intro'])
#self.file.write('--------------------------------------------\n')
#self.file.write(val)
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
self.collection.insert(dict(item))
log.msg("new app added to MongoDB database!",
level=log.DEBUG, spider=spider)
return item
def parse_model_selled(self, response):
log.msg('[parse_selled] %s' % response.url)
series_id = response.meta['series_id']
data = json.loads(response.body_as_unicode())
models = data['Spec']
count = 0
for model in models:
model_id = model['Id']
model_name = model['Name']
group = model['GroupName']
price = model['Price']
model = ModelItem()
model['id'] = model_id
model['name'] = model_name
model['series_id'] = series_id
model['group'] = group
model['price'] = price
yield model
count += 1
log.msg('[parse_selled] model count is %d' % count)
android_apps_spider.py (project: Android-Repackaged-App-Detection-System, author: M157q)
def parse_xpath(self, response, xpath):
appItemList = []
sel = Selector(response)
for url in sel.xpath(xpath).extract():
url = urljoin(response.url, url)
log.msg("Catch an application: %s" % url, level=log.INFO)
appItem = AppItem()
appItem['url'] = url
appItemList.append(appItem)
return appItemList
#def parse_anzhi(self, response, xpath):
# appItemList = []
# hxs = HtmlXPathSelector(response)
# for script in hxs.select(xpath).extract():
# id = re.search(r"\d+", script).group()
# url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
# appItem = AppItem()
# appItem['url'] = url
# appItemList.append(appItem)
# return appItemList
def _do_upinsert(self, conn, item, spider):
conn.execute("""SELECT EXISTS(
SELECT 1 FROM wstable WHERE id = %s
)""", (item['id'],))
ret = conn.fetchone()[0]
uri, title, author, time, description, content, images, view, id1 = self._parseItem(item)
if ret:
conn.execute("""
update wstable set uri = %s, title = %s, author = %s, time1 = %s, description = %s, content = %s, images = %s, view1 = %s where id = %s
""", (uri,title,author,time,description,content,images,view,id1))
# log.msg("""
# update wstable set uri = %s, title = %s, author = %s, time1 = %s, description = %s, content = %s, images = %s, view1 = %s where id = %s
# """ % (uri,title,author,time,description,content,images,view,id1))
else:
# log.msg("""
# insert into wstable(id, uri, title, author, time1, description, content, images, view1)
# values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
# """ % (id1,uri,title,author,time,description,content,images,view))
conn.execute("""
insert into wstable(id, uri, title, author, time1, description, content, images, view1)
values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
""", (id1,uri,title,author,time,description,content,images,view))
# log.msg('finished item %s' % item['id'])
print 'finished item %s' % item['id']
def process_item(self, item, spider):
if spider.name == 'baiduTopStockSpider':
collection = self.db[settings['stock']]
d = dict(item)
cursor = list(collection.find({'num': d["num"], 'source': d["source"]}))
if cursor:
collection.update({'_id': cursor[0]['_id']}, d)
else:
collection.insert(d)
log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
elif spider.name == 'xueqiuPostSpider':
collection = self.db['post']
collection.save(dict(item))
log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)
return item
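The find-then-update-or-insert sequence for the stock items above can also be written as a single upsert. A brief sketch using the pymongo 3.x update_one call, reusing the collection and item names and the same two filter fields from the snippet; the older collection.update(..., upsert=True) form from this code's era offers the same option.

# Single-call upsert equivalent of the find / update / insert sequence above
# (pymongo 3.x API; the filter fields are copied from the snippet).
d = dict(item)
collection.update_one(
    {"num": d["num"], "source": d["source"]},  # match on the same two fields
    {"$set": d},                               # update the stored fields
    upsert=True,                               # insert a new document if none matches
)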
def parse(self, response):
"""
default parse method; the rules are not used for now
"""
# import pdb; pdb.set_trace()
response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
hxs = HtmlXPathSelector(response)
index_level = self.determine_level(response)
log.msg("Parse: index level:" + str(index_level))
if index_level in [1, 2, 3, 4]:
self.save_to_file_system(index_level, response)
relative_urls = self.get_follow_links(index_level, hxs)
if relative_urls is not None:
for url in relative_urls:
log.msg('yield process, url:' + url)
yield Request(url, callback=self.parse)
elif index_level == 5:
personProfile = HtmlParser.extract_person_profile(hxs)
linkedin_id = self.get_linkedin_id(response.url)
linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
if linkedin_id:
personProfile['_id'] = linkedin_id
personProfile['url'] = UnicodeDammit(response.url).markup
yield personProfile
def determine_level(self, response):
"""
determine the index level of the current response, so we can decide whether to continue crawling or not.
level 1: people/[a-z].html
level 2: people/[A-Z]\d+.html
level 3: people/[a-zA-Z0-9-]+.html
level 4: search page, pub/dir/.+
level 5: profile page
"""
import re
url = response.url
if re.match(".+/[a-z]\.html", url):
return 1
elif re.match(".+/[A-Z]\d+.html", url):
return 2
elif re.match(".+/people-[a-zA-Z0-9-]+", url):
return 3
elif re.match(".+/pub/dir/.+", url):
return 4
elif re.match(".+/search/._", url):
return 4
elif re.match(".+/pub/.+", url):
return 5
log.msg("Crawl cannot determine the url's level: " + url)
return None
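A standalone spot-check of the URL-level rules described in the docstring above. The patterns are copied from determine_level; the sample URLs are invented purely to exercise each pattern and are not claimed to be real LinkedIn paths.

import re

# (pattern, level) pairs copied from determine_level above
RULES = [
    (r".+/[a-z]\.html", 1),
    (r".+/[A-Z]\d+.html", 2),
    (r".+/people-[a-zA-Z0-9-]+", 3),
    (r".+/pub/dir/.+", 4),
    (r".+/search/._", 4),
    (r".+/pub/.+", 5),
]

def level_of(url):
    for pattern, level in RULES:
        if re.match(pattern, url):
            return level
    return None

# invented sample URLs, one per level
assert level_of("http://example.com/people/a.html") == 1
assert level_of("http://example.com/people/A12.html") == 2
assert level_of("http://example.com/people-john-doe") == 3
assert level_of("http://example.com/pub/dir/john/doe") == 4
assert level_of("http://example.com/pub/john/doe/profile") == 5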
def get_last_time(self):
try:
self.cu.execute('CREATE TABLE history (time TEXT,result TEXT,spider_name TEXT primary key)')
last_time="2015-1-1 00:00:00"
except:
try:
self.cu.execute('SELECT time FROM history where spider_name="'+self.spider_name+'"')
last_time = self.cu.fetchone()[0]
log.msg('************* '+last_time,level=log.WARNING)
except:
last_time="2015-5-1 00:00:00"
log.msg('************* '+last_time,level=log.WARNING)
last_time = time.strptime(last_time, '%Y-%m-%d %H:%M:%S')
last_time = time.mktime(last_time)
return last_time
def insert_new_time(self):
if time.mktime(time.strptime(self.item_max_time, '%Y-%m-%d %H:%M:%S')) < time.time():
if self.sqlite_flag:
try:
log.msg('delete from history where spider_name='+self.spider_name,level=log.WARNING)
self.cu.execute('delete from history where spider_name="'+self.spider_name+'"')
self.sx.commit()
except sqlite3.OperationalError as e:
log.msg('__________',level=log.WARNING)
pass
sql = "insert into history values(?,?,?)"
params = (self.item_max_time,self.item_max_id,self.spider_name)
self.cu.execute(sql,params)
self.sx.commit()
self.close_sqlite()
def _retry(self, request, reason, spider):
retries = request.meta.get('retry_times', 0) + 1
if retries <= self.max_retry_times:
log.msg(format="Retrying %(request)s " \
"(failed %(retries)d times): %(reason)s",
level=log.DEBUG, spider=spider, request=request,
retries=retries, reason=reason)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
# our priority setup is different from super
retryreq.meta['priority'] = retryreq.meta['priority'] - 10
return retryreq
else:
log.msg(format="Gave up retrying %(request)s "\
"(failed %(retries)d times): %(reason)s",
level=log.DEBUG, spider=spider, request=request,
retries=retries, reason=reason)
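For this customised retry behaviour to replace Scrapy's stock one, the middleware has to be registered in the project settings. A minimal sketch follows; the dotted path myproject.middlewares.CustomRetryMiddleware is a placeholder, and on Scrapy versions older than 1.0 the built-in class lives under scrapy.contrib.downloadermiddleware.retry instead.

# settings.py sketch: swap the built-in RetryMiddleware for the custom one above.
# The dotted path "myproject.middlewares.CustomRetryMiddleware" is a placeholder.
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,  # disable the default
    "myproject.middlewares.CustomRetryMiddleware": 550,          # run in the default slot
}
RETRY_TIMES = 3  # read as max_retry_times by the stock RetryMiddleware this class presumably extends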
def process_item(self,item,spider):
m = hashlib.md5()
m.update(item['url'])
url_MD5 = m.hexdigest()
content_simhash = Simhash(self.get_features(item['content'])).value
language = 'en'
query_json='{"fields":["url_MD5","content_simhash"],"query":{"filtered":{"filter":{"term":{"url_MD5":"'+url_MD5+'"}}}}}'
es = Elasticsearch(host='192.168.1.14',port=9200,timeout=1000)
res = es.search(index="hiddenwebs", body=query_json)
if res['hits']['total'] == 0:
es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})
else:
flag = 0
for hit in res['hits']['hits']:
#print content_simhash
#print hit["fields"]["content_simhash"][0]
if int(hit["fields"]["content_simhash"][0]) == int(content_simhash):
log.msg('The similar pages in es %s'%(item['url']),level=log.INFO)
flag = 1
es.index(index="hiddenwebs", doc_type="hiddenwebpages", id=hit['_id'], body={"create_time":item['create_time']})
break
if flag == 0 :
es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})
def process_item(self, item, spider):
if item['site'] == 'Qua':
if item['company']:
item['company'] = wash(item['company'])
if item['flight_time']:
item['flight_time'] = wash(item['flight_time'])
if item['airports']:
item['airports'] = wash(item['airports'])
if item['passtime']:
item['passtime'] = wash(item['passtime'])
if item['price']:
item['price'] = wash(item['price'])
for data in item:
if not data:
raise DropItem("Missing data!")
self.collection.insert(dict(item))
log.msg("Question added to MongoDB database!",
level=log.DEBUG, spider=spider)
elif item['site'] == 'Ctrip':
self.collection.insert(dict(item))
log.msg("Question added to MongoDB database!",
level=log.DEBUG, spider=spider)
return item
def parse(self, response):
for build in foreigh_7:
item = SightItem()
log.msg('build: ' + build, level=log.INFO)
if baidu_geo_api(build.encode('utf-8')) is not None:
lng, lat = baidu_geo_api(build.encode('utf-8'))
else:
lng, lat = 1, 1
item['lng'] = lng
item['lat'] = lat
item['id_num'] = self.id_num
self.id_num += 1L
item['category'] = u'??????'
item['title'] = build.encode('utf-8')
pinyin = lazy_pinyin(build)
item['pinyin'] = ''.join(pinyin).upper()
if lng == 1 or lat == 1:
log.msg('no landmark found: ' + 'at line 36,' + build, level=log.INFO)
continue
baike_url = 'https://baike.baidu.com/item/%s' % build
yield scrapy.Request(baike_url, meta={'item': item}, callback=self.content_parse)
def content_parse(self, response):
log.msg('run into content_parse at line 40', level=log.INFO)
item = response.meta['item']
result = response.xpath(
'//div[@class="main-content"]/div[@class="lemma-summary"]/div[@class="para"]').extract() # ????
if len(result) != 0:
pattern = re.compile(r'<[^>]+>', re.S)
description = pattern.sub('', result[0]).encode('utf-8')
else:
description = 'description_null'
item['description'] = description
picture_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&ic=0&width=0&height=0' % item[
'title'].decode('utf-8')
log.msg('picture_url: ' + picture_url, level=log.INFO)
log.msg('run out content_parse at line 51', level=log.INFO)
yield scrapy.Request(picture_url, meta={'item': item,
'splash': {
'endpoint': 'render.html',
'args': {'wait': 0.5}
}
}, callback=self.picture_parse)
def google_geo_api(sight_name):
sight_name = sight_name.decode('utf-8')
key = "AIzaSyDJtV9r7rAr9EBwlQ8Rbxvo6e7CkJsLn4k"
url = "https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=AIzaSyAw-IJpHf6CYtb4OVgrj2MB7pmXlbSs7aY%s" % (sight_name, key)
print 'url: %s' % url
response = urllib2.urlopen(url.encode('utf-8'))
result = response.read()
json_loads = json.loads(result)
if json_loads.get('status') == 'OK':
location = json_loads.get('results')[0].get('geometry').get('location')
lat = location.get('lat')
lat = float('%.2f' % lat)
lng = location.get('lng')
lng = float('%.2f' % lng)
print ('lat: %s\r\n lng %s' % (lat, lng))
return lng, lat
else:
log.msg('No lat/lng result returned for this address')
return 1, 1
# json_text = json.loads(result)
# lng = json_text.get('geometry')
# print ('lng: %s' % lng)
def process_exception(self, request, exception, spider):
proxy = request.meta['proxy']
log.msg('Removing failed proxy <%s>, %d proxies left' % (
proxy, len(self.proxies)))
try:
del self.proxies[proxy]
except KeyError:  # deleting a missing key raises KeyError, not ValueError
pass
def process_item(self, item, spider):
valid = True
print '--'*40
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
try:
self.collection.insert(dict(item))
log.msg("Question added to MongoDB database!",
level=log.DEBUG, spider=spider)
except Exception as e:
print 'Failed to insert item into MongoDB: %s' % e
return item
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem('Missing {}!'.format(data))
if valid:
self.coll.insert(dict(item))
log.msg('item added to mongodb database !',level=log.DEBUG,spider=spider)
return item
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
self.collection.insert(dict(item))
log.msg("Event added to MongoDB database!",
level=log.DEBUG, spider=spider)
return item
def close_spider(self, spider, reason):
if self._dump:
log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()),
spider=spider)
self._persist_stats(self.get_stats(), spider)
def process_request(self, request, spider):
ua = random.choice(self.user_agent_list)
if ua:
# show which User-Agent is being used
#print "********Current UserAgent:%s************" %ua
# log it
log.msg('Current UserAgent: '+ua, _level=logging.INFO)
request.headers.setdefault('User-Agent', ua)
# the default user_agent_list includes Chrome, IE, Firefox, Mozilla, Opera and Netscape
# for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
def process_request(self, request, spider):
# Set the location of the proxy
pro_adr = random.choice(self.proxyList)
log.msg("Current Proxy <%s>" % pro_adr,_level=logging.INFO)
request.meta['proxy'] = "http://" + pro_adr
def parse_datetime(value):
try:
d = parse(value)
except ValueError:
log.msg('Unable to parse %s' % value, level=log.WARNING)
return value
else:
return d.isoformat()
def parse_date(value):
try:
d = parse(value)
except ValueError:
log.msg('Unable to parse %s' % value, level=log.WARNING)
return value
else:
return d.strftime("%Y-%m-%d")
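Both helpers above apparently rely on dateutil's parse (the import is not shown in the snippet). A short usage sketch under that assumption:

# Assumes that "parse" above is dateutil.parser.parse; the import is not shown in the snippet.
from dateutil.parser import parse

print(parse("March 5, 2015 10:30").isoformat())   # -> 2015-03-05T10:30:00
print(parse("2015/03/05").strftime("%Y-%m-%d"))   # -> 2015-03-05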
def start_listening(self):
self.port = listen_tcp(self.portrange, self.host, self)
h = self.port.getHost()
log.msg(format='Web service listening on %(host)s:%(port)d',
level=log.DEBUG, host=h.host, port=h.port)
def parse(self, response):
book_id = response.url.strip('/').split('/')[-1]
log.msg('book_id[%s].' % book_id)
book_name = response.xpath('//title/text()')[0].extract().strip(' (豆瓣)')
bean = BookName()
bean['book_id'] = book_id
bean['book_name'] = book_name
yield bean