def parse_xpath(self, response, xpath):
    appItemList = []
    sel = Selector(response)
    for url in sel.xpath(xpath).extract():
        url = urljoin(response.url, url)
        log.msg("Catch an application: %s" % url, level=log.INFO)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
#def parse_anzhi(self, response, xpath):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for script in hxs.select(xpath).extract():
#        id = re.search(r"\d+", script).group()
#        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList
Source file: android_apps_spider.py, from project Android-Repackaged-App-Detection-System (author: M157q)
def process_item(self, item, spider):
    # fingerprint the page: MD5 of the URL plus a simhash of the content
    m = hashlib.md5()
    m.update(item['url'])
    url_MD5 = m.hexdigest()
    content_simhash = Simhash(self.get_features(item['content'])).value
    language = 'en'
    # "fields" and the "filtered" query are the pre-5.0 Elasticsearch syntax
    query_json = '{"fields":["url_MD5","content_simhash"],"query":{"filtered":{"filter":{"term":{"url_MD5":"' + url_MD5 + '"}}}}}'
    es = Elasticsearch(host='192.168.1.14', port=9200, timeout=1000)
    res = es.search(index="hiddenwebs", body=query_json)
    if res['hits']['total'] == 0:
        es.index(index="hiddenwebs", doc_type="hiddenwebpages", body={"url": item['url'], "content": item['content'], "create_time": item['create_time'], "domain_name": item['domain_name'], "url_MD5": url_MD5, "title": item['title'], "content_simhash": content_simhash, "language": language})
    else:
        flag = 0
        for hit in res['hits']['hits']:
            #print content_simhash
            #print hit["fields"]["content_simhash"][0]
            if int(hit["fields"]["content_simhash"][0]) == int(content_simhash):
                log.msg('The similar pages in es %s' % (item['url']), level=log.INFO)
                flag = 1
                # note: index() with an existing id replaces the stored document with this body
                es.index(index="hiddenwebs", doc_type="hiddenwebpages", id=hit['_id'], body={"create_time": item['create_time']})
                break
        if flag == 0:
            es.index(index="hiddenwebs", doc_type="hiddenwebpages", body={"url": item['url'], "content": item['content'], "create_time": item['create_time'], "domain_name": item['domain_name'], "url_MD5": url_MD5, "title": item['title'], "content_simhash": content_simhash, "language": language})
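process_item relies on a get_features() helper that is not shown in the snippet. A minimal sketch of what it could look like, following the character n-gram example commonly shown in the simhash package's README (the 3-character width and the tokenisation are assumptions, not taken from this project):
import re
from simhash import Simhash

def get_features(s):
    # lowercase, strip non-word characters, then emit overlapping 3-character shingles
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

# Simhash(get_features(content)).value yields the 64-bit fingerprint compared in process_item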
def parse(self, response):
    for build in foreigh_7:
        item = SightItem()
        log.msg('build: ' + build, level=log.INFO)
        # geocode once and reuse the result instead of calling the API twice
        coords = baidu_geo_api(build.encode('utf-8'))
        if coords is not None:
            lng, lat = coords
        else:
            lng, lat = 1, 1
        item['lng'] = lng
        item['lat'] = lat
        item['id_num'] = self.id_num
        self.id_num += 1L
        item['category'] = u'??????'
        item['title'] = build.encode('utf-8')
        pinyin = lazy_pinyin(build)
        item['pinyin'] = ''.join(pinyin).upper()
        if lng == 1 or lat == 1:
            log.msg('no landmark found: ' + 'at line 36,' + build, level=log.INFO)
            continue
        baike_url = 'https://baike.baidu.com/item/%s' % build
        yield scrapy.Request(baike_url, meta={'item': item}, callback=self.content_parse)
def content_parse(self, response):
    log.msg('run into content_parse at line 40', level=log.INFO)
    item = response.meta['item']
    # Baidu Baike lemma summary paragraphs
    result = response.xpath(
        '//div[@class="main-content"]/div[@class="lemma-summary"]/div[@class="para"]').extract()
    if len(result) != 0:
        pattern = re.compile(r'<[^>]+>', re.S)
        description = pattern.sub('', result[0]).encode('utf-8')
    else:
        description = 'description_null'
    item['description'] = description
    picture_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&ic=0&width=0&height=0' % item[
        'title'].decode('utf-8')
    log.msg('picture_url: ' + picture_url, level=log.INFO)
    log.msg('run out content_parse at line 51', level=log.INFO)
    yield scrapy.Request(picture_url, meta={'item': item,
                                            'splash': {
                                                'endpoint': 'render.html',
                                                'args': {'wait': 0.5}
                                            }
                                            }, callback=self.picture_parse)
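The 'splash' meta dict above matches the request format used by the scrapy-splash library. Assuming that library is what drives the rendering, the same request can be written with its SplashRequest helper; a sketch of the equivalent line inside content_parse:
from scrapy_splash import SplashRequest

# equivalent of the scrapy.Request above, written with scrapy-splash's helper
# (assumes scrapy-splash is installed and configured in the project settings)
yield SplashRequest(picture_url,
                    callback=self.picture_parse,
                    endpoint='render.html',
                    args={'wait': 0.5},
                    meta={'item': item})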
def baidu_geo_api(sight_name):
    sight_name = sight_name.decode('utf-8')
    ak = 'qsQB3G3zIR1SvZ01bEIAMBHGbCCUhTgm'
    url = 'http://api.map.baidu.com/geocoder/v2/?output=json&address=%s&ak=%s' % (sight_name, ak)
    log.msg('run into baidu_geo_api at line 123, url: ' + url, log.INFO)
    try:
        response = urllib2.urlopen(url.encode('utf-8'))
        result = response.read()
        json_text = json.loads(result)
        if json_text.get('status') != 1:
            lng = json_text.get('result').get('location').get('lng')
            lng = float('%.2f' % lng)
            lat = json_text.get('result').get('location').get('lat')
            lat = float('%.2f' % lat)
            print 'lng: %.2f, lat: %.2f' % (lng, lat)
            return lng, lat
        else:
            log.msg('response status is 1 at line 132,' + sight_name, level=log.INFO)
            return 1, 1
    except urllib2.HTTPError as e:
        print 'HttpError in baidu_geo_api at line 40 %s' % e
    except TypeError as e:
        print 'TypeError in baidu_geo_api at line 53 %s' % e
    # falls through and returns None after an exception; callers check for None
def readIds(self):
    names = filter(lambda x: 'model' in x and 'json' in x,
                   os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
    print names
    if not names:
        log.msg('[spec]no model data file in data dir.', log.ERROR)
        return
    model_file_name = names[-1]
    f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
    ids = [line['id'] for line in json.loads(f.read())]
    log.msg(len(ids), log.INFO)
    return ids
def readIds(self):
    names = filter(lambda x: 'model' in x and 'json' in x,
                   os.listdir('/home/king/code/python_job/autohome_spider/data'))
    print names
    if not names:
        log.msg('[spec]no model data file in data dir.', log.ERROR)
        return
    model_file_name = names[-1]
    f = codecs.open('/home/king/code/python_job/autohome_spider/data/%s' % model_file_name, 'r')
    ids = [line['id'] for line in json.loads(f.read())]
    log.msg(len(ids), log.INFO)
    return ids
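The two readIds() variants above differ only in the hard-coded data directory. A parameterized sketch under the same file-naming convention (the data_dir argument and the sorting of candidate files are additions of this sketch, not part of the original project):
import codecs
import json
import os
from scrapy import log

def read_ids(data_dir):
    # pick the model*.json files in data_dir; sorted() makes "latest" deterministic
    names = sorted(x for x in os.listdir(data_dir) if 'model' in x and 'json' in x)
    if not names:
        log.msg('[spec]no model data file in data dir.', log.ERROR)
        return None
    path = os.path.join(data_dir, names[-1])
    with codecs.open(path, 'r', encoding='utf-8') as f:
        return [line['id'] for line in json.loads(f.read())]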
def process_request(self, request, spider):
    # note: UserAgent() reloads the fake-useragent data on every request;
    # it is usually cheaper to create it once in the middleware's __init__
    user_agent = UserAgent()
    ua = user_agent.random
    if ua:
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)
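For a middleware like this to take effect it has to be registered in the project's settings.py. A minimal sketch, assuming the class is importable as myproject.middlewares.RandomUserAgentMiddleware (module path and priority are placeholders; on pre-1.0 Scrapy the built-in middleware lives under scrapy.contrib.downloadermiddleware.useragent instead):
# settings.py (hypothetical module path and priority)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    # disable the built-in middleware so it cannot set the header first
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}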
Source file: pipelines.py, from project Android-Repackaged-App-Detection-System (author: M157q)
def process_item(self, item, spider):
    log.msg("Catch an AppItem", level=log.INFO)
    return item
def scan(html):
    alerts = list()
    matches = HTMLClassifier.yara_rules.match(data=html)
    if not len(matches) > 0:
        return alerts
    for match in matches['html']:
        print match
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara HTML Classification Match: " + alert_reason, level=log.INFO)
    return alerts
def scan(uri):
    alerts = list()
    matches = URLClassifier.yara_rules.match(data=uri.encode('ascii', 'ignore'))
    if not len(matches) > 0:
        return alerts
    for match in matches['urls']:
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara URL Classification Match: " + alert_reason, level=log.INFO)
    return alerts
def scan(js):
    alerts = list()
    matches = JSClassifier.yara_rules.match(data=js.encode('ascii', 'ignore'))
    if not len(matches) > 0:
        return alerts
    for match in matches['js']:
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara JS Classification Match: " + alert_reason, level=log.INFO)
    return alerts
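All three classifiers assume a precompiled yara_rules object whose match() results can be grouped by the namespaces 'html', 'urls' and 'js', and whose matches behave like dicts with 'tags' and 'strings' keys (the older yara-python 1.x result format; current yara-python returns Match objects with .tags and .strings attributes instead). A sketch of how such namespaced rules might be compiled, with placeholder rule file paths:
import yara

# one source file per namespace so the scan() methods can index results
# by 'html', 'urls' and 'js'; the paths below are illustrative only
yara_rules = yara.compile(filepaths={
    'html': 'rules/html.yar',
    'urls': 'rules/urls.yar',
    'js': 'rules/js.yar',
})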
def parse_response(self, response):
    page_id = ObjectId()
    analyzer = Analyzer(response)
    alerts = analyzer.inspect_response()
    elems = analyzer.get_resource_elems()
    page = analyzer.get_page_info()
    for alert in alerts:
        alert['org_id'] = self.org
        yield alert
    for elem in elems:
        elem['page_id'] = page_id
        elem['org_id'] = self.org
        yield elem
    page['page_id'] = page_id
    page['org_id'] = self.org
    yield page
    # limit the number of pages crawled per domain
    if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
        return
    for link in LxmlLinkExtractor(unique=True, deny_extensions=list(), allow_domains=self.allowed_domains).extract_links(response):
        if link.url not in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
            self.already_crawled.add(link.url)
            self.pages_crawled = self.pages_crawled + 1
            log.msg("Yielding request for " + link.url, level=log.INFO)
            yield WebdriverRequest(link.url, callback=self.parse_response)
        elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            log.msg("Reached max crawl depth: " + str(settings.PAGES_PER_DOMAIN), level=log.INFO)
            return
        else:
            log.msg("avoiding duplicate request for: " + link.url, level=log.INFO)
def info(msg):
    log.msg(str(msg), level=log.INFO)
def info(msg):
    log.msg(str(msg), level=log.INFO)
def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Spider closed: %s" % spider, level=log.INFO)
    RUNNING_CRAWLERS.remove(spider)
    if not RUNNING_CRAWLERS:
        reactor.stop()
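A sketch of how spider_closing() is commonly wired up when several crawlers are run from one script; the CrawlerRunner usage and the placeholder spider classes are assumptions, only the signal connection itself is standard Scrapy API:
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

RUNNING_CRAWLERS = []

def spider_opening(spider):
    # track live spiders so spider_closing() knows when the last one finishes
    RUNNING_CRAWLERS.append(spider)

runner = CrawlerRunner()
for spider_cls in (SpiderA, SpiderB):   # placeholder spider classes
    crawler = runner.create_crawler(spider_cls)
    crawler.signals.connect(spider_opening, signal=signals.spider_opened)
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    runner.crawl(crawler)

reactor.run()   # spider_closing() stops the reactor after the last spider closes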
def open(self, spider):
    super(RecorderScheduler, self).open(spider)
    self.stats_manager = StatsManager(spider.crawler.stats)
    settings = spider.crawler.settings
    self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED)
    if not self.recorder_enabled:
        log.msg('Recorder disabled!', log.WARNING)
        return
    log.msg('Starting recorder', log.INFO)
    recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None)
    if not recorder_storage:
        self.recorder_enabled = False
        log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING)
        return
    self.graph = graphs.Manager(
        engine=recorder_storage,
        drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES',
                                         DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES),
        clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT',
                                       DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT))
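The recorder is driven entirely by crawler settings. A sketch of the corresponding settings.py entries, using the setting names read in open() above (the SQLAlchemy-style connection string is an assumed example value, not taken from the original project):
# settings.py -- setting names come from the open() method above
RECORDER_ENABLED = True
RECORDER_STORAGE_ENGINE = 'sqlite:///record.db'   # assumed connection-string format
RECORDER_STORAGE_DROP_ALL_TABLES = True
RECORDER_STORAGE_CLEAR_CONTENT = True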
def close(self, reason):
    super(RecorderScheduler, self).close(reason)
    if self.recorder_enabled:
        log.msg('Finishing recorder (%s)' % reason, log.INFO)
        pages = self.graph.session.query(graphs.Page).filter_by(status=None).all()
        for page in pages:
            n_deleted_links = self.graph.session.query(graphs.Relation).filter_by(child_id=page.id).delete()
            if n_deleted_links:
                self.stats_manager.remove_links(n_deleted_links)
        n_deleted_pages = self.graph.session.query(graphs.Page).filter_by(status=None).delete()
        if n_deleted_pages:
            self.stats_manager.remove_pages(n_deleted_pages)
        self.graph.save()
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)
    if ua:
        # log the User-Agent chosen for this request
        print "********Current UserAgent:%s************" % ua
        log.msg('Current UserAgent:' + ua, log.INFO)
        request.headers.setdefault('User-Agent', ua)
# the default user_agent_list includes Chrome, IE, Firefox, Mozilla, Opera and Netscape strings
# more user agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)
    if ua:
        # randomly chosen User-Agent for this request
        # log.INFO("********Current UserAgent:%s************".format(ua))
        # log it
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)
# the default user_agent_list includes Chrome, IE, Firefox, Mozilla, Opera and Netscape strings
# more user agent strings can be found at
# http://www.useragentstring.com/pages/useragentstring.php
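The middleware above expects a user_agent_list attribute that is not defined in the snippet. A minimal sketch of one way to supply it, assuming a hypothetical USER_AGENT_LIST setting (the setting name, class name, and sample strings are illustrative, not from the original project):
class RandomUserAgentMiddleware(object):

    def __init__(self, user_agent_list):
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is a placeholder setting name, not one defined by Scrapy
        return cls(crawler.settings.getlist('USER_AGENT_LIST', [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
        ]))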
Source file: android_apps_spider.py, from project Android-Repackaged-App-Detection-System (author: M157q)
def parse(self, response):
    response_domain = urlparse(response.url).netloc
    appItemList = []
    cookie = {}
    xpath_rule = self.scrape_rules['xpath']
    for key in xpath_rule.keys():
        if key in response_domain:
            appItemList.extend(
                self.parse_xpath(response, xpath_rule[key]))
            break
    custom_parser_rule = self.scrape_rules['custom_parser']
    for key in custom_parser_rule.keys():
        if key in response_domain:
            appItemList.extend(
                getattr(custom_parser, custom_parser_rule[key])(response))
            break
    #if "appchina" in response_domain:
    #    xpath = "//a[@id='pc-download' and @class='free']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "hiapk" in response_domain:
    #    xpath = "//a[@class='linkbtn d1']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "android.d.cn" in response_domain:
    #    xpath = "//a[@class='down']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "anzhi" in response_domain:
    #    xpath = "//div[@id='btn']/a/@onclick"
    #    appItemList.extend(self.parse_anzhi(response, xpath))
    #else:
    #    pass
    sel = Selector(response)
    for url in sel.xpath('//a/@href').extract():
        url = urljoin(response.url, url)
        yield Request(url, meta=cookie, callback=self.parse)
    for item in appItemList:
        yield item
#def parse_appchina(self, response):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for url in hxs.select(
#            "//a[@id='pc-download' and @class='free']/@href"
#            ).extract():
#        url = urljoin(response.url, url)
#        log.msg("Catch an application: %s" % url, level=log.INFO)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList
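parse() above looks up rules in self.scrape_rules, whose contents are not shown. Based on the commented-out per-site branches, it presumably looks roughly like the following sketch (the 'parse_anzhi' entry under custom_parser is a guess at the function name in the custom_parser module):
# hedged sketch of the scrape_rules structure parse() expects,
# reconstructed from the commented-out per-site branches above
scrape_rules = {
    'xpath': {
        'appchina': "//a[@id='pc-download' and @class='free']/@href",
        'hiapk': "//a[@class='linkbtn d1']/@href",
        'android.d.cn': "//a[@class='down']/@href",
    },
    'custom_parser': {
        # anzhi needs custom handling (see parse_anzhi above); the value is a guessed function name
        'anzhi': 'parse_anzhi',
    },
}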
def picture_parse(self, response):
    log.msg('run into picture_parse at line 66', level=log.INFO)
    item = response.meta['item']
    host_address = 'http://image.baidu.com'
    path = response.xpath('//*[@id="page"]/a[10]/@href').extract_first()
    url = host_address.encode('utf-8') + path
    page_num = response.xpath('//*[@id="page"]/strong/span/text()').extract_first()
    log.msg('page_num is %s' % page_num, level=log.INFO)
    for option in response.xpath('//div[@id="imgid"]/ul[@class="imglist"]/li[@class="imgitem"]'):
        item_final = SightItem()
        item_final['title'] = item['title']
        item_final['lng'] = item['lng']
        item_final['lat'] = item['lat']
        item_final['description'] = item['description']
        item_final['category'] = item['category']
        img_src = option.xpath('a/@href').extract_first()
        result = re.search(r'.*objurl=(http.*?)&.*', img_src).groups()[0]
        img_src = urllib.unquote(result).encode('utf-8')
        item['url'] = img_src
        print 'img_src: %s ========================****==============' % img_src
        img_url = jpg_test(img_url=img_src)
        print 'function jpg_test img_url is: %s ****************************' % img_url
        # if img_url is not None:
        try:
            print 'id_num: %s' % item['id_num']
            save_img(img_url=img_url, id_num=item['id_num'])
        except TypeError as e:
            log.msg('img url is NoneType in function picture_parse at line 103: {0}'.format(e), level=log.INFO)
        if img_src is None or len(img_src) == 0:
            item['url'] = 'url_null'
            log.msg('img_src is null==============' + img_src, level=log.INFO)
        item_final['url'] = item['url']
        log.msg('img_src in line 61***********' + img_src + '; type: %s ' % type(img_src), log.INFO)
        log.msg('run out picture_parse at line 92', level=log.INFO)
        yield item
    if path and page_num < PAGE_NUM:
        log.msg('***************path**************\r\n' + path, level=log.INFO)
        yield scrapy.Request(url, meta={'item': item,
                                        'splash': {
                                            'endpoint': 'render.html',
                                            'args': {'wait': 0.5}
                                        }
                                        }, callback=self.picture_parse)
# def next_page_parse(self, response):