def parse(self, response):
url = response.url
log.msg('[url]%s' % url)
body = response.body
soup = BeautifulSoup(body, 'lxml').select('.cardetail-infor')[0]
text = str(self.gettextonly(soup)).decode('utf-8')
    # The capture group lists the spec field labels to match; the original
    # Chinese labels were lost in transcoding and appear here as '?'.
    m = re.findall(ur'(????|????|?????|????|????|? ? ?|? ? ?|????|??????)?\n?(.+)\n', text, re.M | re.U)
    spec_map = dict((d[0], d[1]) for d in m)
    result = SpecItem()
    result['id'] = url.split('/')[-1]
    result['spec'] = spec_map
yield result
def readIds(self):
names = filter(lambda x: 'model' in x and 'json' in x,
os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
print names
if not names:
log.msg('[spec]no model data file in data dir.', log.ERROR)
return
model_file_name = names[-1]
f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
ids = [line['id'] for line in json.loads(f.read())]
log.msg(len(ids), log.INFO)
return ids
def parse(self, response):
log.msg('[parse] %s' % response.url)
    # For each series id found on the page, build the series page URL and yield a follow-up request
for seriesId in response.xpath('body/dl').re(r'id="s(\d+)"'):
series_page_url = "http://www.autohome.com.cn/" + seriesId
log.msg('series_page_url:%s' % series_page_url)
request = scrapy.Request(url=series_page_url, callback=self.parse_model_selling, dont_filter=True)
request.meta['series_id'] = seriesId
yield request
    # parse_model_selling (the callback above) handles the series' models currently on sale
def process_item(self, item, spider):
    valid = True
    for data in item:
        if not item[data]:
            valid = False
            raise DropItem('Missing {0}!'.format(data))
    if valid:
        self.collection.insert(dict(item))
        log.msg('question added to mongodb database!',
                level=log.DEBUG, spider=spider)
return item
def process_item(self, item, spider):
    for data in item:
        if not item[data]:
            raise DropItem("Missing data!")
#self.collection.update({'url': item['url']}, dict(item), upsert=True)
self.collection.insert(dict(item))
log.msg("Question added to MongoDB database!",
level=log.DEBUG, spider=spider)
return None
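Both MongoDB pipelines above use self.collection without showing where it comes from. A minimal sketch of that setup with pymongo, assuming the connection details live in the project settings (the MONGODB_* setting names and the class name are assumptions):

import pymongo
from scrapy.conf import settings


class MongoDBPipeline(object):
    def __init__(self):
        # Setting names are assumptions, not taken from the snippets above.
        client = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]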
def process_request(self,request,spider):
user_agent = UserAgent()
ua = user_agent.random
if ua:
log.msg('Current UserAgent: '+ua, level=log.INFO)
request.headers.setdefault('User-Agent', ua)
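A downloader middleware like the one above only runs once it is enabled in settings.py. A minimal sketch, where the dotted module path and the priority number are assumptions:

# settings.py -- the module path is an assumption about the project layout
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}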
pipelines.py (project: Android-Repackaged-App-Detection-System, author: M157q)
def process_item(self, item, spider):
log.msg("Catch an AppItem", level=log.INFO)
return item
def process_item(self, item, spider):
try:
self.conn.execute('insert into apps(url) values(?)',
(item['url'],)
)
self.conn.commit()
log.msg("Inserting into database");
except sqlite3.IntegrityError:
print "Duplicated"
return item
def process_item(self, item, spider):
for field in self.required_fields:
if not item[field]:
# log.msg("Field '%s' missing" % (field))
print "Field '%s' missing" % (field)
raise DropItem("Field '%s' missing: %r" % (field, item))
return item
def process_item(self, item, spider):
if 'image_urls' in item:
images = []
abpath = '%s/%s/%s/%s' % (spider.name, item['id'][0],item['id'][1],item['id'])
dir_path = '%s/%s' % (settings['IMAGES_STORE'], abpath)
if not os.path.exists(dir_path) and len(item['image_urls'])>0:
os.makedirs(dir_path)
for image_url in item['image_urls']:
name = image_url.split('/')[-1]
_i = name.rfind('!')
if _i > 4:
name = name[:_i]
            name = re.sub(r'[\\/:*?"<>]', '_', name)  # strip characters that are not valid in file names
image_file_name = name[-100:]
file_path = '%s/%s' % (dir_path, image_file_name)
images.append((image_url, file_path))
if os.path.exists(file_path):
continue
with open(file_path, 'wb') as handle:
try:
response = requests.get(image_url, stream=True)
for block in response.iter_content(1024):
if not block:
break
handle.write(block)
# log.msg("download img to %s" % file_path)
except:
continue
item['images'] = images
if not images:
pass
else:
_ = images[0][1]
item['firstimage'] = '%s/%s' % (abpath, _[_.rfind('/')+1:])
print item['firstimage']
return item
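The directory layout above hangs off settings['IMAGES_STORE'], which the snippet never defines. A minimal sketch with an illustrative path:

# settings.py -- the path is illustrative
IMAGES_STORE = '/data/crawler/images'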
def scan(html):
alerts = list()
matches = HTMLClassifier.yara_rules.match(data=html)
if not len(matches) > 0:
return alerts
for match in matches['html']:
print match
alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
alert_data = "\n".join([s['data'] for s in match['strings']])
alerts.append((alert_reason, alert_data))
log.msg("Yara HTML Classification Match: " + alert_reason, level=log.INFO)
return alerts
def scan(uri):
alerts = list()
matches = URLClassifier.yara_rules.match(data=uri.encode('ascii', 'ignore'))
if not len(matches) > 0:
return alerts
for match in matches['urls']:
alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
alert_data = "\n".join([s['data'] for s in match['strings']])
alerts.append((alert_reason, alert_data))
log.msg("Yara URL Classification Match: " + alert_reason, level=log.INFO)
return alerts
def scan(js):
alerts = list()
matches = JSClassifier.yara_rules.match(data=js.encode('ascii', 'ignore'))
if not len(matches) > 0:
return alerts
for match in matches['js']:
alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
alert_data = "\n".join([s['data'] for s in match['strings']])
alerts.append((alert_reason, alert_data))
log.msg("Yara JS Classification Match: " + alert_reason, level=log.INFO)
return alerts
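Each of the scan() functions above reads a class-level yara_rules attribute that is never defined in these snippets. A minimal sketch of how such rules are typically compiled with yara-python; the rules file path is an assumption:

import yara


class HTMLClassifier(object):
    # The path is an assumption; the real project ships its own rule files.
    yara_rules = yara.compile(filepath='resources/html_rules.yar')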
def process_item(self, item, spider):
    if not isinstance(item, Alert):
return item
uri = item['uri']
if not uri:
raise DropItem("Not a valid alert URI: ", uri)
if spider.custom_whitelist:
for (pattern) in spider.custom_whitelist:
if pattern[0] in uri:
raise DropItem("Whitelisted domain found in Alert: ", uri)
if spider.alexa_whitelist:
try:
parsed_uri = urlparse(uri)
parsed_domain = '{uri.netloc}'.format(uri=parsed_uri)
domain = get_tld(uri)
for alexa_domain in spider.alexa_whitelist:
if domain.endswith(alexa_domain):
raise DropItem("Alert domain found in Alexa Whitelist: ", domain)
except (TldIOError,TldDomainNotFound,TldBadUrl) as e:
log.msg("Error parsing TLD. Still allowing alert for " + uri, level=log.WARNING)
except:
raise
return item
def spider_opened(self, spider):
    self.conn = MySQLdb.connect(host=settings.MYSQL_HOST, db=settings.MYSQL_DB,
                                user=settings.MYSQL_USER, passwd=settings.MYSQL_PASSWORD,
                                charset='utf8', use_unicode=True)
    cursor = self.conn.cursor()
sql_str = "SELECT pattern from whitelist"
cursor.execute(sql_str)
self.custom_whitelist = cursor.fetchall()
try:
alexa_whitelist_file = pkgutil.get_data("malspider", "resources/alexa-1k-whitelist.csv").decode('ascii')
self.alexa_whitelist = alexa_whitelist_file.splitlines()
except:
log.msg("Error loading alexa whitelist...", level=log.ERROR)
def parse_response(self, response):
page_id = ObjectId()
analyzer = Analyzer(response)
alerts = analyzer.inspect_response()
elems = analyzer.get_resource_elems()
page = analyzer.get_page_info()
for alert in alerts:
alert['org_id'] = self.org
yield alert
for elem in elems:
elem['page_id'] = page_id
elem['org_id'] = self.org
yield elem
page['page_id'] = page_id
page['org_id'] = self.org
yield page
    # Limit the number of pages crawled per domain
if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
return
for link in LxmlLinkExtractor(unique=True, deny_extensions=list(), allow_domains=self.allowed_domains).extract_links(response):
        if link.url not in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
            self.already_crawled.add(link.url)
            self.pages_crawled += 1
log.msg("Yielding request for " + link.url, level=log.INFO)
yield WebdriverRequest(link.url, callback=self.parse_response)
elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
log.msg("Reached max crawl depth: " + str(settings.PAGES_PER_DOMAIN), level=log.INFO)
return
else:
log.msg("avoiding duplicate request for: " + link.url, level=log.INFO)
def _download_request(self, request, spider):
"""Download a request URL using webdriver."""
log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
request.manager.webdriver.get(request.url)
#time.sleep(5)
take_screenshot = getattr(settings, 'TAKE_SCREENSHOT', None)
screenshot_loc = getattr(settings, 'SCREENSHOT_LOCATION', None)
if take_screenshot and screenshot_loc:
screenshot_location = screenshot_loc + str(randint(10000,10000000)) + '.png'
request.manager.webdriver.save_screenshot(screenshot_location)
request.meta['screenshot'] = screenshot_location
request.meta['User-Agent'] = request.headers.get('User-Agent')
request.meta['Referer'] = request.headers.get('Referer')
return WebdriverResponse(request.url, request.manager.webdriver)
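The screenshot behaviour above is driven by two settings read with getattr; a minimal sketch of what they might look like (the values are illustrative):

# settings.py -- values are illustrative
TAKE_SCREENSHOT = True
SCREENSHOT_LOCATION = '/tmp/screenshots/'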
def _do_action_request(self, request, spider):
"""Perform an action on a previously webdriver-loaded page."""
log.msg('Running webdriver actions %s' % request.url, level=log.DEBUG)
request.actions.perform()
return WebdriverResponse(request.url, request.manager.webdriver)
def process_request(self, request, spider):
ua = random.choice(self.user_agent_list)
if ua:
        # Print the user agent chosen for this request
        print "********Current UserAgent:%s************" % ua
        # Also record it in the crawl log
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
request.headers.setdefault('User-Agent', ua)
    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape.
    # More user agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
def process_item(self, item, spider):
    for data in item:
        if not item[data]:
            raise DropItem("Missing data!")
    self.collection.update({'url': item['url']}, dict(item), upsert=True)
    log.msg("Question added to MongoDB!", level=log.DEBUG, spider=spider)
return item
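None of the item pipelines shown here run unless they are listed in ITEM_PIPELINES in settings.py. A minimal sketch, where the dotted class paths and priorities are assumptions:

# settings.py -- class paths and priorities are assumptions
ITEM_PIPELINES = {
    'myproject.pipelines.RequiredFieldsPipeline': 100,
    'myproject.pipelines.MongoDBPipeline': 300,
}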