from scrapy import signals
from scrapy.settings import Settings
from scrapy.xlib.pydispatch import dispatcher

def main():
    """Main routine for running the spider."""
    # Set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # Set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # Register the spider with the crawler (EuropythonSpyder is defined elsewhere;
    # CrawlerProcess.crawl expects the spider class, not an instance)
    crawler.crawl(EuropythonSpyder)

    # Start Scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the registered spider
    print("ENGINE STOPPED")
def main():
    """Main routine for running the spider."""
    # Set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # Set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # Register the spider with the crawler (BloggerSpider is defined elsewhere)
    crawler.crawl(BloggerSpider)

    # Start Scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the registered spider
    print("ENGINE STOPPED")
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # Set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # Set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # Register the spider with the crawler (PydataSpiderDetails is defined elsewhere)
    crawler.crawl(PydataSpiderDetails)

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def _parse_spider_response(self, spider_response):
    """
    :param spider_response: iterable returned by the spider's parse method
    :return: generator of job items
    """
    for response_item in spider_response:
        if isinstance(response_item, Request):
            request = response_item
            # Map the request URL onto a local dump file
            file_path = self._dump_format % request.url.replace(self._replace, self._dump_dir)
            if 'file://' in file_path:
                file_path = file_path.replace('file://', '')
            response = fake_response_from_file(
                file_path=file_path,
                request=request,
                response_class=HtmlResponse
            )
            # If the request carries a callback, it is a job-page request
            if request.callback:
                for item in request.callback(response):
                    yield item
            # Otherwise it is a next-page request: recurse into its parse output
            else:
                for job_item in self._parse_spider_response(self._spider.parse(response)):
                    yield job_item
        elif isinstance(response_item, Item):
            yield response_item
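# fake_response_from_file is not defined in this snippet; a common testing
# helper of that name builds a Scrapy response from an HTML file on disk.
# A minimal sketch, assuming UTF-8 fixture files:
from scrapy.http import HtmlResponse

def fake_response_from_file(file_path, request=None, response_class=HtmlResponse):
    # Read the saved page body and wrap it in a response object so that
    # spider callbacks can be exercised without network access
    with open(file_path, 'rb') as f:
        body = f.read()
    url = request.url if request is not None else 'file://' + file_path
    return response_class(url=url, request=request, body=body, encoding='utf-8')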
def _get_item_field_attr(self, field_name, key, default=None):
    # Read a metadata key from the item's Field declaration, if available
    if isinstance(self.item, Item):
        value = self.item.fields[field_name].get(key, default)
    else:
        value = default
    return value
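# _get_item_field_attr reads per-field metadata: any keyword arguments
# passed to Field() are stored in Item.fields. A hypothetical usage sketch:
from scrapy import Field, Item

class JobItem(Item):
    # 'serializer' and 'output_name' become metadata keys of this field
    salary = Field(serializer=str, output_name='Salary')

# With self.item set to a JobItem instance:
#   self._get_item_field_attr('salary', 'output_name', 'salary')   -> 'Salary'
#   self._get_item_field_attr('salary', 'missing_key', 'fallback') -> 'fallback'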
def parse_blog(self, response):
    print('Parsed link: %s' % response.url)
    hxs = HtmlXPathSelector(response)
    item = HackerWayItem()
    item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
    item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
    item['tag'] = hxs.select("//meta[@property='og:title']/@content").extract()  # XPath selector for the tag (meta tags carry their value in @content, not in text nodes)
    item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
    return item  # Return the populated item
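# HtmlXPathSelector was removed from modern Scrapy in favor of response.xpath.
# A sketch of the same callback on current Scrapy:
def parse_blog(self, response):
    print('Parsed link: %s' % response.url)
    item = HackerWayItem()
    item['title'] = response.xpath('//title/text()').extract()
    item['author'] = response.xpath("//span[@class='author']/a/text()").extract()
    item['tag'] = response.xpath("//meta[@property='og:title']/@content").extract()
    item['date'] = response.xpath("//span[@class='date']/text()").extract()
    return item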