def parse_news(self,response):
item = response.meta.get("item",None)
# Date cutoff (disabled): close the spider once articles fall outside the crawl window
# news_date = item.get("news_date",None)
# if news_date:
#     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
#     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
#
#     delta = self.end_now-struct_date
#     if delta.days == self.end_day:
#         raise CloseSpider('today scrapy end')
soup = BeautifulSoup(response.body, "lxml")
news_content_group = soup.find("div",class_="entry-content group")
# Strip the "related posts" block so it does not end up in the article text
related_posts = news_content_group.find("div",class_="related_posts")
if related_posts: related_posts.replace_with("")
content = news_content_group.text.strip()
item["content"] = content
item["catalogue"] = u"????"
yield item
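# The commented-out block above sketches a date cutoff. A minimal, self-contained
# version of that idea is shown below; end_now/end_day are the names used in that
# block, and this is an illustration rather than the original author's code
# (>= is used instead of == so a skipped day cannot slip past the cutoff).
import datetime
from scrapy.exceptions import CloseSpider

def stop_if_too_old(news_date, end_now, end_day):
    """Raise CloseSpider once an article is older than the allowed window."""
    struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    if (end_now - struct_date).days >= end_day:
        raise CloseSpider('today scrapy end')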
def parse_article(self,response):
#content,news_no,crawl_date
item = response.meta.get("item",NewsItem())
# news_date = item.get("news_date",None)
# if news_date:
# struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
# delta = self.end_now-struct_date
# print delta.days
# if delta.days == self.end_day:
# raise CloseSpider('today scrapy end')
soup = BeautifulSoup(response.body, "lxml")
author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
abstract = soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
news_no = response.url.split("/")[-1][:-5]
item["author"] = author
item["abstract"] = abstract
item["content"] = content
item["crawl_date"] = NOW
item["news_no"] = news_no
yield item
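# The three lookups above each call soup.find() twice. A small helper along these
# lines (find_text is not part of the original code, just a possible refactor)
# keeps each lookup to a single call:
def find_text(soup, name, css_class):
    """Return the text of the first matching tag, or None when it is absent."""
    tag = soup.find(name, class_=css_class)
    return tag.text if tag else None

# Usage sketch:
#   author   = find_text(soup, "span", "name")
#   abstract = find_text(soup, "p", "excerpt")
#   content  = find_text(soup, "div", "detail")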
def parse_search(self, response):
"""
@summary: Parse the search result page and build the request for the JSON result interface.
@param response: response to the request issued by start_requests().
"""
# When Sogou asks for a verification code it redirects to a URL containing "antispider".
# In that case log the event, wait 12 hours, and close the spider so it can be restarted.
if "antispider" in response.url:
spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
time.sleep(43200) # wait 12 hours before the spider is restarted
raise CloseSpider('antispider')
# ext is the href fragment of the first matched account; it is used below to build the JSON result URL
ext = response.xpath(
'//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract() # take the first search hit; an empty ext means the search returned nothing
if not ext:
spider_logger.error("Faild searching {0} !".format(response.meta['query']))
return
# Build the URL of the JSON interface that returns the result list (10 results per page), starting from page 1
json_url = "".join(ext).replace('/gzh?','http://weixin.sogou.com/gzhjs?')+'&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
cookies = response.meta['cookies']
yield Request(json_url, callback= self.parse_index, cookies=cookies, meta ={'cookies':cookies})
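# Note: time.sleep(43200) above blocks the Twisted reactor, so the whole crawler
# stalls while waiting. A simpler pattern (a sketch, not the original author's
# code) is to close the spider right away and let an external scheduler such as
# cron restart the crawl later:
from scrapy.exceptions import CloseSpider

def abort_on_antispider(response):
    """Close the spider as soon as the anti-spider verification page appears."""
    if "antispider" in response.url:
        raise CloseSpider('antispider')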
def spider_opened(self, spider):
try:
file = open(spider.settings.get('FEED_FILE'), 'wb')
except TypeError:
raise NotConfigured('FEED_FILE parameter is missing or is not a string')
except (IOError, OSError) as e:
raise CloseSpider('Cannot open file {}: {}'.format(spider.settings.get('FEED_FILE', None), e))
self.files[spider] = file
feed_title = spider.settings.get('FEED_TITLE')
if not feed_title:
raise NotConfigured('FEED_TITLE parameter does not exist')
feed_link = spider.settings.get('FEED_LINK')
if not feed_link:
raise NotConfigured('FEED_LINK parameter does not exist')
feed_description = spider.settings.get('FEED_DESCRIPTION')
if feed_description is None:
raise NotConfigured('FEED_DESCRIPTION parameter does not exist')
feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
if isinstance(feed_exporter, six.string_types):
feed_exporter = load_object(feed_exporter)
if not issubclass(feed_exporter, RssItemExporter):
raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(feed_exporter))
self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
self.exporters[spider].start_exporting()
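# A sketch of the settings this extension reads, as they might appear in
# settings.py; the values are illustrative and FEED_EXPORTER is optional
# (it defaults to RssItemExporter and may also be given as a dotted-path string):
FEED_FILE = 'export/feed.rss'
FEED_TITLE = 'Example feed'
FEED_LINK = 'http://example.com/'
FEED_DESCRIPTION = 'Example RSS feed'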
def parse(self, response):
if response.status ==503:
raise CloseSpider("denied by remote server")
sel = Selector(response)
appends = response.meta['appends']
cityname = appends['city']
smexp = appends['cat']
xpath_exp = '//a[text()="Search for more '+smexp+'"]/@href'
if cityname==u'香港': # Hong Kong
moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Hong+Kong', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=???%2C+Hong+Kong']
elif cityname=='Adelaide':
moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide%2C+Adelaide+South+Australia%2C+Australia', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide+South+Australia+5000']
elif cityname=='Park La Brea':
moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=South+La+Brea+Avenue%2C+Los+Angeles%2C+CA+90056', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Mid-Wilshire%2C+Los+Angeles%2C+CA', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=North+La+Brea+Avenue%2C+Los+Angeles%2C+CA']
else:
searchmore = sel.xpath(xpath_exp).extract()[0]
moreLink = [response.urljoin(searchmore)]
for link in moreLink:
yield Request(url=link, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)
def process_request(self, request, spider):
if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
return
proxy = self.proxies.get_random()
if not proxy:
if self.stop_if_no_proxies:
raise CloseSpider("no_proxies")
else:
logger.warn("No proxies available; marking all proxies "
"as unchecked")
self.proxies.reset()
proxy = self.proxies.get_random()
if proxy is None:
logger.error("No proxies available even after a reset.")
raise CloseSpider("no_proxies_after_reset")
request.meta['proxy'] = proxy
request.meta['download_slot'] = self.get_proxy_slot(proxy)
request.meta['_rotating_proxy'] = True
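# A middleware like this one is enabled through DOWNLOADER_MIDDLEWARES in
# settings.py; the dotted path below is hypothetical and must point at the real
# class:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RotatingProxyMiddleware': 610,
}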
def parse_news(self,response):
#content,news_date,news_no,crawl_date,referer_web
item = response.meta.get("item",NewsItem())
pageindex = response.meta.get("pageindex",1)
soup = BeautifulSoup(response.body, "lxml")
# news_date = item.get("news_date",None)
# Publication date of the article (missing on some pages)
news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
#http://info.meadin.com/PictureNews/2938_1.shtml Exception
if news_date:
# struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
# delta = self.end_now-struct_date
# if delta.days == self.end_day:
# raise CloseSpider('today scrapy end')
referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
# Article body
art,content = None,None
art = soup.find("div",class_="article js-article")
if art:
# Strip the intro/summary block if it is present
intro = art.find("div",class_="intro")
if intro: intro.replace_with("")
content = art.text.strip()
news_no = response.url.split("/")[-1].split("_")[0]
item["news_date"]=news_date
item["content"]=content
item["referer_web"]=referer_web
item["crawl_date"]=NOW
item["news_no"]=news_no
item = judge_news_crawl(item)
if item:
yield item
else:
self.flag = pageindex
else:
logger.warning("can't find news_date.the url is %s" % response.url)
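# judge_news_crawl() is imported from elsewhere in this project. Judging from how
# it is used above (it returns the item while the news is still inside the crawl
# window and something falsy afterwards), a plausible stand-in could look like the
# sketch below; the date format and window length are assumptions:
import datetime

def judge_news_crawl(item, end_day=1):
    """Return the item while its news_date is recent enough, otherwise None."""
    news_date = item.get("news_date")
    if not news_date:
        return item
    struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
    if (datetime.datetime.now() - struct_date).days >= end_day:
        return None
    return item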
def parse(self, response):
# Page source
html = response.body
soup = BeautifulSoup(html,"lxml")
# Iterate over the news list on this page
for i in self.fetch_newslist(soup):
# raise CloseSpider(str(i['time'] == u"???"))
# if i['time'] == "???": raise CloseSpider("today news end")
request = scrapy.Request(i['news_url'],callback=self.parse_news)
request.meta['item'] = i
request.meta['pageindex'] = 1
yield request
# Grab the lasttime attribute used when loading the next batch
lasttime = "nothing"
for i in soup.select('div[class="news_li"]'):
if "lasttime" in i.attrs:
lasttime = i["lasttime"]
break
# Build the URL that loads the next batch of front-page items
# e.g. load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
load_chosen = re.search(r'data.:."(.*)".+.masonry',html)
page = 2
if load_chosen :
tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1),page,lasttime)
yield scrapy.Request(tp_url, callback=self.next_page_parse)
def start_requests(self):
# while len(self.finished) < len(self.all_urls):
current_hour = time.strftime("%Y%m%d%H", time.localtime())
if current_hour != START_HOUR:
self.logger.info("It's already %s. Stopping..." % current_hour)
return
for url, item_idx in self.all_urls.iteritems():
if not self.cookies:
raise CloseSpider("No enough cookies.")
if item_idx in self.finished:
continue
else:
yield Request(url, callback=self.parse_item)
# self.logger.info(u'Crawled %s / %s. Done :)' % (len(self.finished), len(self.all_urls)))
def process_response(self, request, response, spider):
if "antispider" in response.url:
spider_logger.error("recieve verification code in %s" % response.url)
raise CloseSpider('antispider')
return response
def __init__(self,
query=None,
start_time=None,
end_time=None,
index_pages=None):
"""
@summary: Initialise the spider; the parameters below can also be passed on the command line.
@param query: the search keyword; exactly one keyword per run.
@param start_time: only articles published at or after start_time are kept; defaults to 100 days ago.
@param end_time: only articles published at or before end_time are kept; defaults to now.
@param index_pages: maximum number of result pages to crawl; defaults to 10.
"""
# A single, non-empty search keyword is required
if query:
self.query = query # the keyword used to build the search requests
else:
# Abort immediately when no keyword is given
spider_logger.error("Spider needs a single search word each time! Check input!")
raise CloseSpider('invalid search word')
# Start of the crawl window; defaults to the last 100 days
if start_time:
self.from_time = start_time
else:
self.from_time = datetime.now()-timedelta(days=100) # default: 100 days ago
# End of the crawl window
if end_time:
self.end_time = end_time
else:
self.end_time = datetime.now() # default: now
# Number of index (result) pages to crawl
if index_pages:
self.index_pages = int(index_pages)
else:
self.index_pages = 10 # default: 10 pages
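# A sketch of how a spider with this __init__ could be started from a script;
# WeixinSearchSpider is a placeholder name, and crawl() forwards its keyword
# arguments to the spider's __init__:
from datetime import datetime
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(WeixinSearchSpider,
              query='scrapy',
              start_time=datetime(2017, 1, 1),
              end_time=datetime(2017, 4, 1),
              index_pages=5)
process.start()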
def parse_index(self, response):
"""
@summary: Parse the JSON result page and build a Request for every article inside the crawl window.
@param response: response to the request issued by parse_search().
@return: a list of Requests (article pages, plus the next result page when needed).
"""
if "antispider" in response.url:
spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
time.sleep(43200)
raise CloseSpider('antispider')
requests = []
page_list = self._get_result(response)
# Stop when the page yields no results
if not page_list:
return requests
next_page = True # whether the next result page should still be requested
# Walk through the results parsed from this page
for item in page_list:
if isinstance(item, Request): # _get_result() may hand back a Request directly
requests.append(item)
next_page = False
break
if item['publish_time'] <= self.from_time: # published no later than self.from_time; stop and do not page further
next_page = False
break
elif item['publish_time'] > self.end_time: # published after self.end_time; skip this result
continue
else:
req = Request(item['url'], self.parse_page)
# Carry the partially-filled item along with the request
req.meta["item"] = item
requests.append(req)
# If the window has not been passed yet, request the next result page; otherwise finish here
if next_page and self._next_result_page(response):
cookies = response.meta['cookies']
requests.append(Request(self._next_result_page(response),callback=self.parse_index,cookies=cookies, meta ={'cookies':cookies}))
return requests
def parse_page(self, response):
"""
@summary: Parse an article page.
@param response: response to the request issued by parse_index().
@return: the item completed by _finish_item().
"""
if "antispider" in response.url:
spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
time.sleep(43200)
raise CloseSpider('antispider')
item = response.meta["item"]
return self._finish_item(item, response)
def process_eastmoney_gubalist_item(self, item, spider):
status = item.get('status')
if status is not None and status != 200:
self.error_count += 1
if self.error_count * 5 > self.success_count:
raise CloseSpider(
'too many error occurred, shutdown gracefully.')
return item
if 'ticker_id' not in item or item['ticker_id'] == "":
raise DropItem('missing ticker_id')
self.write_to_file(item, spider.name)
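# Item pipelines such as the one above are switched on via ITEM_PIPELINES in
# settings.py; the dotted path below is hypothetical and should point at the
# real pipeline class:
ITEM_PIPELINES = {
    'myproject.pipelines.EastmoneyGubaListPipeline': 300,
}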
def test_empty_feed(self):
for partial_settings in itertools.chain.from_iterable(
itertools.combinations(self.feed_settings.items(), r)
for r in range(1, len(self.feed_settings))):
partial_settings = dict(partial_settings)
undefined_settings = [name.upper() for name in set(self.feed_settings) - set(partial_settings)]
with self.assertRaisesRegexp(NotConfigured,
'({})'.format('|'.join(undefined_settings))
if len(undefined_settings) > 1 else undefined_settings[0],
msg='The feed file, title, link and description must be specified, but the absence of {} is allowed'
.format(undefined_settings)):
with CrawlerContext(**partial_settings):
pass
with self.assertRaises(CloseSpider):
feed_settings = dict(self.feed_settings)
feed_settings['feed_file'] = 'non/existent/filepath'
with CrawlerContext(**feed_settings):
pass
with CrawlerContext(**self.feed_settings):
pass
with open(self.feed_settings['feed_file']) as data, \
open(os.path.join(os.path.dirname(__file__), 'expected_rss', 'empty_feed.rss')) as expected:
self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
def close_spider(self, reason):
# do something before the spider closes, then re-raise
raise CloseSpider(reason=reason)
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
published_at_wib = ''
try:
# Get list of news from the current page
articles = json.loads(response.text)
for article in articles['contents']:
url = article['friendlyURL']
date = article['publishTime']
published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
yield Request('http://pilkada.arah.com' + url, callback=self.parse_news)
except Exception:
raise CloseSpider('article not found')
if is_no_update:
self.logger.info('Media have no update')
return
# Get more
try:
next_date = published_at_wib - timedelta(seconds=1)
if self.media['last_scraped_at'] < wib_to_utc(next_date):
yield Request('http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
callback=self.parse)
except Exception:
pass
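# wib_to_utc() is a helper defined elsewhere in these projects. WIB (Waktu
# Indonesia Barat) is UTC+7, so a minimal stand-in could be the sketch below,
# which is an assumption rather than the projects' actual implementation:
from datetime import timedelta

def wib_to_utc(dt):
    """Convert a naive WIB (UTC+7) datetime to naive UTC."""
    return dt - timedelta(hours=7)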
# Collect news item
def parse(self, response):
self.logger.info('parse: %s' % response)
has_no_update = False
# Get list of news from the current page
for article in response.css('.col-sm-16 > .row > .col-sm-16 > .row'):
title = article.css('h4::text').extract_first()
url = article.css('a::attr(href)').extract_first()
time = article.css('.indexTime::text').extract_first() # 16:51
date = article.css('.indexDay::text').extract_first() # Sabtu, 15 Oktober 2016
date = date.split(',')[-1].strip() # 15 Oktober 2016
date_time = date + ' ' + time # 15 Oktober 2016 16:51
date_time = date_time.split(' ')
date_time = ' '.join([_(s) for s in date_time]) # Oktober => October
# Parse date information
try:
published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
has_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if has_no_update:
self.logger.info('Media have no update')
return
# Currently has no more pages
def parse_news_pilkada(self, loader, response):
date_selector = response.css('.block-judul-artikel > .tanggal::text')
try:
date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
except Exception:
return loader.load_item()
published_at = wib_to_utc(published_at_wib)
if (self.media['last_scraped_at'] >= published_at):
is_no_update = True
self.logger.info('Media have no update')
raise CloseSpider('finished')
loader.add_value('published_at', published_at)
title_selector = response.css('.block-judul-artikel > .judul-artikel')
loader.add_value('title', title_selector.extract()[0])
raw_content_selector = response.css('.block-artikel .p-artikel')
raw_content_selector = raw_content_selector.xpath('//p[not(iframe)]')
raw_content = ''
for rsl in raw_content_selector:
raw_content = raw_content + rsl.extract().strip()
loader.add_value('raw_content', raw_content)
author_name = ''
for author_name_selector in reversed(raw_content_selector):
author_name_selector = author_name_selector.css('strong::text')
for tmp in reversed(author_name_selector.extract()):
tmp = tmp.strip()
if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
author_name = tmp
break
if author_name:
break
author_name = ','.join(author_name.split(' | '))
loader.add_value('author_name', author_name)
loader.add_value('url', response.url)
def parse(self, response):
self.logger.info('parse: {}'.format(response))
is_no_update = False
# Collect list of news from current page
# Note: no next page button on cnnindonesia, all is loaded here
article_selectors = response.css('a.list_kontribusi')
if not article_selectors:
raise CloseSpider('article_selectors not found')
for article in article_selectors:
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example: Jumat, 23/09/2016 21:17
info_selectors = article.css('div.text > div > span.tanggal::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
info = info_selectors.extract()[0]
info_time = info.split(',')[1].strip()
# Parse date information
try:
# Example: 23/09/2016 21:17
published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
except ValueError as err:
raise CloseSpider('cannot_parse_date: {}'.format(err))
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy Request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
def parse(self, response):
self.logger.info('parse: {}'.format(response))
is_no_update = False
# Collect list of news from current page
articles = json.loads(response.body)['response']
for article in articles:
# Example: 2016-10-12 15:16:04
date_time_str = article['news_date_publish']
# Parse date information
try:
published_at_wib = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
except Exception as e:
raise CloseSpider('cannot_parse_date: {}'.format(e))
published_at = wib_to_utc(published_at_wib)
if (self.media['last_scraped_at'] >= published_at):
is_no_update = True
break
for sub_article in article['news_content']:
yield self.parse_news(article, sub_article)
if is_no_update:
self.logger.info('Media have no update')
return
# Collect news on next page
if len(articles) > 0:
# Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
next_page_url = response.url.split('/')
next_page_url[-4] = str(int(next_page_url[-4]) + 20)
next_page_url = '/'.join(next_page_url)
yield Request(next_page_url, callback=self.parse)
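# Worked example of the pagination above: splitting the sample URL on '/' leaves an
# empty last element (because of the trailing slash), so index -4 is the offset '0';
# adding 20 turns the next request into
# 'http://api.merdeka.com/mobile/gettag/pilgub-dki/20/20/L9pTAoWB269T&-E/'.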
# Collect news item
def check_error(self):
# Stop spider if error has been raised in pipeline
if hasattr(self, 'close_error'):
raise CloseSpider(self.close_error)
def next_request(self):
while True:
try:
url = next(self.redis_queue)
except StopIteration:
url = None
if not (url and FeedbackSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
break
if url:
return self.make_requests_from_url(url)
else:
raise CloseSpider('redis queue has no url to request')
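# The loop above skips URLs whose productId has been seen before and breaks at the
# first fresh one, which means ids.add() apparently records the id and returns a
# truthy value only when the id was already known (a plain set.add() always
# returns None). A container with that behaviour might look like this sketch,
# which is an assumption rather than the project's actual helper:
class SeenIds(object):
    def __init__(self):
        self._seen = set()

    def add(self, value):
        """Record value and report whether it had been recorded before."""
        already_seen = value in self._seen
        self._seen.add(value)
        return already_seen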
def next_request(self):
while True:
try:
url = next(self.redis_queue)
except StopIteration:
url = None
if not (url and self.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
break
if url:
return self.make_requests_from_url(url)
else:
raise CloseSpider('redis queue has no url to request')
def next_request(self):
while True:
try:
url = next(self.redis_queue)
except StopIteration:
url = None
if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
break
if url:
return self.make_requests_from_url(url)
else:
raise CloseSpider('redis queue has no url to request')
def next_request(self):
while True:
try:
url = next(self.redis_queue)
except StopIteration:
url = None
if not (url and self.ids.add(url[url.rfind('/') + 1:])):
break
if url:
return self.make_requests_from_url(url)
else:
raise CloseSpider('redis queue has no url to request')
def next_request(self):
while True:
try:
url = next(self.redis_queue)
except StopIteration:
url = None
if not (url and OrderSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
break
if url:
return self.make_requests_from_url(url)
else:
raise CloseSpider('redis queue has no url to request')
def parse(self, response):
data = json.loads(response.body)
total = int(data['totalRecord']['num'])
total_page = int(math.ceil(total/float(self.page_size)))
if total == 0:
raise CloseSpider('blocked')
for i in self.parse_item(response):
yield i
for page in range(2, total_page+1):
yield Request(url=self.get_url(page), callback=self.parse_item)
def __check_for_close(self):
"""
Check to see if this spider has been running for longer than the maximum amount
of allowed time, and stop the spider if it has.
:return: None
"""
if self._start_time is None:
self._start_time = DatetimeHelper.now()
elapsed_time = (DatetimeHelper.now() - self._start_time).total_seconds()
if elapsed_time > self.max_run_time:
raise CloseSpider(
"Spider run time exceeded maximum time of %s seconds. Closing."
% (self.max_run_time,)
)
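# DatetimeHelper is a project helper that is not part of this listing; for the
# check above it only needs a now() consistent with how _start_time was produced.
# A minimal stand-in (an assumption, not the project's implementation):
import datetime

class DatetimeHelper(object):
    @staticmethod
    def now():
        """Current UTC time as a naive datetime."""
        return datetime.datetime.utcnow()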
def open_spider(self, spider):
site_setting = spider.settings.get('SITE')
if not site_setting:
error_msg = 'Can not find the website configuration from settings.'
spider.logger.error(error_msg)
raise CloseSpider(error_msg)
self.session = self.session_maker()
site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
if not site:
site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
description=site_setting['description'], url=site_setting['url'],
image=site_setting['image'], show_seq=site_setting['show_seq'])
self.session.add(site)
self.session.commit()
self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}
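# A sketch of the SITE setting this pipeline expects in settings.py; only the keys
# are taken from the code above, the values are illustrative:
SITE = {
    'code': 'example',
    'name': 'Example Live TV',
    'description': 'Example live-streaming site',
    'url': 'http://www.example.com/',
    'image': 'http://www.example.com/logo.png',
    'show_seq': 1,
}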