def parse_news(self, response):
# print response.url, "response"
PageKey = response.meta.get("topic_id")
PageNumber = response.meta.get("PageNumber")
flag_id = str(int(PageKey) - 40037910)
soup = BeautifulSoup(response.body, "lxml")
#2016-07-13
news_date = soup.find("time").text if soup.find("time") else None
# print self.flag[flag_id],int(PageNumber)
"""
?????????self.flag[flag_id]??0??????????????
??????????????????????????????
self.flag[flag_id]=????
"""
if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
# Convert the date string into a datetime object
struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
# print self.end_now,struct_date,"time"
delta = self.end_now-struct_date
# print delta.days,"delta day ~~~~~~~~~~~~~~~~"
if delta.days > self.end_day:
self.flag[str(flag_id)]=int(PageNumber)
# print flag_id,"stop ~~~~~~"
# raise CloseSpider('today scrapy end')
else:
head = soup.find("div",class_="post-head")
topic,title,abstract=None,None,None
if head:
topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
title =head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
news_no = response.url.split("/")[-1].split("?")[0]
# TODO: handle content that is rendered by JavaScript
item = NewsItem(title=title, topic=topic,
abstract=abstract, news_date=news_date,
content=content, news_no=news_no,
crawl_date=NOW, news_url=response.url, catalogue='????')
yield item
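
The NewsItem class instantiated above is not part of this excerpt; judging from the keyword arguments it receives, it is presumably a plain scrapy.Item along the lines of the sketch below (the field names are taken from the call above, the class body itself is an assumption).

# Assumed sketch of the item class used by parse_news above; the field names
# come from the constructor call, the definition itself is not in the source.
import scrapy

class NewsItem(scrapy.Item):
    title = scrapy.Field()
    topic = scrapy.Field()
    abstract = scrapy.Field()
    news_date = scrapy.Field()
    content = scrapy.Field()
    news_no = scrapy.Field()
    crawl_date = scrapy.Field()
    news_url = scrapy.Field()
    catalogue = scrapy.Field()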
Example source code for the Python class CloseSpider()
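
Every snippet on this page stops the crawl by raising scrapy.exceptions.CloseSpider from inside a callback. A minimal, self-contained illustration of that pattern (the spider name and start URL below are placeholders, not taken from the snippets):

# Minimal CloseSpider illustration; spider name and start URL are placeholders.
import scrapy
from scrapy.exceptions import CloseSpider

class ExampleSpider(scrapy.Spider):
    name = 'closespider_example'
    start_urls = ['http://example.com/news']

    def parse(self, response):
        articles = response.css('article')
        if not articles:
            # Raising CloseSpider shuts the spider down gracefully,
            # recording the given reason in the crawl stats.
            raise CloseSpider('article not found')
        for article in articles:
            yield {'url': article.css('a::attr(href)').extract_first()}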
def parse(self, response):
self.logger.info('parse: {}'.format(response))
is_no_update = False
# Collect list of news from current page
articles_grid = response.css('li:not(.last) > div.grid')
articles = zip(articles_grid, [NEWS_GRID] * len(articles_grid))
articles += zip(response.css('div.topic'), [NEWS_HEADLINE])
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = None
if article[1] == NEWS_GRID:
url_selectors = article[0].css('h2 > a::attr(href)')
elif article[1] == NEWS_HEADLINE:
url_selectors = article[0].css('h1 > a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
self.logger.info('Url: {}'.format(url))
# Example: Minggu, 09 Oct 2016 15:14
info_selectors = article[0].css('div.reg::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
info = info_selectors.extract()[1]
# Example: 09 Oct 2016 15:14
info_time = info.split(',')[1].strip()
# Parse date information
try:
published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
except ValueError as e:
raise CloseSpider('cannot_parse_date: {}'.format(e))
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# Collect news on next page
if response.css('div.bu.fr > a'):
next_page = response.css('div.bu.fr > a[rel="next"]::attr(href)').extract()[0]
next_page_url = response.urljoin(next_page)
yield Request(next_page_url, callback=self.parse)
# Collect news item
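
wib_to_utc() is called in every snippet on this page but never defined in these excerpts. Since WIB (Western Indonesia Time) is UTC+7, a minimal sketch of such a helper could be:

# Assumed sketch of the wib_to_utc helper used throughout these spiders.
# WIB (Western Indonesia Time) is UTC+7, so converting a naive local
# datetime to UTC simply subtracts seven hours.
from datetime import timedelta

def wib_to_utc(wib_datetime):
    return wib_datetime - timedelta(hours=7)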
def parse(self, response):
self.logger.info('parse: {}'.format(response))
is_no_update = False
# Collect list of news from current page
article_selectors = response.css('ul.indexlist > li')
if not article_selectors:
raise CloseSpider('article_selectors not found')
for article in article_selectors:
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example: 7 Oktober 2016 19:37
info_selectors = article.css('div.upperdeck::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
info = info_selectors.extract()[1]
info = info.split(',')[1].replace('\t','').strip()
# Example: 7 October 2016 19:37
info_time = info.split(' ')
info_time = ' '.join([_(s) for s in info_time])
# Parse date information
try:
published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
except ValueError as err:
raise CloseSpider('cannot_parse_date: {}'.format(err))
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy Request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# Collect news on next page
tag_selectors = response.css('div.pagination > a')
if not tag_selectors:
raise CloseSpider('tag_selectors not found')
for tag in tag_selectors:
more_selectors = tag.css('a::text')
if not more_selectors:
raise CloseSpider('more_selectors not found')
more = more_selectors.extract()[0]
if more == 'NEXT':
next_page = tag.css('a::attr(href)').extract()[0]
next_page_url = response.urljoin(next_page)
yield Request(next_page_url, callback=self.parse)
# Collect news item
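
The _() helper used to build info_time above (and in several later snippets) is not included in these excerpts. Given that its output is fed to strptime with English '%B'/'%b' month directives, it presumably translates Indonesian month names to English; a sketch under that assumption:

# Assumed sketch of the _() helper: map Indonesian month names to English,
# passing every other token (day numbers, times, day names) through unchanged.
INDO_TO_ENG_MONTH = {
    'Januari': 'January', 'Februari': 'February', 'Maret': 'March',
    'April': 'April', 'Mei': 'May', 'Juni': 'June', 'Juli': 'July',
    'Agustus': 'August', 'September': 'September', 'Oktober': 'October',
    'November': 'November', 'Desember': 'December',
    # abbreviated forms, for sources that print '13 Okt 2016'-style dates
    'Okt': 'Oct', 'Agu': 'Aug', 'Des': 'Dec',
}

def _(token):
    return INDO_TO_ENG_MONTH.get(token, token)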
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('div.view-front > div.view-content > div.views-row')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('span.field-content a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example '19 Oct 2016'
info_selectors = article.css('span.field-content::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
info_time = info_selectors.extract()[1].strip()
# Parse date information
try:
published_at_wib = datetime.strptime(info_time, '%d %b %Y')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request('http://www.qureta.com' + url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
if response.css('li.next'):
next_page_url = response.css('li.next > a::attr(href)')[0].extract()
yield Request('http://www.qureta.com' + next_page_url, callback=self.parse)
# Collect news item
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('li.media')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example '02 November 2016'
date_selectors = article.css('time::text')
if not date_selectors:
raise CloseSpider('date_selectors not found')
# Parse date information
try:
date = date_selectors.extract()[0].split(' ')
# Sanitize month - Indo month to Eng month
# Example: 02 Nov 2016
date[1] = sanitize(date[1])
published_at_wib = datetime.strptime(' '.join(date),
'%d %b %Y')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request('http:' + url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# try getting next page
try:
next_page_url = response.xpath(
'//section[@class="pagination-numeric"]/span/a/@href')[-1].extract()
if next_page_url and next_page_url != response.url:
yield Request(next_page_url, callback=self.parse)
except:
pass
# Collect news item
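
sanitize() is another helper that is not shown; the inline comments ("Sanitize month - Indo month to Eng month") and the '%b' directive it feeds suggest it converts a full Indonesian (or English) month name into the English three-letter abbreviation. A possible sketch:

# Assumed sketch of sanitize(): full month name -> English abbreviation
# accepted by strptime's '%b' directive; unknown tokens pass through.
# (English full names that differ from their Indonesian spelling could be
# added to the map in the same way.)
FULL_TO_ABBR_MONTH = {
    'Januari': 'Jan', 'Februari': 'Feb', 'Maret': 'Mar', 'April': 'Apr',
    'Mei': 'May', 'Juni': 'Jun', 'Juli': 'Jul', 'Agustus': 'Aug',
    'September': 'Sep', 'Oktober': 'Oct', 'November': 'Nov', 'Desember': 'Dec',
}

def sanitize(month):
    return FULL_TO_ABBR_MONTH.get(month, month)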
def parse_news_metro(self, response):
loader = ItemLoader(item=News(), response=response)
loader.add_value('url', response.url)
date_selector = response.css('.artikel > div.block-tanggal::text')
if not date_selector:
return self.parse_news_pilkada(loader, response)
try:
date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
except Exception:
return loader.load_item()
published_at = wib_to_utc(published_at_wib)
if (self.media['last_scraped_at'] >= published_at):
is_no_update = True
self.logger.info('Media have no update')
raise CloseSpider('finished')
loader.add_value('published_at', published_at)
title_selector = response.css('.artikel > h1::text')
if not title_selector:
return loader.load_item()
loader.add_value('title', title_selector.extract()[0])
# Select all p which don't have iframe inside it
raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
if not raw_content_selector:
return loader.load_item()
raw_content = ''
for rsl in raw_content_selector:
raw_content = raw_content + rsl.extract().strip()
# Go to next page while there is next page button
next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
if next_page_selector:
return Request(next_page_selector.extract()[0], callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))
loader.add_value('raw_content', raw_content)
# The author usually put inside <strong> tag, however, some news is not using <strong> tag.
# NOTE: this block of code may need revision in the future
author_name = ''
for author_name_selector in reversed(raw_content_selector):
author_name_selector = author_name_selector.css('strong::text')
for tmp in reversed(author_name_selector.extract()):
tmp = tmp.strip()
if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
author_name = tmp
break
if author_name:
break
author_name = ','.join(author_name.split(' | '))
loader.add_value('author_name', author_name)
return loader.load_item()
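
parse_next_page_metro(), invoked through the lambda callback above, is not included in this excerpt. Based purely on that call site, it presumably keeps accumulating paragraph text from the follow-up pages and finally emits the loaded item; a rough sketch under those assumptions:

# Rough sketch of parse_next_page_metro, inferred only from the call site above;
# the real implementation is not part of this excerpt.
def parse_next_page_metro(self, response, loader, raw_content):
    for paragraph in response.xpath('//div[@class="artikel"]//p[not(iframe)]'):
        raw_content = raw_content + paragraph.extract().strip()
    # Keep following the "next" button while it exists.
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content:
                           self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)
    return loader.load_item()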
def parse(self, response):
self.logger.info('parse: {}'.format(response))
is_no_update = False
for article in response.css('li > div.breaking-title'):
# http://metro.sindonews.com/read/1146316/171/penyidik-bareskrim-mulai-dalami-video-dugaan-penistaan-agama-1476179831
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example 'Kamis, 13 Oktober 2016 - 11:18 WIB'
date_time_str_selectors = article.css('p::text')
if not date_time_str_selectors:
raise CloseSpider('date_time_str_selectors not found')
date_time_str = date_time_str_selectors.extract()[0]
# Parse date information
# Example '13 Oktober 2016 - 11:18'
date_time_str = date_time_str.split(',')[1].strip()[:-4]
date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
try:
published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
except Exception as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
for next_button in response.css('.mpaging > ul > li'):
if len(next_button.css('a:not(.active) > .fa-angle-right')) > 0:
next_page = next_button.css('a::attr(href)').extract()[0]
next_page_url = response.urljoin(next_page)
yield Request(next_page_url, callback=self.parse)
break
# Collect news item
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('article')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example: 'Monday, 24/11/2016 | 13:54'
date_selectors = article.css('time::text')
if not date_selectors:
raise CloseSpider('date_selectors not found')
# Parse date information
try:
date = date_selectors.extract()[0].split(' ')
published_at_wib = datetime.strptime(' '.join(date[1:]), '%d/%m/%Y | %H:%M')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# try getting next page
try:
next_page_url = response.css('div.pagination > a.next::attr(href)').extract()[0]
if next_page_url:
yield Request(next_page_url, callback=self.parse)
except:
pass
# Collect news item
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('div.wp-terhangat > div.item3')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example 'Wednesday, 02 November 2016'
date_selectors = article.css('span.date::text')
if not date_selectors:
raise CloseSpider('date_selectors not found')
# Parse date information
try:
date = date_selectors.extract()[0].split(' ')
# Sanitize month - Indo month to Eng month
# Example: Wednesday, 02 Nov 2016
date[2] = sanitize(date[2])
published_at_wib = datetime.strptime(' '.join(date[1:]),
'%d %b %Y')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
# if it's news from before 2015, drop them
if self.media['last_scraped_at'] >= published_at or int(date[-1]) < 2015:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# try getting next page
try:
next_page_url = response.css('nav > ul > li > a::attr(href)').extract()[-1]
if next_page_url:
yield Request(next_page_url, callback=self.parse)
except:
pass
# Collect news item
def parse(self, response):
is_no_update = False
news_selector = response.css("ul.clearfix > li > div.tleft")
if not news_selector:
raise CloseSpider('news_selectors not found')
for news in news_selector:
url_selectors = news.css("div.tleft > h3 > a::attr(href)")
if not url_selectors:
raise CloseSpider('url_selectors not found')
# http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat.peraga.kampanye.yang.boleh.dibuat.cagub-cawagub.dki
# http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.2016.10.15.07300081&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1
url = url_selectors.extract()[0]
url = 'http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.' + '.'.join(url.split('/')[-5:-1]) + '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1'
date_selectors = news.css("div.grey.small::text")
if not date_selectors:
raise CloseSpider('date_selectors not found')
raw_date = date_selectors.extract()[0]
# Parse date information
try:
published_at = self.convert_date(raw_date)
except Exception as e:
raise CloseSpider('cannot_parse_date: %s' % e)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url=url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# For kompas case, we don't rely on the pagination
# Their pagination is max 17 pages, the truth is they have 25 pages
if self.first_time:
template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
for i in xrange(25):
page = i + 1
next_url = template_url.format(page)
yield Request(next_url, callback=self.parse)
self.first_time = False
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('div.article-snippet__info')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('a::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
info_selectors = article.css('div.article-snippet__date')
info_selectors = info_selectors.css('.timeago::text')
if not info_selectors:
raise CloseSpider('info_selectors not found')
# Example '13 Okt 2016 16:10'
info_time = info_selectors.extract()[0]
# Example '13 Oct 2016 16:10'
info_time = ' '.join([_(w) for w in info_time.split(' ')])
# Parse date information
try:
published_at_wib = datetime.strptime(info_time,
'%d %b %Y %H:%M')
except ValueError as e:
raise CloseSpider('cannot_parse_date: {}'.format(e))
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy Request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# TODO: Collect news item
def parse(self, response):
self.logger.info('parse: %s' % response)
is_no_update = False
# Get list of news from the current page
articles = response.css('article > div > div.post-content')
if not articles:
raise CloseSpider('article not found')
for article in articles:
# Close the spider if we don't find the list of urls
url_selectors = article.css('a.timestamp-link::attr(href)')
if not url_selectors:
raise CloseSpider('url_selectors not found')
url = url_selectors.extract()[0]
# Example 'Sabtu, November 19, 2016'
date_selectors = article.css('a.timestamp-link > abbr::text')
if not date_selectors:
raise CloseSpider('date_selectors not found')
# Parse date information
try:
date = date_selectors.extract()[0].split(' ')
# Sanitize month - Indo month to Eng month
# Example: Nov 19 2016
date[1] = sanitize(date[1])
published_at_wib = datetime.strptime(' '.join(date[1:]),
'%b %d, %Y')
except ValueError as e:
raise CloseSpider('cannot_parse_date: %s' % e)
published_at = wib_to_utc(published_at_wib)
if self.media['last_scraped_at'] >= published_at:
is_no_update = True
break
# For each url we create new scrapy request
yield Request(url, callback=self.parse_news)
if is_no_update:
self.logger.info('Media have no update')
return
# try getting next page
if len(articles) > 0:
try:
yield Request('http://www.nusanews.co/search/label/Pilkada?updated-max=' +
str(published_at_wib).replace(' ','T') + '%2B07:00&max-results=20', callback=self.parse)
except Exception as e:
pass
# Collect news item
def parseBegin(self, response):
if response.status == 503:
raise CloseSpider("denied by remote server")
sel = Selector(response)
appends = response.meta['appends']
cityName = appends['city']
category = appends['cat']
locations = self.getLocations(response.body)
if locations == []:
# self.logger.error("location is []: %s\t%s", response.url, str(cityName))
return
div_a = sel.xpath('//li[@class="regular-search-result"]/div/div[@class="biz-listing-large"]')
for ii, div in enumerate(div_a):
# pdb.set_trace()
main = div.xpath('./div[1]/div/div[2]/h3/span/a[@class="biz-name"]')
item = FoodItem()
url = main.xpath('./@href').extract()
item['url'] = response.urljoin(url[0])
item['name'] = main.xpath('./span/text()').extract()[0]
# pdb.set_trace()
second = div.xpath('./div[2]')
address = second.xpath('./address').extract()
region = second.xpath('./span[@class="neighborhood-str-list"]/text()').extract()
if address:
item['address'] = self.filtertags(address[0])
else:
item['address'] = ""
if region:
item['region'] = (region[0]).strip()
else:
item['region'] = ""
item['city'] = cityName.strip()
item['category'] = category
item['location'] = eval(locations[ii])
yield item
time.sleep(1.0)
nextPage = sel.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href').extract()
if nextPage:
nextLink = response.urljoin(nextPage[0])
yield Request(url=nextLink, callback=self.parseBegin, meta={'appends':appends}, dont_filter=True)
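
The filtertags() helper used on the <address> markup above is also not shown. A minimal sketch, assuming it simply strips HTML tags and collapses whitespace:

# Assumed sketch of the filtertags helper: strip HTML tags from a markup
# fragment (e.g. the <address> block above) and normalise the whitespace.
import re

def filtertags(self, fragment):
    text = re.sub(r'<[^>]+>', ' ', fragment)
    return re.sub(r'\s+', ' ', text).strip()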