# Requires: from datetime import datetime
#           from scrapy.exceptions import CloseSpider
def parse_news_pilkada(self, loader, response):
    date_selector = response.css('.block-judul-artikel > .tanggal::text')
    try:
        # The raw date looks something like "Senin, 25 Juli 2016 | 18:30 WIB":
        # keep the part after the comma and drop the trailing " WIB" suffix.
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        # `_` presumably translates Indonesian day/month names to English
        # so that strptime can parse them.
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        # If the date cannot be parsed, return whatever has been loaded so far.
        return loader.load_item()

    # Convert from WIB (UTC+7) to UTC.
    published_at = wib_to_utc(published_at_wib)
    if self.media['last_scraped_at'] >= published_at:
        # Nothing newer than the last scrape, so stop the whole crawl.
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)
    title_selector = response.css('.block-judul-artikel > .judul-artikel')
    loader.add_value('title', title_selector.extract()[0])

    # Concatenate every <p> that does not contain an <iframe>. Because the
    # XPath starts with '//', it is evaluated from the document root rather
    # than relative to the .p-artikel selection.
    raw_content_selector = response.css('.block-artikel .p-artikel')
    raw_content_selector = raw_content_selector.xpath('//p[not(iframe)]')
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()
    loader.add_value('raw_content', raw_content)
    # The author credit is usually an all-caps <strong> line near the end of
    # the article, so scan the extracted paragraphs from the bottom up.
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            # Accept only text made of upper-case letters, whitespace, '.' and '|'.
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    # Turn a "NAME | NAME" credit into a comma-separated list of names.
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    loader.add_value('url', response.url)
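The method leans on two helpers that are not shown in this snippet: `wib_to_utc`, which shifts the parsed WIB timestamp to UTC, and `_`, which appears to translate Indonesian date words into English before `strptime` runs. Below is a minimal sketch of what they could look like, assuming WIB is UTC+7 and a plain dictionary lookup for month names; the real project may implement them differently.

from datetime import timedelta

# Hypothetical Indonesian-to-English month map; the real `_` helper may also
# handle day names or abbreviations.
_ID_TO_EN = {
    'Januari': 'January', 'Februari': 'February', 'Maret': 'March',
    'April': 'April', 'Mei': 'May', 'Juni': 'June', 'Juli': 'July',
    'Agustus': 'August', 'September': 'September', 'Oktober': 'October',
    'November': 'November', 'Desember': 'December',
}

def _(word):
    # Return the English month name if known, otherwise pass the word through
    # unchanged (day numbers, years, '|', and the time component).
    return _ID_TO_EN.get(word, word)

def wib_to_utc(dt):
    # WIB (Waktu Indonesia Barat) is UTC+7, so subtract seven hours.
    return dt - timedelta(hours=7)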
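As a worked example of the date handling above, using the sketched helpers (the sample string is an assumption based on the parsing logic, not a value taken from the site):

from datetime import datetime

raw = 'Senin, 25 Juli 2016 | 18:30 WIB'           # assumed sample value
s = raw.split(',')[1].strip()[:-4]                # '25 Juli 2016 | 18:30'
s = ' '.join(_(w) for w in s.split(' '))          # '25 July 2016 | 18:30'
wib = datetime.strptime(s, '%d %B %Y | %H:%M')    # 2016-07-25 18:30 in WIB
utc = wib_to_utc(wib)                             # 2016-07-25 11:30 in UTC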