def parse_articles(self, response):
    """Yield the scraped article item and a follow-up request for its comments.

    Only the single opinion article whose id is ``32753320`` is processed;
    any other response produces no output.

    Args:
        response: The fetched page. Only its ``url`` and ``text`` attributes
            are read here.

    Yields:
        A dict ``{article_id: {"title", "link", "article_text"}}`` when the
        article body is found, then a ``Request`` for the article's comments
        page (callback ``self.parse_comments``) when the comments URL equals
        the expected hard-coded link.
    """
    # Raw string so ``\d`` is a regex escape rather than an invalid string
    # escape; the dots are escaped so they only match a literal ".".
    article_ptn = r"http://www\.theglobeandmail\.com/opinion/(.*?)/article(\d+)/"
    resp_url = response.url
    article_m = re.match(article_ptn, resp_url)
    if article_m is None:
        return

    article_id = article_m.group(2)
    if article_id != '32753320':
        return

    print('***URL***', resp_url)
    text = Selector(text=response.text).xpath(
        '//*[@id="content"]/div[1]/article/div/div[3]/div[2]'
    ).extract()
    if not text:
        return

    # Parse with BeautifulSoup only once we know there is something to emit.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Guard against a page without a <title> element instead of raising
    # AttributeError on ``None.string``.
    title = soup.title.string if soup.title is not None else None
    print("*****in Spider text*****", title)
    yield {article_id: {"title": title, "link": resp_url, "article_text": text}}

    comments_link = resp_url + 'comments/'
    if comments_link == 'http://www.theglobeandmail.com/opinion/a-fascists-win-americas-moral-loss/article32753320/comments/':
        yield Request(comments_link, callback=self.parse_comments)
# Comment list
# Article table of contents