def parse(self, response):
    # print(response.url.split('/'))
    # sel = HtmlXPathSelector(response)
    # Each quote on the page lives in a <div class="quote"> block.
    content = response.xpath('//div[@class="quote"]')
    for x in content:
        word = x.xpath('.//span[@class="text"]/text()').extract_first()
        print(word)
        yield {'text': word}
    # Follow the "next" pagination link, if there is one.
    nextPage = response.css('li.next a::attr(href)').extract_first()
    if nextPage is not None:
        goNext = response.urljoin(nextPage)
        print("Go next: ", goNext)
        yield scrapy.Request(url=goNext, callback=self.parse)
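For comparison, a minimal sketch of the same crawl on newer Scrapy (1.4+ for response.follow, 1.8+ for .get()), where response.follow resolves the relative href itself; the spider name and start URL are assumptions added only to make the sketch self-contained:

import scrapy

class QuotesSpider(scrapy.Spider):
    # Hypothetical name and start URL, for illustration only.
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {'text': quote.xpath('.//span[@class="text"]/text()').get()}
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            # response.follow accepts the relative href directly.
            yield response.follow(next_page, callback=self.parse)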
def parse(self, response):
    """
    Default parse method; the crawl rule is not used for now.
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        # Directory-style pages: save the raw page and follow outgoing links.
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        # Profile pages: extract the person profile and use the
        # LinkedIn id taken from the URL as the document _id.
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
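A note on the logging call: log.msg comes from the old scrapy.log module, which was deprecated in Scrapy 1.0 and later removed. On current versions the equivalent is the per-spider logger, e.g.:

self.logger.info("Parse: index level: %s", index_level)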
def show(self, response):
    # print(response)
    hxs = HtmlXPathSelector(response)
    news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
    for new in news_list:
        link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
        # POST a vote for every item on the page, reusing the login cookies.
        yield Request(
            url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
            method='POST',
            cookies=self.cookie_dict,
            callback=self.do_favor,
        )
    page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
    import hashlib  # hoisted out of the loop; ideally a module-level import
    for page in page_list:
        page_url = 'http://dig.chouti.com%s' % page
        # Deduplicate pagination URLs by their md5 digest.
        key = hashlib.md5(bytes(page_url, encoding='utf-8')).hexdigest()
        if key in self.has_request_set:
            continue
        self.has_request_set[key] = page_url
        yield Request(
            url=page_url,
            method='GET',
            callback=self.show,
        )
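Scrapy already deduplicates scheduled requests by default (via its RFPDupeFilter), so manual bookkeeping like this is only needed when that filter is bypassed. A minimal sketch of the same keying logic as a standalone helper; the function name url_key is an assumption:

import hashlib

def url_key(page_url):
    # Stable dedup key: hex md5 digest of the UTF-8 encoded URL.
    return hashlib.md5(page_url.encode('utf-8')).hexdigest()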
def parse_responsepage(self, response):
    hxs = HtmlXPathSelector(response)
    yum = hxs.select('//span')
    print(yum)
def parse(self, response):
    url = response.url
    # url = 'https://data.btcchina.com/data/ticker?market=all'
    # hxs = HtmlXPathSelector(response)  # not needed: this endpoint returns JSON
    data = json.loads(response.body_as_unicode())
    item = BTC.items.BtcItem()
    item['time'] = self._get_sys_time()
    item['now'] = data['ticker_btccny']['buy']
    item['height'] = data['ticker_btccny']['high']
    item['low'] = data['ticker_btccny']['low']
    yield item
    # Poll the ticker again; note the default dupefilter will drop this
    # repeat request unless dont_filter=True is set.
    yield Request(url)
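On newer Scrapy (2.2+), body_as_unicode() is gone and a JSON body can be decoded with response.json() instead; a minimal sketch of the equivalent calls:

data = response.json()  # Scrapy >= 2.2; same as json.loads(response.text)
buy_price = data['ticker_btccny']['buy']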
def parse(self, response):
    self.log("fetch group home page: %s" % response.url)
    hxs = HtmlXPathSelector(response)
    item = douban_group.items.DoubanGroupItem()
    # Group title, with surrounding whitespace stripped by the regex.
    item['groupName'] = hxs.select('//*[@id="group-info"]/h1/text()').re('^\s+(.*)\s+$')[0]
    item['groupUrl'] = response.url
    group_id = self.__get_id_from_group_url(response.url)
    member_url = 'https://www.douban.com/group/%s/members' % group_id
    member_text = hxs.select('//a[contains(@href,"%s")]/text()' % member_url).re('(\d+)')
    item['totalNumber'] = member_text[0]
    # Follow every related group listed on the page.
    groups = hxs.select('//div[contains(@class,"group-list-item")]')
    for group in groups:
        url = group.select('div[contains(@class,"title")]/a/@href').extract()[0]
        yield Request(url)
        time.sleep(1)  # note: this blocks Scrapy's event loop
    yield item
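Since time.sleep() stalls the whole Twisted reactor, throttling is normally configured in the project settings instead; a minimal sketch of the equivalent setting:

# settings.py
DOWNLOAD_DELAY = 1  # pause ~1 second between consecutive requests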
def parse_blog(self, response):
    print('parsed link %s' % response.url)
    hxs = HtmlXPathSelector(response)
    item = HackerWayItem()
    item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
    item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
    # Note: <meta> tags carry their value in @content, not in text().
    item['tag'] = hxs.select("//meta[@property='og:title']/@content").extract()  # XPath selector for the tag
    item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
    return item  # Return the populated item.
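extract() always returns a list, even for a single match, so fields like title end up as one-element lists. On current Scrapy/parsel the usual idiom for a single value is .get() (or the older .extract_first()):

item['title'] = response.xpath('//title/text()').get()  # first match, or None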
def parse_item(self, response):
    """Parse a response into a DocumentItem."""
    doc_loader = ItemLoader(item=DocumentItem(), response=response)
    doc_loader.add_value('url', response.url)
    doc_loader.add_xpath('meta', '//meta[@name=\'description\']/@content')
    doc_loader.add_value('domain', urlparse(response.url).hostname)
    doc_loader.add_xpath('title', '//title/text()')
    hxs = HtmlXPathSelector(response)  # For HTML extractions
    # Collect every link on the page as a {'link': ..., 'link_name': ...} dict.
    links = []
    a_links = hxs.xpath('//a')
    for link in a_links:
        link_obj = {}
        # The link's URL.
        link_str = " ".join(link.xpath('@href').extract())
        link_obj['link'] = link_str.replace("\n", "")
        # The link's anchor text, with newlines and edge whitespace removed.
        link_name_str = " ".join(link.xpath('text()').extract())
        link_name_str = link_name_str.replace("\n", "").strip()
        link_obj['link_name'] = link_name_str
        links.append(link_obj)
    doc_loader.add_value('links', links)
    # Populate the text fields from the title plus the stripped body.
    title_list = hxs.xpath('//title/text()').extract()
    title = ' '.join(title_list)
    body_text = self.html2string(response)
    text = title + " " + body_text
    doc_loader.add_value('content', text)
    doc_loader.add_value('raw_text', text)
    doc_loader.add_value('raw_title', title)
    doc_loader.add_value('raw_url', response.url)
    h1_list = hxs.xpath("//h1/text()").extract()
    doc_loader.add_value('h1', " ".join(h1_list))
    doc_loader.add_value('content_type', response.headers['Content-type'])
    doc_loader.add_value('updated_on',
                         datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"))
    item = doc_loader.load_item()
    return item
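ItemLoader accumulates values per field and applies each field's input/output processors when load_item() is called. The real DocumentItem fields live in the project's items.py; the sketch below is an assumption for illustration, showing how output processors control whether a field comes out as a scalar or a list (scrapy.loader.processors is the Scrapy 1.x/2.x location; newer code imports from itemloaders.processors):

import scrapy
from scrapy.loader.processors import TakeFirst, Join

class DocumentItem(scrapy.Item):
    # Hypothetical subset of the real fields, for illustration only.
    url = scrapy.Field(output_processor=TakeFirst())  # keep a single URL string
    title = scrapy.Field(output_processor=Join(' '))  # join repeated titles
    links = scrapy.Field()                            # default: list of values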