def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
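
# The excerpt omits its imports. A plausible header for the spider above
# (Python 2 / Scrapy 1.x era code, as the unicode.strip and ur'' literals on
# this page suggest; the items module path is a guess):
from urlparse import urljoin            # urllib.parse.urljoin on Python 3

from scrapy import Request
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose

from ..items import EolZhuanyeItem      # hypothetical project layout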
def parse_single_song(self, response):
    loader = response.meta['loader']
    selector = Selector(response)
    singer = selector.xpath('//title/text()').extract()
    loader.add_value('singer', singer)
    loader.add_value('_id', response.meta['song_id'])
    comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
    source_data, source_url = api_song_url(response.meta['song_id'])
    comment_id = generate_comment_index()['comment_index']
    loader.add_value('comment_id', comment_id)
    yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                             formdata=comment_data, callback=self.parse_comments,
                             meta={'comment_id': comment_id})
    yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                             formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
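
# The chain above ends in self.get_source_url, which the excerpt omits. Since
# the half-filled loader travels through meta, a hypothetical closing step
# (sketch only; the real callback presumably parses the API response instead)
# could look like:
def get_source_url(self, response):
    # Hypothetical sketch: recover the loader passed via meta, record the
    # resolved source URL, and emit the finished item.
    loader = response.meta['loader']
    loader.add_value('source_url', response.url)
    yield loader.load_item()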
def get_details(self, response):
    self.log('Starting the second parsing phase')
    loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)
    # Load the values obtained in the first phase
    loader.add_value('name', response.meta['name'])
    language = response.meta['language']
    loader.add_value('stable_release', response.meta['stable_version'])
    loader.add_value('release_date', response.meta['rel_date'])
    descr = response.xpath('//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]').extract_first()
    cleaned_descr = cleanhtml(descr)
    loader.add_value('description', cleaned_descr)
    license_found = False
    for row in response.xpath('//*[@id="mw-content-text"]/div/table[position()<=3]/tr'):
        header = row.xpath('./th/a/text() | ./th/text()').extract_first()
        key, value = self.get_key_value(header, row)
        if key:
            if key == 'license':  # If we find the license on the main page, we use it
                license_found = True
            loader.add_value(key, value)
    # If we did not find the license on the main page,
    # fall back to the license found on the start page.
    if not license_found:
        loader.add_value('license', response.meta['license'])
    return {
        "item": loader.load_item(),
        "language": language
        # The language is returned separately in order to manage the many-to-many relation
    }

# Given a pair (key, elem) obtained during scraping, this returns the valid pair
# (key1, value1) to add to the DB. If the key is not valid, it returns (None, None).
def parse(self, response):
    for quote in response.css(".quote"):
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css("text", ".text::text")
        loader.add_css("by", ".author::text")
        loader.add_css("tags", ".tag::text")
        yield loader.load_item()
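
# QuoteItem itself is not shown. A minimal sketch pairing the add_css() calls
# above with processors (field names come from the calls; the processors are an
# assumption, chosen so each loaded field comes out as a clean scalar):
import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst

class QuoteItem(scrapy.Item):
    text = scrapy.Field(
        input_processor=MapCompose(unicode.strip),
        output_processor=TakeFirst(),
    )
    by = scrapy.Field(output_processor=TakeFirst())
    tags = scrapy.Field(output_processor=Join(', '))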
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
def parse_item(self, response):
    loader = ItemLoader(EolZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
    yield loader.load_item()
def parse_song_list(self, response):
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        loader = ItemLoader(item=PlayListItem())
        loader.add_value('song_name', song_name_list[index])
        loader.add_value('title', title)
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': loader}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
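
# PlayListItem is not part of the excerpt. Judging from the add_value keys used
# in parse_song_list and parse_single_song, a matching declaration would look
# roughly like this (field list inferred, not authoritative):
import scrapy

class PlayListItem(scrapy.Item):
    song_name = scrapy.Field()
    title = scrapy.Field()
    singer = scrapy.Field()
    _id = scrapy.Field()
    comment_id = scrapy.Field()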
def parse_detail(self, response):
    # article_item = JobBoleArticleItem()
    # # extract each field of the article by hand
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
    # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace("·","").strip()
    # praise_nums = response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract()[0]
    # fav_nums = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract()[0]
    # match_re = re.match(r".*?(\d+).*", fav_nums)
    # if match_re:
    #     fav_nums = int(match_re.group(1))
    # else:
    #     fav_nums = 0
    # comments_nums = response.xpath('//a[@href="#article-comment"]/span/text()').extract()[0]
    # match_re = re.match(r".*?(\d+).*", comments_nums)
    # if match_re:
    #     comments_nums = int(match_re.group(1))
    # else:
    #     comments_nums = 0
    # content = response.xpath('//div[@class="entry"]').extract()[0]
    # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    # tag_list = [element for element in tag_list if not element.strip().endswith("??")]
    # tags = ",".join(tag_list)
    #
    # article_item["url_object_id"] = get_md5(response.url)
    # article_item["title"] = title
    # article_item["url"] = response.url
    # try:
    #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    # except Exception as e:
    #     create_date = datetime.datetime.now().date()
    # article_item["create_date"] = create_date
    # article_item["front_image_url"] = [front_image_url]
    # article_item["praise_nums"] = praise_nums
    # article_item["comments_nums"] = comments_nums
    # article_item["fav_nums"] = fav_nums
    # article_item["tags"] = tags
    # article_item["content"] = content

    # load the item through the ItemLoader instead
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_xpath("praise_nums", '//span[contains(@class, "vote-post-up")]/h10/text()')
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("fav_nums", '//span[contains(@class, "bookmark-btn")]/text()')
    item_loader.add_xpath("comments_nums", '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath("tags", '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath("content", '//div[@class="entry"]')
    article_item = item_loader.load_item()
    yield article_item
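
# ArticleItemLoader is a custom subclass that the excerpt does not show. The
# usual pattern, and a likely reading of this code, is to default every field's
# output processor to TakeFirst so load_item() yields scalars instead of lists:
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()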
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()')

    def parse_related():
        for e in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        category = []
        for i in [u"????", u"????", u"????"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            category.append({
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )
def parse_item(self, response):
    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')
    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'?????'),
            'environment': get_vote(u'???????'),
            'life': get_vote(u'?????'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'

        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors

        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()
    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
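
# As with the other spiders on this page, the item class is omitted. The loader
# keys used in parse_item above imply a declaration along these lines (field
# list inferred, not authoritative):
import scrapy

class ChsiDaxueItem(scrapy.Item):
    id = scrapy.Field()
    url = scrapy.Field()
    logo = scrapy.Field()
    name = scrapy.Field()
    badges = scrapy.Field()
    type = scrapy.Field()
    membership = scrapy.Field()
    province = scrapy.Field()
    address = scrapy.Field()
    phone = scrapy.Field()
    website = scrapy.Field()
    backdoor = scrapy.Field()
    votes = scrapy.Field()
    trending = scrapy.Field()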