def parse(self, response):
"""Get the pagination links and hand them off.
"""
s = Selector(response)
pagination = s.css('.pagination')
pagelinks = [response.url]
pagelinks.extend(pagination.xpath(
'//a[contains(@href, "l-remote/p-")]/@href').extract())
    # Only the first pagination link is handed off here; iterate over all of
    # pagelinks (see the commented loop) to crawl every results page.
    # for pagelink in pagelinks:
    for pagelink in pagelinks[:1]:
request = Request(
urljoin(self.root, pagelink),
callback=self.parse_jobspage,
dont_filter=True,
)
yield request
def parse_job(self, response):
"""Parse a joblink into a JobItem.
"""
s = Selector(response)
item = JobItem()
item['url'] = response.url.split('?')[0]
item['site'] = 'CareerBuilder'
item['title'] = s.css('h1::text').extract_first()
item['text'] = s.css('.job-facts::text').extract()
item['text'].extend(s.css('.item').css('.tag::text').extract())
item['text'].extend(s.css('.description::text').extract())
try:
posted = s.xpath(
'//h3[@id="job-begin-date"]/text()').extract_first()
item['date_posted'] = utilities.naturaltime(
posted.replace('Posted ', '')).isoformat()
except Exception as e:
self.logger.error(e)
yield item
def parse(self, response):
hxs = scrapy.Selector(response)
slots_tutorials = hxs.xpath('//td[@class="slot slot-tutorial"]')
    for slot in slots_tutorials:
        # Use relative XPaths ('.//') so each query is scoped to the current
        # slot instead of re-matching the whole document on every iteration.
        speakers_tutorials = slot.xpath('.//span[@class="speaker"]/text()').extract()
        urls_tutorials = slot.xpath('.//span[@class="title"]//@href').extract()
        talks_tutorials = slot.xpath('.//span[@class="title"]//a/text()').extract()
        for speaker, url, talk in zip(speakers_tutorials, urls_tutorials, talks_tutorials):
            yield Request(url=''.join(('http://www.pydata.org', url)),
                          callback=self.parse_details,
                          meta={'speaker': speaker.strip(), 'url': url,
                                'talk': talk})
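
The meta dict is how each Request carries the speaker, talk, and URL through to the callback. The original parse_details is not shown here; a minimal, hypothetical version (the CSS selector for the abstract is an assumption, not taken from the project) would read those values back from response.meta:

def parse_details(self, response):
    # Hypothetical callback: recover the values passed via the Request meta dict.
    speaker = response.meta['speaker']
    talk = response.meta['talk']
    # Combine them with whatever the detail page itself provides.
    abstract = response.css('div.abstract p::text').extract_first()  # assumed selector
    yield {'speaker': speaker, 'talk': talk,
           'url': response.meta['url'], 'abstract': abstract}
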
def parse_page(self, response):
self.write(response.body)
sel = Selector(response)
    # Skip the header row, then rebuild a selector for each table row.
    infos = sel.xpath('//tbody/tr').extract()
    for info in infos[1:]:
        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[6]/text()').extract_first()
        anonymity = val.xpath('//td[3]/text()').extract_first()
        https = val.xpath('//td[4]/text()').extract_first()
        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)
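
Proxy and add_proxy are project-specific helpers, not Scrapy APIs (note also that the extracted https value is never passed to set_value in the snippet above). A hypothetical minimal version, only to make the snippet self-contained, might look like this:

class Proxy(object):
    """Hypothetical stand-in for the project's proxy record."""

    def set_value(self, ip, port, country, anonymity, source):
        self.ip = ip
        self.port = port
        self.country = country
        self.anonymity = anonymity
        self.source = source


class BaseProxySpider(object):
    """Hypothetical base spider providing add_proxy()."""

    def __init__(self):
        self.proxies = []

    def add_proxy(self, proxy):
        # Keep only rows whose IP and port columns extracted cleanly.
        if proxy.ip and proxy.port:
            self.proxies.append(proxy)
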
def parse(self, response):
def getdomain(url):
proto, rest = urllib.splittype(url)
host, rest = urllib.splithost(rest)
return "http://"+host
sel=scrapy.Selector(response)
links_in_a_page=sel.xpath('//a[@href]')
for link_sel in links_in_a_page:
item=XinhuaItem()
        link = str(link_sel.re('href="(.*?)"')[0])
        if link:
            if not link.startswith('http'):
                # Resolve relative hrefs against the current page; plain string
                # concatenation with response.url produces malformed URLs.
                link = response.urljoin(link)
                #link=getdomain(response.url)+link
            yield scrapy.Request(link, callback=self.parse)
p1=re.compile(r'.*\d{4}-\d{2}/\d{2}.*')
if re.match(p1,link):
print ("Y: "+link)
item['link']=link
yield item
else:
print ("F: "+link)
def parse(self, response):
def getdomain(url):
proto, rest = urllib.splittype(url)
host, rest = urllib.splithost(rest)
return "http://"+host
sel=scrapy.Selector(response)
links_in_a_page = sel.xpath('//a[@href]')
for link_sel in links_in_a_page:
item=QqurlItem()
link=str(link_sel.re('href="(.*?)"')[0])
if link:
if not link.startswith('http'):
if link.startswith('javascript'):
continue
if link.startswith('//support'):
continue
link=getdomain(response.url)+link
if re.match('.*comment.*',link):
continue
yield scrapy.Request(link,callback=self.parse)
if not re.match('.*comment.*',link):
            if re.match(r'^http.*qq\.com.*\.s?html?$', link):
item['link']=link
yield item
def parse(self, response):
def getdomain(url):
#proto,rest=urllib.splittype(url)
#host,rest=urllib.splithost(rest)
return "http:"
sel = scrapy.Selector(response)
links_in_a_page=sel.xpath('//a[@href]')
for link_sel in links_in_a_page:
item=SohuItem()
link=str(link_sel.re('href="(.*?)"')[0])
if link:
if not link.startswith('http'):
link=getdomain(response.url)+link
yield scrapy.Request(link,callback=self.parse)
            p1 = re.compile(r'.*/a/.*')
            p2 = re.compile(r'.*#comment_area$')
            p3 = re.compile(r'.*news\.sohu\.com.*s?html?$')
if (re.match(p3,link) or re.match(p1,link)) and (not re.match(p2,link)):
#print ('T: '+link)
item['link']=link
yield item
else:
pass
#print ('F: '+link)
def alternative_parse_method(self, response):
# An alternative would be to build a Scrapy selector from the JS string
# and extract the data using CSS selectors
script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
sel = scrapy.Selector(root=js2xml.parse(script))
for quote in sel.css('var[name="data"] > array > object'):
yield {
'text': quote.css('property[name="text"] > string::text').extract_first(),
'author': quote.css('property[name="author"] property[name="name"] > string::text').extract_first(),
'tags': quote.css('property[name="tags"] string::text').extract(),
}
link_next = response.css('li.next a::attr("href")').extract_first()
if link_next:
yield scrapy.Request(response.urljoin(link_next))
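
The method name says "alternative" because the usual first approach for pages like this is to pull the `var data = [...]` literal out with a regular expression and decode it as JSON. A minimal sketch of that approach, assuming the embedded literal is JSON-compatible (js2xml, as above, is the safer route when it is not):

import json
import re

def parse_via_regex(self, response):
    # Grab the script that assigns the data array, then decode it as JSON.
    script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
    data = json.loads(re.search(r'var\s+data\s*=\s*(\[.*?\]);', script, re.S).group(1))
    for quote in data:
        yield {
            'text': quote['text'],
            'author': quote['author']['name'],
            'tags': quote['tags'],
        }
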
def parse(self, response):
self.driver.get(response.url)
sel = scrapy.Selector(text=self.driver.page_source)
for quote in sel.css('div.quote'):
yield {
'text': quote.css('span.text::text').extract_first(),
'author': quote.css('span small::text').extract_first(),
'tags': quote.css('div.tags a.tag::text').extract(),
}
next_page = sel.css('li.next > a::attr(href)').extract_first()
if next_page:
yield scrapy.Request(response.urljoin(next_page))
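
This spider assumes self.driver already exists. A hypothetical place to create and tear it down, wrapping the parse() method shown above (the spider name, browser, and start URL are assumptions, not taken from the original project):

import scrapy
from selenium import webdriver

class SeleniumQuotesSpider(scrapy.Spider):
    # Hypothetical skeleton around the Selenium-backed parse() method.
    name = 'quotes-selenium'
    start_urls = ['http://quotes.toscrape.com/js/']

    def __init__(self, *args, **kwargs):
        super(SeleniumQuotesSpider, self).__init__(*args, **kwargs)
        # One browser instance for the whole crawl.
        self.driver = webdriver.Firefox()

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; release the browser.
        self.driver.quit()
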
def parse(self, response):
selector = scrapy.Selector(response)
#item = CrawlmeizituItemPage()
next_pages = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
next_pages_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
all_urls = []
    if u'下一页' in next_pages_text:  # pagination shows a "next page" label
next_url = "http://www.meizitu.com/a/{}".format(next_pages[-2])
with open('..//url.txt', 'a+') as fp:
fp.write('\n')
fp.write(next_url)
fp.write("\n")
request = scrapy.http.Request(next_url, callback=self.parse)
time.sleep(2)
yield request
all_info = selector.xpath('//h3[@class="tit"]/a')
    # Follow the detail link of every gallery title on this page.
    for info in all_info:
        # Query relative to the current <a> node; an absolute '//...' XPath here
        # would re-match every title link on the page on each iteration.
        links = info.xpath('@href').extract()
for link in links:
request = scrapy.http.Request(link, callback=self.parse_item)
time.sleep(1)
yield request
    # next_link = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
    # next_link_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
    # if u'下一页' in next_link_text:
    #     nextPage = "http://www.meizitu.com/a/{}".format(next_link[-2])
    #     item['page_url'] = nextPage
    #     yield item
def parse_detail(self,response):
item = CrawldetailsItem()
sel = Selector(response)
try:
item["kd"] = response.meta['kd']
item["title"] = self.get_text(sel,'//*[@id="job_detail"]/dt/h1/@title')
item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip()
item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0]
item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0]
industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0]
item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip()
scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0]
item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip()
phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0]
item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip()
item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0]
item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0]
item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0]
item["description"] = self.get_text(sel,'//*[@id="job_detail"]/dd[2]')
item["url"] = response.url
item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8]
item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')
    except Exception as e:
        print(e)
yield item
def get_case_studies_details(response: Response):
content = response.content.decode("utf-8")
article_selector = "#company-projects > article"
articles = Selector(text=content).css(article_selector).extract()
result = []
for article in articles:
title = Selector(text=article).css("h3::text").extract()[0]
summary = Selector(text=article).css("p::text").extract()[0]
href = Selector(text=article).css("a::attr(href)").extract()[0]
slug = href.split("/")[-2]
assert slug, "Could not extract case study slug from {}".format(article)
logging.debug("Got case study slug: %s", slug)
result.append((title, summary, href, slug))
assert result, "No Case Study details extracted from {}".format(articles)
return result
def fas_get_company_profile_url(response: Response, name: str) -> str:
content = response.content.decode("utf-8")
links_to_profiles_selector = "#ed-search-list-container a"
href_selector = "a::attr(href)"
links_to_profiles = Selector(text=content).css(
links_to_profiles_selector).extract()
profile_url = None
for link in links_to_profiles:
if escape_html(name).lower() in escape_html(link).lower():
profile_url = Selector(text=link).css(href_selector).extract()[0]
with assertion_msg(
"Couldn't find link to '%s' company profile page in the response",
name):
assert profile_url
return profile_url
def fas_follow_case_study_links_to_related_sectors(context, actor_alias):
actor = context.get_actor(actor_alias)
session = actor.session
content = context.response.content.decode("utf-8")
links_css_selector = "#company-showcase .case-study-info a"
links_to_sectors = Selector(text=content).css(links_css_selector).extract()
    with assertion_msg("Expected to find at least 1 link to an Industry sector "
                       "associated with a Company Showcase Case Study"):
        assert links_to_sectors
results = {}
fas_url = get_absolute_url("ui-supplier:landing")
for link in links_to_sectors:
industry = Selector(text=link).css("a::text").extract()[0]
href = Selector(text=link).css("a::attr(href)").extract()[0]
url = urljoin(fas_url, href)
sectors = [value for _, value in parse_qsl(urlsplit(href).query)]
logging.debug(
"%s will look for Suppliers in '%s' Industry sectors '%s'",
actor_alias, industry, ", ".join(sectors)
)
response = make_request(Method.GET, url=url, session=session)
results[industry] = {
"url": url,
"sectors": sectors,
"response": response
}
context.results = results
def fas_should_see_unfiltered_search_results(context, actor_alias):
response = context.response
content = response.content.decode("utf-8")
sector_filters_selector = "#id_sectors input"
filters = Selector(text=content).css(sector_filters_selector).extract()
for fil in filters:
sector = Selector(text=fil).css("input::attr(value)").extract()[0]
selector = "input::attr(checked)"
        checked = bool(Selector(text=fil).css(selector).extract())
with assertion_msg(
"Expected search results to be unfiltered but this "
"filter was checked: '%s'", sector):
assert not checked
logging.debug("%s was shown with unfiltered search results", actor_alias)
def parse_location(self,response):
loc_hxs = scrapy.Selector(response)
loc_xs = loc_hxs.xpath('//div[@id="aside"]/script[1]').extract()[0]
    coord_text = re.findall(r'lng:\w+\.\w+,lat:\w+\.\w+', loc_xs)[0]
item = response.meta['item']
item['location'] = coord_text.encode('gbk')
return item
#print coord_text
def parse(self,response):
reload(sys)
sys.setdefaultencoding('utf8')
print '__________'
    if response.status == 403:
        # Throttled by the site: back off for 1200 seconds, then retry the same URL.
        print 'met 403, sleeping 1200 seconds'
        import time
        time.sleep(1200)
        yield Request(response.url, callback=self.parse)
    # 404: the listing page no longer exists, so yield nothing for it.
    elif response.status == 404:
        print 'met 404, skipping'
    else:
hxs = scrapy.Selector(response)
for i in range(1,31):
item = SoufangItem()
name_ = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/text()').extract()
name = ''.join(name_)
http = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/@href').extract()
href = ''.join(http)
#href = href + 'xiangqing/'
item['name'] = name.encode('gbk')
item['link'] = href.encode('gbk')
yield Request(href,callback=self.parse_detail,meta={'item':item})
print name, href
print '__________'
def parse_detail(self,response):
#print 'in'
loc_hxs = scrapy.Selector(response)
loudongzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[5]/span[2]/text()').extract()
loudongzongshu = ''.join(loudongzongshu)
fangwuzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[6]/span[2]/text()').extract()
fangwuzongshu = ''.join(fangwuzongshu)
item = response.meta['item']
item['address'] = loudongzongshu.encode('gbk')
item['zonghushu'] = fangwuzongshu.encode('gbk')
return item
def parse_detail(self,response):
loc_hxs = scrapy.Selector(response)
build_num_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[2]/text()').extract()
build_num = ''.join(build_num_)
total_households_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[4]/text()').extract()
total_households = ''.join(total_households_)
plot_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[6]/text()').extract()
plot_ratio = ''.join(plot_ratio_)
green_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[8]/text()').extract()
green_ratio = ''.join(green_ratio_)
property_fee_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[10]/text()').extract()
property_fee = ''.join(property_fee_)
item = response.meta['item']
item['build_num'] = build_num.encode('gbk')
item['total_households'] = total_households.encode('gbk')
item['plot_ratio'] = plot_ratio.encode('gbk')
item['greening_ratio'] = green_ratio.encode('gbk')
item['properity_fee'] = property_fee.encode('gbk')
return item
test_design_topic_spider.py (project: decoration-design-crawler, author: imflyn)
def test_parse_content(self):
content = requests.get('http://xiaoguotu.to8to.com/topic/11.html')
response = Response('http://xiaoguotu.to8to.com/topic/11.html')
response.text = content.content.decode("utf-8")
selector = Selector(response)
title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
article = []
text = ''
for index, item_selector in enumerate(items_selector):
try:
text = item_selector.xpath('span/text()').extract()[0]
except IndexError:
try:
img_url = item_selector.xpath('img/@src').extract()[0]
img_width = 0
try:
img_width = item_selector.xpath('img/@width').extract()[0]
except IndexError:
pass
img_height = 0
try:
img_height = item_selector.xpath('img/@height').extract()[0]
except IndexError:
pass
article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
except IndexError:
continue
design_topic_item = DesignTopicItem()
design_topic_item['title'] = title
design_topic_item['description'] = description
design_topic_item['article'] = article
design_topic_item['html_url'] = response.url
return design_topic_item
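
Assigning to response.text only works if Response here is a project-specific fake; on Scrapy's own response classes text is a read-only property. A sketch of the same test setup using Scrapy's HtmlResponse instead (an assumption about intent, not the project's actual fixture):

import requests
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

def build_offline_response(url='http://xiaoguotu.to8to.com/topic/11.html'):
    # Fetch the page once, then wrap the body in a real Scrapy response so the
    # parsing logic can be exercised in a test without the Scrapy engine.
    body = requests.get(url).content
    return HtmlResponse(url=url, body=body, encoding='utf-8')

response = build_offline_response()
selector = Selector(response)
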
def parse(self, response):
sel = scrapy.Selector(response)
#print(sel.xpath('//title').extract())
    first_link_xpath = ("//ul[@class='news-list2']/li[1]/div[@class='gzh-box2']"
                        "/div[@class='img-box']/a[1]/@href")
    first_url_list = sel.xpath(first_link_xpath).extract()
self.first_url = first_url_list[0]
print(self.first_url)
yield scrapy.Request(self.first_url,meta=self.meta, callback=self.parse_url_list)