def parse_salaries(self, response):
"""
The values about person salary is in another table
in another page, that function grab all the table headers
and values and assign to the entity[entity_id]
The id was passed in the response.meta
"""
item = VereadorItem()
item['name'] = response.meta['name']
item['id'] = response.meta['entity_id']
item['mesano'] = response.meta['mesano']
for salary in response.xpath('//*[@id="holerite"]').extract():
selector = Selector(text=salary)
table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
item["salary_gross"] = table[0]
item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
return item
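For reference, a minimal sketch of the VereadorItem this callback fills; the field names come from the assignments above, and the project's real items.py may define more:

import scrapy

class VereadorItem(scrapy.Item):
    name = scrapy.Field()
    id = scrapy.Field()
    mesano = scrapy.Field()         # month/year of the payroll query
    salary_gross = scrapy.Field()
    salary_liquid = scrapy.Field()  # net salary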
def parse(self, response):
selector = Selector(response)
articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
for article in articles:
item = Jianshu2Item()
url = article.xpath('div/h4/a/@href').extract()
likeNum = article.xpath('div/div/span[2]/text()').extract()
posturl = 'http://www.jianshu.com'+url[0]
if len(likeNum) == 0:
item['likeNum'] = 0
else:
item['likeNum'] = int(likeNum[0].split(' ')[-1])
request = Request(posturl,callback=self.parse_donate)
request.meta['item'] = item
yield request
next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
if next_link:
next_link = self.url + str(next_link)
yield Request(next_link,callback=self.parse)
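The parse_donate callback is not shown in this snippet; a minimal sketch of what it needs to do, based on how parse() hands the item over through request.meta (the donateNum field and the XPath here are illustrative assumptions, not the project's actual code):

def parse_donate(self, response):
    # pull the half-filled item back out of the request meta
    item = response.meta['item']
    # hypothetical field and selector; the real Jianshu2Item may differ
    item['donateNum'] = len(response.xpath('//div[@class="support-author"]//li'))
    yield item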
def list_parse(self, response):
selector = Selector(text=response.body)
playlists = selector.xpath("//li//a[@class='msk']/@title")
urls = selector.xpath("//a[@class='zpgi']/@href").extract()
start_url = "http://music.163.com"
for tmp_url in urls:
yield scrapy.Request(url=start_url + tmp_url, method="GET", callback=self.list_parse,
meta={"cat": response.meta['cat']})
for i, tmp in enumerate(playlists, start=1):
list_id = selector.xpath("//li[" + str(i)
+ "]//a[@class='icon-play f-fr']/@data-res-id").extract_first()
# request each playlist's detail page
yield scrapy.Request(url=start_url+"/playlist?id="+list_id, method="GET", callback=self.play_list_parse,
meta={"cat": response.meta['cat'], "id": list_id})
def parse(self, response):
selector = Selector(response)
ID = response.meta["ID"]
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
info = InfoItem()
if text0:
num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0) # number of weibo posts
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_tweets:
info["num_tweets"] = int(num_tweets[0])
if num_follows:
info["num_follows"] = int(num_follows[0])
if num_fans:
info["num_fans"] = int(num_fans[0])
url_information1 = "http://weibo.cn/%s/info" % ID
yield Request(url=url_information1, meta={"item":info,"ID":ID}, dont_filter=True, callback=self.parse1)
def parse3_fans(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
url_main = response.meta["url_main"]
ID_ = response.meta["ID"]
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.friends_id: # record each ID only once
self.friends_id.add(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], meta={"url_main":url_main,"ID":ID_}, callback=self.parse3_fans)
else:
self.fans_finish = True
if self.fans_finish and self.follows_finish:
yield Request(url=url_main, meta={"ID":ID_}, dont_filter=True, callback=self.parse)
def parse3_follows(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
url_main = response.meta["url_main"]
ID_ = response.meta["ID"]
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.friends_id: # record each ID only once
self.friends_id.add(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], meta={"url_main":url_main,"ID":ID_}, callback=self.parse3_follows)
else:
self.follows_finish = True
if self.fans_finish and self.follows_finish:
yield Request(url=url_main, meta={"ID":ID_}, dont_filter=True, callback=self.parse)
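parse3_fans and parse3_follows above differ only in which flag they set and which callback they re-schedule, so they could share one body. A sketch of that refactor; it assumes the friends_id set and the fans_finish/follows_finish flags used above, and threads the list kind through the pagination callback with functools.partial:

import re
from functools import partial
from scrapy import Request
from scrapy.selector import Selector

def _parse3_list(self, response, kind):
    """Shared body for the fans/follows ID collectors; kind is 'fans' or 'follows'."""
    selector = Selector(response)
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for href in selector.xpath('body//table/tr/td/a/@href').extract():
        uid = re.findall(r'uid=(\d+)', href)
        if uid:
            ID = int(uid[0])
            if ID not in self.friends_id:  # record each ID only once
                self.friends_id.add(ID)
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:  # more pages of this list remain
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_},
                      callback=partial(self._parse3_list, kind=kind))
    else:  # this list is exhausted; flip its flag
        setattr(self, kind + "_finish", True)
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_}, dont_filter=True, callback=self.parse)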
def parse_single_song(self, response):
loader = response.meta['loader']
selector = Selector(response)
singer = selector.xpath('//title/text()').extract()
loader.add_value('singer', singer)
loader.add_value('_id', response.meta['song_id'])
comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
source_data, source_url = api_song_url(response.meta['song_id'])
comment_id = generate_comment_index()['comment_index']
loader.add_value('comment_id', comment_id)
yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
formdata=comment_data, callback=self.parse_comments,
meta={'comment_id': comment_id})
yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
def parse_entities(self, response):
"""
A table is displayed with data about each person
who works at the Câmara
"""
mesano = response.meta['mesano']
self.log('Getting mesano: ' + mesano)
# Check if the table is empty
if not response.css('table tr td:nth-child(1)').extract_first():
return self.log('Nenhum dado disponível')
for tr in response.xpath('//table/tr').extract():
selector = Selector(text=tr)
entity_id = re.search("(javascript:pesquisa\()(\d*)(\);)", tr).group(2)
request = scrapy.FormRequest(
url=BASE_URL + 'holerite/consulta_beneficiario.html',
formdata={
'hol_ben_id': entity_id,
'hol_mesano': mesano,
'hol_tipo': '1',
'hol_grupo': GRUPO,
'acao':''
},
callback=self.parse_salaries
)
request.meta['name'] = selector.xpath("//tr/td/text()").extract_first()
request.meta['entity_id'] = entity_id
request.meta['mesano'] = mesano
yield request
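The entity_id comes from the inline javascript handler in each table row; a quick standalone check of that regex (the sample td string is made up):

import re

row = '<td onclick="javascript:pesquisa(4217);">FULANO DE TAL</td>'  # hypothetical row markup
print(re.search(r"(javascript:pesquisa\()(\d*)(\);)", row).group(2))  # -> 4217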
def parse_item(self, response):
selector = Selector(response).xpath('//p[@align="center"]')
for sel in selector:
image_urls = sel.xpath('a/img/@src').extract()
path = []
for img in image_urls:
path.append(urlparse.urlparse(img).path)
item = SisyItem()
item['image_urls'] = image_urls
item['images'] = path
yield item
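The image_urls / images field pair matches what Scrapy's built-in ImagesPipeline expects, so actually downloading the pictures is only a settings change (the store path below is an example). Note the pipeline overwrites images with its own download results, so pre-filling it with URL paths as above becomes redundant once the pipeline is enabled:

# settings.py (Scrapy >= 1.0 import path)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/tmp/sisy_images'  # example path; point this wherever images should land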
def parse(self, response):
selector = Selector(response)
books = selector.xpath('//div[@class="info"]/h2/a/@href').extract()
for book in books:
print book
yield Request(book, callback=self.parse_item)
nextPage = selector.xpath('//span[@class="next"]/a/@href').extract()
if nextPage:
print nextPage[0]
yield Request(self.url+nextPage[0],callback=self.parse)
def parse(self, response):
# print response.body
value = shenZhouCarsItem()
item = fieldsItem()
selector = Selector(response)
cars = selector.xpath('//ul[@class="carInfor-xj clearfix"]')
for index in range(0, len(cars), 2):
basic = cars[index]
specific = cars[index+1]
item['car_brand'] = basic.xpath('li[1]/span[1]/text()').re(r'\s+(.*)')[0]
item['car_series'] = basic.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
item['car_issue_date'] = basic.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
item['car_config_model'] = basic.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
item['car_seats_num'] = specific.xpath('li[1]/span/text()').re(r'\s+(.*)')[0]
item['car_doors'] = specific.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
item['car_fuel_type'] = specific.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
item['car_gearbox_type'] = specific.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
item['car_displacement'] = specific.xpath('li[5]/span/text()').extract()[0]
item['car_fuel_num'] = specific.xpath('li[6]/span/text()').re(r'\s+(.*)')[0]
item['car_drive_way'] = specific.xpath('li[7]/span/text()').re(r'\s+(.*)')[0]
item['car_engine_intake'] = specific.xpath('li[8]/span/text()').re(r'\s+(.*)')[0]
item['car_skylight'] = specific.xpath('li[9]/span/text()').re(r'\s+(.*)')[0]
item['car_tank_capa'] = specific.xpath('li[10]/span/text()').re(r'\s+(.*)')[0]
item['car_voicebox'] = specific.xpath('li[11]/span/text()').re(r'^\s+(\w*)')[0]
item['car_seats_type'] = specific.xpath('li[12]/span/text()').re(r'\s+(.*)')[0]
item['car_reverse_radar'] = specific.xpath('li[13]/span/text()').re(r'\s+(.*)')[0]
item['car_airbag'] = specific.xpath('li[14]/span/text()').re(r'\s+(\w*)')[0]
item['car_dvd'] = specific.xpath('li[15]/span/text()').re(r'\s+(.*)')[0]
item['car_gps'] = specific.xpath('li[16]/span/text()').re(r'\s+(.*)')[0]
if item['car_airbag'] == u'6510':
item['car_airbag'] = "0"
value['model'] = 'RentMe.model_info'
value['pk'] = item['car_brand']+item['car_series']+item['car_issue_date']+item['car_config_model']
value['fields'] = {
'car_brand': item['car_brand'], 'car_series': item['car_series'],
'car_issue_date': item['car_issue_date'], 'car_config_model': item['car_config_model'],
'car_seats_num': item['car_seats_num'], 'car_doors': item['car_doors'],
'car_fuel_type': item['car_fuel_type'], 'car_gearbox_type': item['car_gearbox_type'],
'car_displacement': item['car_displacement'], 'car_fuel_num': item['car_fuel_num'],
'car_drive_way': item['car_drive_way'], 'car_engine_intake': item['car_engine_intake'],
'car_skylight': item['car_skylight'], 'car_tank_capa': item['car_tank_capa'],
'car_voicebox': item['car_voicebox'], 'car_seats_type': item['car_seats_type'],
'car_reverse_radar': item['car_reverse_radar'], 'car_airbag': item['car_airbag'],
'car_dvd': item['car_dvd'], 'car_gps': item['car_gps'],
'car_deposit': 5000, 'car_day_price': 100,
'car_time_out_price': 150, 'car_over_kilo_price': 0.5}
yield value
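The sixteen near-identical lookups against `specific` could also be table-driven. A sketch of a drop-in helper with the same field order and regex (car_displacement, car_voicebox and car_airbag use slightly different extraction in the original, which this glosses over):

def fill_specific_fields(specific, item):
    """Fill the 16 spec fields from the li list inside one carInfor-xj block."""
    fields = ['car_seats_num', 'car_doors', 'car_fuel_type', 'car_gearbox_type',
              'car_displacement', 'car_fuel_num', 'car_drive_way', 'car_engine_intake',
              'car_skylight', 'car_tank_capa', 'car_voicebox', 'car_seats_type',
              'car_reverse_radar', 'car_airbag', 'car_dvd', 'car_gps']
    for pos, field in enumerate(fields, start=1):
        # li[pos] holds the value for this field on the page
        values = specific.xpath('li[%d]/span/text()' % pos).re(r'\s+(.*)')
        if values:
            item[field] = values[0]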
design_picture_spider.py, from project decoration-design-crawler (author: imflyn)
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@class="xmp_container"]//div[@class="item"]')
for item_selector in items_selector:
# http://xiaoguotu.to8to.com/c10037052.html
cid = item_selector.xpath('div//a/@href').extract()[0][2:-6]
title = item_selector.xpath('div//a/@title').extract()[0]
# http://xiaoguotu.to8to.com/getxgtjson.php?a2=0&a12=&a11=10037052&a1=0
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/getxgtjson.php?a2=0&a12=&a11={cid}&a1=0').format(cid=cid)
yield scrapy.Request(next_url, self.parse_content, meta={'cid': cid, 'title': title})
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@class="xgt_topic"]')
for item_selector in items_selector:
# /topic/7334.html
href = item_selector.xpath('div//a/@href').extract()[0]
href = href.strip()
# http://xiaoguotu.to8to.com/topic/7334.html
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + href)
if self.design_topic_service.is_duplicate_url(next_url):
continue
yield scrapy.Request(next_url, self.parse_content)
def parse_content(self, response):
selector = Selector(response)
title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
article = []
text = ''
# text paragraphs update `text`; an image paragraph pairs the image with the most recent text
for index, item_selector in enumerate(items_selector):
try:
text = item_selector.xpath('span/text()').extract()[0]
except IndexError:
try:
img_url = item_selector.xpath('img/@src').extract()[0]
img_width = 0
try:
img_width = item_selector.xpath('img/@width').extract()[0]
except IndexError:
pass
img_height = 0
try:
img_height = item_selector.xpath('img/@height').extract()[0]
except IndexError:
pass
article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
except IndexError:
continue
design_topic_item = DesignTopicItem()
design_topic_item['title'] = title
design_topic_item['description'] = description
design_topic_item['article'] = article
design_topic_item['html_url'] = response.url
return design_topic_item
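For completeness, a sketch of the DesignTopicItem this callback returns; the field names are taken from the assignments above:

import scrapy

class DesignTopicItem(scrapy.Item):
    title = scrapy.Field()
    description = scrapy.Field()
    article = scrapy.Field()   # list of {'content', 'img_url', 'img_width', 'img_height'} dicts
    html_url = scrapy.Field()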
design_strategy_spider.py, from project decoration-design-crawler (author: imflyn)
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]')
for item_selector in items_selector:
id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '')
# http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format(
id=id)
if self.design_strategy_service.is_duplicate_url(next_url):
log.info("================= duplicate url: " + next_url + " ===========")
continue
yield scrapy.Request(next_url, self.parse_content, meta={'id': id})
def parse(self, response):
# print(response, type(response))
# from scrapy.http.response.html import HtmlResponse
item = TopStockItem()
selector = Selector(response)
stocks = selector.xpath('//td[@class="keyword"]/a[@class="list-title"]')
for index, stock in enumerate(stocks):
item['name'] = stock.xpath('text()').extract()[0]
item['num'] = index + 1
item['source'] = "baidu"
yield item
def parse_detail(self, response):
url = urlparse.urlparse(response.url)
path = url.path.split("/")
item = PostItem()
selector = Selector(response)
item['postId'] = path[2]
item['authorId'] = path[1]
item['postDetail'] = selector.xpath('//div[@class="detail"]').extract()[0]
yield item
def play_list_parse(self, response):
start_url = "http://music.163.com"
item = playListItem()
selector = Selector(text=response.body)
item['list_play'] = int(selector.xpath("//strong[@id='play-count']/text()").extract_first(default='0'))
item['list_collection'] = int(selector.xpath("//a[@class='u-btni u-btni-fav ']/@data-count").extract_first(default='0'))
# item['list_comment'] = int(selector.xpath("//span[@id='cnt_comment_count']/text()").extract_first())
item['list_name'] = selector.xpath("//h2[@class='f-ff2 f-brk']/text()").extract_first()
item['list_id'] = response.meta['id']
item['list_tag'] = selector.xpath("//a[@class='u-tag']/i/text()").extract()
item['list_creator'] = selector.xpath("//span[@class='name']/a/text()").extract_first()
item['list_creator_id'] = selector.xpath("//span[@class='name']/a/@href").extract_first()
item['type'] = response.meta['cat']
# urls = selector.xpath("//ul[@class='f-hide']/li/a/@href").extract()
# for url in urls:
# yield scrapy.Request(url=start_url + url, method="GET", callback=self.detail_parse)
yield item
# def detail_parse(self, response):
# selector = Selector(text=response.body)
# id = selector.xpath("//div[@id='content-operation']/@data-rid").extract_first()
# detail = validate.Validate(str(id))
# info = demjson.decode(detail.get_music_json())
# if info['total'] > 10000:
# item = detailItem()
# item['music_id'] = id
# item['music_name'] = selector.xpath("//em[@class='f-ff2']/text()").extract_first()
# item['music_album'] = selector.xpath("//p[@class='des s-fc4']/a/text()").extract_first()
# item['music_artist'] = selector.xpath("//p[@class='des s-fc4']/span/@title").extract_first()
# item['music_comment_num'] = int(info['total'])
# item['music_comment'] = info['hotComments']
# yield item
def parse(self,response):
item = DoubanmovieItem()
selector = Selector(response)
movies = selector.xpath('//div[@class="info"]')
for eachmovie in movies:
title = eachmovie.xpath('div[@class="hd"]/a/span/text()').extract()
fullTitle = ''
for each in title:
fullTitle += each
movieInfo = eachmovie.xpath('div[@class="bd"]/p/text()').extract()
star = eachmovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
quote = eachmovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
if quote:
quote = quote[0]
else:
quote = ''
item['title'] = fullTitle
item['movieInfo'] = ';'.join(movieInfo)
item['star'] = star
item['quote'] = quote
yield item
nextlink = selector.xpath('//span[@class="next"]/link/@href').extract()
if nextlink:
nextlink = nextlink[0]
print nextlink
# Request: fetch the next page and parse it with this same callback
yield Request(self.url + nextlink,callback=self.parse)
def parse(self, response):
item = ZhihupythonItem()
#selector = Selector(response)
question_Field = response.xpath('//div[@class="feed-main"]')
for each in question_Field:
question = each.xpath('div[@class="content"]/h2/a/text()').extract()
print question
item['Question'] = question
yield item
def parse3(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.finish_ID: # queue only IDs that have not been crawled yet
self.scrawl_ID.append(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], callback=self.parse3)
def parse(self, response):
selector = Selector(response)
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
info = InfoItem()
if text0:
num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0) # number of weibo posts
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_tweets:
info["num_tweets"] = int(num_tweets[0])
if num_follows:
info["num_follows"] = int(num_follows[0])
if num_fans:
info["num_fans"] = int(num_fans[0])
url_information1 = "http://weibo.cn/%s/info" % self.next_ID[-1]
yield Request(url=url_information1, meta={"item":info,"ID":self.next_ID[-1]}, dont_filter=True, callback=self.parse1)
# randomly decide whether to harvest the fans list or the follows list first
if random.random() > float(info["num_follows"])/(info["num_follows"] + info["num_fans"]):
try:
url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
except:
url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
else:
try:
url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
except:
url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
def parse4(self, response):
""" ????????????? """
selector = Selector(response)
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
if text0:
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_follows and num_fans:
self.degree_v = int(num_fans[0]) + int(num_follows[0])
else:
self.degree_v = False
def get_list_id(self, response):
selector = Selector(response)
# collect the playlist-category links; the slice below drops the last one
url_list = selector.xpath('//body//a[@class="s-fc0"]/@href')[:-1].extract()
type_ = 0
for url in url_list:
type_ += 1
yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url), method='GET',
callback=self.parse_song_list, headers=self.headers, meta={'type': type_})
def parse_song_list(self, response):
selector = Selector(response)
song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
for index, id_ in enumerate(song_id_list):
l = ItemLoader(item=SongListItem())
l.add_value('song_name', song_name_list[index])
l.add_value('type', response.meta['type'])
yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
headers=self.headers, callback=self.parse_single_song)
def get_list_id(self, response):
selector = Selector(response)
# collect the playlist links on the page
url_list = selector.xpath('//body//p[@class="dec"]/a/@href').extract()
for url in url_list:
yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url), method='GET',
callback=self.parse_song_list, headers=self.headers)
def parse_song_list(self, response):
selector = Selector(response)
song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
title = selector.xpath('//title/text()').extract()
for index, id_ in enumerate(song_id_list):
l = ItemLoader(item=PlayListItem())
l.add_value('song_name', song_name_list[index])
l.add_value('title', title)
yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
headers=self.headers, callback=self.parse_single_song)
def parse_items(self, response):
print "------------"
print(response.url)
print("----------")
from scrapy.selector import Selector
import json
category = response.meta['category']['category']
sub_category = response.meta['category']['sub_category']
response_json = json.loads(response.body)
required_text = response_json["result"]["html"]
selector = Selector(text=required_text)
all_items = selector.xpath('//div[contains(@class, "grid_item")]')
for each_item in all_items:
name = each_item.xpath('.//div[@class="title"]/a/text()').extract_first()
price = each_item.xpath('.//span[@class="price"]/text()').extract_first()
image_urls = [each_item.xpath(".//img/@src").extract_first()]
affiliate_link = each_item.xpath(".//a/@href").extract_first()
website = "polyvore.com"
brand = [i for i in ALL_BRANDS if i.lower() in name.lower()]
if brand:
brand = brand[0]
print ("brand", brand)
else:
print (name, brand, "exited")
continue
item = ProductItem(
name=name.strip(),
price=price.strip(),
image_urls=image_urls,
brand=brand.strip(),
affiliate_link=affiliate_link,
category=category,
sub_category=sub_category,
website=website
)
yield item
if response_json["result"]["more_pages"] == "1":
next_page = int(response_json["result"]["page"]) + 1
else:
return
next_link = url_to_use.format(str(next_page), urllib.quote(sub_category))
my_request = scrapy.Request(
next_link,
self.parse_items)
my_request.meta['category'] = {
"sub_category": sub_category,
"category": category,
}
yield my_request
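The list comprehension over ALL_BRANDS keeps whichever matching brand comes first in the list; when one brand name contains another (say 'Gap' and 'Gap Kids'), preferring the longest match is safer. A small sketch (the ALL_BRANDS contents are whatever the project defines):

def best_brand(name, all_brands):
    """Return the longest brand whose name appears in the product title, or None."""
    matches = [b for b in all_brands if b.lower() in name.lower()]
    return max(matches, key=len) if matches else None

print(best_brand("Gap Kids denim jacket", ["Gap", "Gap Kids"]))  # -> Gap Kids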
def parse1(self, response):
selector = Selector(response)
infoItem = response.meta["item"]
ID = response.meta["ID"]
text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract()) # ????????text()
nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1) # ??
gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1) # ??
place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1) # ???????????
signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1) # ????
birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1) # ??
sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1) # ???
marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1) # ????
url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1) # ????
if nickname:
infoItem['nickname'] = nickname[0]
if gender:
infoItem['gender'] = gender[0]
if place:
place = place[0].split(" ")
infoItem["province"] = place[0]
if len(place) > 1:
infoItem["city"] = place[1]
if signature:
infoItem["signature"] = signature[0]
if birthday:
try:
birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
infoItem["birthday"] = birthday - datetime.timedelta(hours=8)
except Exception:
pass
if sexorientation:
if gender and sexorientation[0] == gender[0]:
infoItem["sexorientation"] = "gay"
else:
infoItem["sexorientation"] = "Heterosexual"
if marriage:
infoItem["marriage"] = marriage[0]
if url:
infoItem["url"] = url[0]
infoItem["user_id"] = ID
yield infoItem
############ schedule the next user to crawl #########
if len(self.scrawl_ID) > 0:
ID = self.scrawl_ID.popleft()
self.finish_ID.add(ID)
url_main = "http://weibo.cn/u/%s" % ID
url_fans = "http://weibo.cn/%s/fans" % ID
url_follows = "http://weibo.cn/%s/follow" % ID
# when the queue runs low, harvest more candidate IDs from the fans/follows lists
if len(self.scrawl_ID) < 4:
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
yield Request(url=url_main, meta={"ID":ID}, dont_filter=True, callback=self.parse)
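The scheduling block above relies on spider state that these snippets never show; judging from popleft() and .add(), something like this minimal scaffold (the class name and seed value are illustrative):

import scrapy
from collections import deque

class WeiboSpider(scrapy.Spider):  # hypothetical class name
    name = 'weibo'
    scrawl_ID = deque(['1234567890'])  # queue of user IDs still to crawl (example seed)
    finish_ID = set()                  # user IDs already crawled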