def parse_book(self, response):
item = BookItem()
sel = Selector(response)
e = sel.xpath("//div[@id='wrapper']")
item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()
    request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the book's hot-comments page
request.meta['item'] = item
return request
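The parse_review callback is not part of this listing. A minimal sketch of the receiving end, assuming a hypothetical review field on BookItem and a guessed XPath for the hot-comments page:

def parse_review(self, response):
    # Pick the half-filled item back up from the request meta.
    item = response.meta['item']
    # Hypothetical field and XPath: collect the hot-comment texts.
    item['review'] = response.xpath('//div[@class="comment"]/p/text()').extract()
    yield item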
def parse_item(self, response):
item = BookItem()
sel = Selector(response)
e = sel.xpath("//div[@id='wrapper']")
item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
return item
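Every field above stores the raw list returned by extract(), either one element long or empty. Where scalar values are wanted, extract_first() with a default avoids both the wrapping list and an IndexError on a missing node. A minimal sketch of the same idea, reusing the selectors and imports above:

def parse_item_scalars(self, response):
    # Hypothetical variant of parse_item: store scalars, not lists.
    e = Selector(response).xpath("//div[@id='wrapper']")
    item = BookItem()
    item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract_first(default='')
    item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first(default='')
    return item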
def parse(self, response):
item = BookItem()
sel = Selector(response)
e = sel.xpath("//div[@id='wrapper']")
item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()
    request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the book's hot-comments page
request.meta['item'] = item
return request
def parse(self, response):
list_types = Selector(response).xpath('//div[@class="listado_1"]//ul/li/a')
for types in list_types:
        href = types.xpath("./@href").extract()
text = types.xpath("./text()").extract()
if Terms.filterBytype(text[0]):
type = Terms.getType(text[0])
initiative_url = Utils.createUrl(response.url,href[0])
yield scrapy.Request(initiative_url,errback=self.errback_httpbin,callback=self.initiatives, meta={'type': type})
"""
urlsa = ""
urlsa = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335503_73_1335500_1335500.next_page=/wc/servidorCGI&CMD=VERLST&BASE=IW12&PIECE=IWC2&FMT=INITXD1S.fmt&FORM1=INITXLUS.fmt&DOCS=100-100&QUERY=%28I%29.ACIN1.+%26+%28161%29.SINI."
yield scrapy.Request(urlsa, errback=self.errback_httpbin, callback=self.oneinitiative,
meta={'type': u"Proposición no de Ley en Comisión"})
"""
def initiatives(self, response):
type = response.meta['type']
first_url = Selector(response).xpath('//div[@class="resultados_encontrados"]/p/a/@href').extract()[0]
num_inis = Selector(response).xpath('//div[@class="SUBTITULO_CONTENIDO"]/span/text()').extract()
split = first_url.partition("&DOCS=1-1")
for i in range(1,int(num_inis[0])+1):
new_url = split[0]+"&DOCS="+str(i)+"-"+str(i)+split[2]
initiative_url = Utils.createUrl(response.url,new_url)
CheckItems.addElement(initiative_url)
        if not Blacklist.getElement(initiative_url):  # crawl only initiatives that are not blacklisted
            yield scrapy.Request(initiative_url, errback=self.errback_httpbin,
                                 callback=self.oneinitiative, meta={'type': type})
def parse_items(self, response):
hxs = Selector(response)
jobs = hxs.xpath('//div[contains(@class, "searchResultTitle")]')
items = []
for job in jobs:
item = Job()
item["title"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/text()').extract()[0].strip()
company = job.xpath('.//p/span[contains(@id, "CONTACT_OFFICE")]/text()').extract()
item["company"] = company[0].strip() if company else "n/a"
item["location"] = job.xpath('.//p/span[contains(@id, "FREE_LOCATION")]/text()').extract()[0].strip()
item["url"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/@href').extract()[0]
item["date_posted"] = job.xpath('.//p/span[contains(@id, "POSTED_DATE")]/text()').extract()[0].strip()
salary = job.xpath('.//p/span[contains(@id, "SALARY")]/text()').extract()
item["salary"] = salary[0].strip() if salary else "n/a"
item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d")
item["job_board"] = "dice"
items.append(item)
return items
def parse(self, response):
hxs = Selector(response)
jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
items = []
for job in jobs:
item = Job()
item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract()[0]
item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
item["date_posted"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
item["tags"] = job.css('.-tags p a.post-tag::text').extract()
item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d")
item["job_board"] = "stackOverflow"
items.append(item)
return items
def parse_page(self, response):
sel = Selector(text = response.body)
infos = sel.xpath('//tr[@class="odd"]').extract()
for info in infos:
val = Selector(text = info)
ip = val.xpath('//td[2]/text()').extract_first()
port = val.xpath('//td[3]/text()').extract_first()
country = val.xpath('//td[4]/a/text()').extract_first()
anonymity = val.xpath('//td[5]/text()').extract_first()
proxy = Proxy()
proxy.set_value(
ip = ip,
port = port,
country = country,
anonymity = anonymity,
source = self.name,
)
self.add_proxy(proxy = proxy)
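Extracting every tr to a string and re-wrapping it in Selector(text=...) parses each row twice. Iterating the row selectors directly with relative XPath yields the same fields in one pass; a sketch of the loop under that assumption:

def parse_page(self, response):
    # Each row is already a Selector, so relative XPath such as
    # 'td[2]/text()' stays scoped to that row.
    for row in response.xpath('//tr[@class="odd"]'):
        proxy = Proxy()
        proxy.set_value(
            ip=row.xpath('td[2]/text()').extract_first(),
            port=row.xpath('td[3]/text()').extract_first(),
            country=row.xpath('td[4]/a/text()').extract_first(),
            anonymity=row.xpath('td[5]/text()').extract_first(),
            source=self.name,
        )
        self.add_proxy(proxy=proxy)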
def parse_page(self, response):
super(ProxyRoxSpider, self).parse_page(response)
data = response.xpath('//tr[@class="fat"]').extract()
for i, d in enumerate(data):
sel = Selector(text = d)
ip_port = sel.xpath('//td/a/text()').extract_first()
ip = ip_port.split(':')[0]
port = ip_port.split(':')[1]
country = sel.xpath('//td/span[@class="region"]/text()').extract_first()
anonymity = sel.xpath('//td/span/text()').extract_first()
proxy = Proxy()
proxy.set_value(
ip = ip,
port = port,
country = country,
anonymity = anonymity,
source = self.name
)
self.add_proxy(proxy = proxy)
def parse_page(self, response):
super(ProxyDBSpider, self).parse_page(response)
data = response.xpath('//tbody/tr').extract()
for i, d in enumerate(data):
sel = Selector(text = d)
ip_port = sel.xpath('//td/a/text()').extract_first()
ip = ip_port.split(':')[0]
port = ip_port.split(':')[1]
country = sel.xpath('//td/img/@title').extract_first()
anonymity = sel.xpath('//td/span[@class="text-success"]/text()').extract_first()
proxy = Proxy()
proxy.set_value(
ip = ip,
port = port,
country = country,
anonymity = anonymity,
source = self.name
)
self.add_proxy(proxy = proxy)
def parse_salaries(self, response):
"""
The values about person salary is in another table
in another page, that function grab all the table headers
and values and assign to the entity[entity_id]
The id was passed in the response.meta
"""
item = VereadorItem()
item['name'] = response.meta['name']
item['id'] = response.meta['entity_id']
item['mesano'] = response.meta['mesano']
for salary in response.xpath('//*[@id="holerite"]').extract():
selector = Selector(text=salary)
table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
item["salary_gross"] = table[0]
item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
return item
def parse(self, response):
sel = Selector(response)
self.item = AccountItem()
self.item['oj'] = 'poj'
self.item['username'] = self.username
if self.is_login:
try:
self.item['rank'] = sel.xpath('//center/table/tr')[1].\
xpath('.//td/font/text()').extract()[0]
self.item['accept'] = sel.xpath('//center/table/tr')[2].\
xpath('.//td/a/text()').extract()[0]
self.item['submit'] = sel.xpath('//center/table/tr')[3].\
xpath('.//td/a/text()').extract()[0]
yield Request(self.accepted_url % self.username,
callback = self.accepted
)
self.item['status'] = 'Authentication Success'
        except Exception:
self.item['status'] = 'Unknown Error'
else:
self.item['status'] = 'Authentication Failed'
yield self.item
def parse_item(self, response):
item = DoubanmovieItem()
sel = Selector(response)
title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
commit_num = sel.xpath(
'//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
star = sel.xpath(
'//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
director = sel.xpath(
'//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
screenwriter = sel.xpath(
'//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]
item['title'] = title
item['date'] = year
item['star'] = star
item['commit_num'] = commit_num
item['director'] = director
item['screenwriter'] = screenwriter
return item
def parse(self, response):
selector = Selector(response)
articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
for article in articles:
item = Jianshu2Item()
url = article.xpath('div/h4/a/@href').extract()
likeNum = article.xpath('div/div/span[2]/text()').extract()
posturl = 'http://www.jianshu.com'+url[0]
if len(likeNum) == 0:
item['likeNum'] = 0
else:
item['likeNum'] = int(likeNum[0].split(' ')[-1])
request = Request(posturl,callback=self.parse_donate)
request.meta['item'] = item
yield request
    next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract_first()
    if next_link:  # extract_first() returns None on the last page instead of raising IndexError
        next_link = self.url + str(next_link)
        yield Request(next_link, callback=self.parse)
def parse_article(self,response):
hxs = Selector(response)
keyword = response.meta['keyword']
movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
movie_roles_paths = hxs.xpath('//*[@id="info"]/span[3]/span[2]')
movie_roles = []
for movie_roles_path in movie_roles_paths:
        movie_roles = movie_roles_path.xpath('.//*[@rel="v:starring"]/text()').extract()
movie_classification= hxs.xpath('//span[@property="v:genre"]/text()').extract()
douban_item = DoubanItem()
douban_item['movie_keyword'] = keyword
    # normalise separators and escape quotes, apparently to keep the fields safe for later CSV/SQL handling
    douban_item['movie_name'] = ''.join(movie_name).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';').replace(' ', '')
    douban_item['movie_roles'] = ';'.join(movie_roles).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    douban_item['movie_classification'] = ';'.join(movie_classification).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
article_link = hxs.xpath('//*[@id="review_section"]/div/div/div/h3/a/@href').extract()
tmp = "https://movie.douban.com/review/"
for item in article_link:
if tmp in item:
yield Request(item,meta={'item': douban_item},callback=self.parse_item,cookies=[{'name': 'COOKIE_NAME','value': 'VALUE','domain': '.douban.com','path': '/'},])
pictureSpider_demo.py (project: PythonCrawler-Scrapy-Mysql-File-Template, author: lawlite19)
def parse(self, response):
    se = Selector(response)  # build a Selector from the response (successor to the old HtmlXPathSelector)
    if re.match(r"http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url):  # only wallpaper list pages match
        src = se.xpath("//ul[@class='pic-list2 clearfix']/li")  # every <li> thumbnail under the list <ul>
        for i in range(1, len(src) + 1):  # XPath positions are 1-based, so start at 1
            imgURLs = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@src" % i).extract()  # thumbnail URL
            titles = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@title" % i).extract()
            if imgURLs:
                realUrl = imgURLs[0].replace("t_s208x130c5", "t_s2560x1600c5")  # swap the thumbnail size token for the full 2560x1600 version
                file_name = u"%s.jpg" % titles[0]  # name the file after the image title
                path = os.path.join(r"D:\pics", file_name)  # save under D:\pics
                type = sys.getfilesystemencoding()
                print file_name.encode(type)
                item = WebcrawlerScrapyItem()  # fill the item declared in items.py
                item['name'] = file_name
                item['url'] = realUrl
                print item["name"], item["url"]
                yield item  # hand the item to the pipelines
                urllib.urlretrieve(realUrl, path)  # blocking download of the full-size image
    all_urls = se.xpath("//a/@href").extract()  # collect every link on the page
    for url in all_urls:
        if url.startswith("/fengjing/1920x1080/"):  # follow only the 1920x1080 landscape listings
            yield Request("http://desk.zol.com.cn" + url, callback=self.parse)
def parse(self, response):
#obtains links from page to page and passes links to parse_playerURL
sel = Selector(response) #define selector based on response object (points to urls in start_urls by default)
url_list = sel.xpath('//tbody/tr/td[@class="player"]/a/@href') #obtain a list of href links that contain relative links of players
for i in url_list:
relative_url = self.clean_str(i.extract()) #i is a selector and hence need to extract it to obtain unicode object
print urljoin(response.url, relative_url) #urljoin is able to merge absolute and relative paths to form 1 coherent link
req = Request(urljoin(response.url, relative_url),callback=self.parse_playerURL) #pass on request with new urls to parse_playerURL
req.headers["User-Agent"] = self.random_ua()
yield req
next_url=sel.xpath('//div[@class="right-nav pull-right"]/a[@rel="next"]/@href').extract_first()
if(next_url): #checks if next page exists
clean_next_url = self.clean_str(next_url)
reqNext = Request(urljoin(response.url, clean_next_url),callback=self.parse) #calls back this function to repeat process on new list of links
yield reqNext
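self.random_ua() is defined elsewhere in this spider; a minimal sketch of what such a helper commonly looks like (the UA strings are placeholders):

import random

# Rotate the User-Agent per request so consecutive hits look less uniform.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36",
]

def random_ua():
    return random.choice(USER_AGENTS)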
huawei_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse(self, response):
page = Selector(response)
hrefs = page.xpath('//h4[@class="title"]/a/@href')
for href in hrefs:
url = href.extract()
yield scrapy.Request(url, callback=self.parse_item)
div = page.xpath('//div[@class="page-ctrl ctrl-app"]')
hrefs = div.xpath('.//a/@href').extract()
for href in hrefs:
url = response.urljoin(href)
print url
# yield scrapy.Request(url, self.parse, meta={
# 'splash': {
# 'endpoint': 'render.html',
# 'args': {'wait': 0.5}
# }
# })
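The commented-out request above carries the raw splash meta dict; if scrapy-splash is installed, its SplashRequest helper expresses the same render-and-wait fetch. A sketch of the method under that assumption:

from scrapy_splash import SplashRequest

def parse_paged(self, response):
    # Hypothetical rewrite of the commented-out loop: 'render.html'
    # is SplashRequest's default endpoint, so only the wait is passed.
    div = response.xpath('//div[@class="page-ctrl ctrl-app"]')
    for href in div.xpath('.//a/@href').extract():
        yield SplashRequest(response.urljoin(href), self.parse,
                            args={'wait': 0.5})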
huawei_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse_item(self, response):
page = Selector(response)
item = AppstoreItem()
item['title'] = page.xpath('//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()').extract_first().encode('utf-8')
item['url'] = response.url
item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
item['intro'] = page.xpath('//meta[@name="description"]/@content').extract_first().encode('utf-8')
divs = page.xpath('//div[@class="open-info"]')
recomm = ""
for div in divs:
url = div.xpath('./p[@class="name"]/a/@href').extract_first()
recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
name = div.xpath('./p[@class="name"]/a/text()').extract_first().encode('utf-8')
recomm += "{0}:{1},".format(recommended_appid, name)
item['recommended'] = recomm
yield item
xiaomi_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse_page(self, response):
page = Selector(response)
lis = page.xpath('//ul[@class="applist"]/li')
    if not lis:  # xpath() returns an empty SelectorList rather than None
        return
url_common = 'http://app.mi.com'
for li in lis:
item = XiaomiAppstoreCrawlerItem()
item['title'] = li.xpath('./h5/a/text()').extract_first().encode('utf-8')
url = li.xpath('./h5/a/@href').extract_first()
appid = re.match(r'/detail/(.*)', url).group(1)
item['appid'] = appid
req = scrapy.Request(url_common + url, callback=self.parse_details)
req.meta["item"] = item
yield req
def parse_item(self, response):
url_trim = response.url.split('?')[0]
page = Selector(response)
title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
status = response.status
item = FashionItem()
item['url'] = url_trim
item['title'] = title.encode('utf-8')
item['images'] = images
item['availability'] = availability.encode('utf-8')
item['status'] = status
return item
def getMusListToFile(qqid, line, browser, filename):
m_url = 'http://g.gogoqq.com/music.htm?uin=%s' % qqid
browser.get(m_url)
#time.sleep(2)
WebDriverWait(browser, 2, 0.5).until(lambda item:item.find_element_by_xpath('//*[@id="list"]').is_displayed())
time.sleep(1)
liList = Selector(text = browser.page_source).xpath(u'//*[@id="list"]/li/a')
mList = []
for m in liList:
mus = m.xpath('text()')[0].extract()
print mus
mList.append(mus)
    with open(filename, 'a') as f:  # append this user's music list
        f.write(line + ' #music#:' + '##m##'.join(mList) + '\n')
def parse(self, response):
sel = Selector(response)
movie_name = sel.xpath("//div[@class='pl2']/a/text()[1]").extract()
movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
movie_star = sel.xpath("//div[@class='pl2']/div/span[@class='rating_nums']/text()").extract()
    # item = DoubanNewMovieItem()
    item = {}
    item['movie_name'] = movie_name
    item['movie_star'] = movie_star
    item['movie_url'] = movie_url
yield item
print(item['movie_name'], item['movie_star'], item['movie_url'])
def parse_category(self, response):
self.log("=================================================")
sel = Selector(response)
shop_type = response.meta['shop_type']
city_id = response.meta['city_id']
cat_url = response.url
http_status = response.status
self.log("http_url = %s" % cat_url)
self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
self.log("shop_type = %s" % shop_type)
shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
self.log("shop_list_len = %d" % len(shop_list))
for shop in shop_list:
uri = shop.xpath('a/@href').extract()[0]
self.log("page_uri = %s" % uri)
yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
def parse_category(self, response):
self.log("=================================================")
sel = Selector(response)
shop_type = response.meta['shop_type']
city_id = response.meta['city_id']
cat_url = response.url
http_status = response.status
self.log("http_url = %s" % cat_url)
self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
self.log("shop_type = %s" % shop_type)
#shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
region_list = sel.xpath('//div[@id="region-nav"]/a')
self.log("region_list_len = %d" % len(region_list))
for region in region_list:
uri = region.xpath('@href').extract()[0]
self.log("page_uri = %s" % uri)
#yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
yield scrapy.Request(uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
def parse(self, response):
sel = Selector(response)
cat_url = response.url
http_status = response.status
self.log("http_url = %s" % cat_url)
self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
item = SpiderDianpingXmtItem()
item['chenshi_name'] = ""
item['shop_type'] = 0
item['shop_url'] = ""
item['shop_name'] = ""
item['shop_addr'] = ""
item['shop_mobile'] = ""
item['shop_intro'] = ""
return item
def parse(self, response):
sel = Selector(response)
    if "shop_type" in response.meta:  # has_key() was removed in Python 3
        shop_type = response.meta['shop_type']
    else:
        shop_type = self.shop_type_map[response.url]['shop_type']
    if "city_id" in response.meta:
        city_id = response.meta['city_id']
    else:
        city_id = self.shop_type_map[response.url]['city_id']
cat_url = response.url
http_status = response.status
self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
self.log("shop_type = %s" % shop_type)
shop_list = sel.xpath('//div[@id="region-nav"]/a')
for shop in shop_list:
uri = shop.xpath('@href').extract()[0]
self.log("page_uri = %s" % uri)
yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
def parse(self, response):
sel = Selector(response)
xiaoqu_uri = sel.xpath('//span[@class="title"]/a/@href').extract()[0]
xiaoqu_list = xiaoqu_uri.split('/')
xiaoqu_id = xiaoqu_list[2]
items = []
house_lists = sel.xpath('//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
for house in house_lists:
item = SpiderScrapyLianjiaItem()
item['xiaoqu_id'] = xiaoqu_id
item['house_id'] = house.xpath('@data-id').extract()[0]
item['title'] = house.xpath('div[@class="info-panel"]/h2/a/text()').extract()[0]
item['price'] = house.xpath('div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span/text()').extract()[0]
item['view_count'] = house.xpath('div[@class="info-panel"]/div[@class="col-2"]/div[@class="square"]/div/span/text()').extract()[0]
#item['size'] = house.xpath('div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span/text()').extract()
items.append(item)
return items
def parse_item(self, response):
item = Cl1024Item()
item['cl_title'] = response.meta['cl_title']
item['cl_url'] = response.meta['cl_url']
item['cl_bankuai'] = response.meta['cl_bankuai']
item['posted'] = response.meta['posted']
sel = Selector(response)
downloaded = sel.xpath('//td/table/tr/td/text()').extract()[2]
item['torrent_downloaded'] = downloaded[17:]
item['torrent_url'] = response.url
ref = sel.xpath('//input[@name="ref"]/@value').extract_first()
reff = sel.xpath('//input[@name="reff"]/@value').extract_first()
dl = ('http://www.rmdown.com/download.php?ref=%s&&reff=%s&submit=download' % (ref, reff)).encode('utf-8')
item['torrent_download_urls'] = dl
yield item