def parse_video_or_audio(self, response):
    """Classify the fetched body as video or audio and route accordingly.

    For a video, requests the v.qq.com player page (parsed by
    ``parse_info``); for audio, records the direct media URL, appends a
    file extension, and yields the item.
    """
    item = response.meta['item']
    item['media_type'], result = self.__video_or_audio(response.body)
    item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
    self.logger.info('type: {}, result: {} url: {}'.format(item['media_type'], result, response.url))
    if item['media_type'] == 'video':
        url = 'https://v.qq.com/x/page/{}.html'.format(result)
        meta = {
            'item': item,
            'vid': result,
        }
        yield scrapy.FormRequest(url, method='GET', meta=meta, callback=self.parse_info)
    elif item['media_type'] == 'audio':
        item['media_urls'] = [result]
        # Take the extension from the LAST dot of the URL path. The old
        # split('.')[1] picked the wrong segment when the path contained
        # extra dots (e.g. '/a.b/song.mp3' -> 'b/song').
        ext = os.path.splitext(urlparse(result).path)[1]
        item['file_name'] += ext if len(ext) > 1 else '.mp3'
        yield item
# Collected examples of scrapy FormRequest() usage (scraped snippets)
def parse(self, response):
    """Page through a user's video listing, issuing one GET per 50 items."""
    user = response.meta['user']
    raw_count = response.xpath('//h3[@node-type="hdTitle"]/following-sibling::span/text()'
                               ).extract()[0]
    total = int(raw_count[1:-1].replace(',', ''))
    query = {
        'spm': 'a2hzp.8253869.0.0',
        'order': '1',
        'last_item': '',
        # 'last_vid': re.search(r'last_vid=(\d+)', response.body),
    }
    base_url = response.url.split('?')[0]
    page_size = 50
    page_no = 1
    fetched = 0
    while fetched < total:
        query['page'] = str(page_no)
        yield scrapy.FormRequest(url=base_url, method='GET', meta={'user': user},
                                 formdata=query, callback=self.parse_items)
        fetched = page_size * page_no
        page_no += 1
def parse_video_url(self, response):
    """Extract the Youku vid from the page URL and query the ups JSON API."""
    item = response.meta['item']
    # Escape the dot so '.html' is matched literally (the original '.html'
    # matched any character before 'html'). The '|$' alternative keeps
    # search() always succeeding, with group(1) None when no vid exists.
    vid = re.search(r'id_(.*?)\.html|$', response.url).group(1)
    if vid is None:
        self.logger.error('url: {}, error: failed to find vid'.format(response.url))
        return
    params = {
        'vid': vid,
        'ccode': '0401',
        'client_ip': '192.168.1.1',
        'utid': 'tB2PEWHIKgECAbaWLjUeiFyE',
        # Current epoch seconds, as the API expects.
        'client_ts': str(round(time.time())),
    }
    url = 'https://ups.youku.com/ups/get.json'
    yield scrapy.FormRequest(url, method='GET', meta={'item': item}, formdata=params,
                             callback=self.parse_download_url)
def pass_valid(self, response):
    """Save the captcha image, ask the operator to read it, and submit the
    ASP.NET form, carrying the hidden state fields over from the meta."""
    print("?????")
    captcha_img = Image.open(BytesIO(response.body))
    captcha_img.save("yz.png")
    code = input("?? yz.png,??????")
    form = {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": response.meta['view_state'],
        "__EVENTVALIDATION": response.meta['event_validation'],
        "txt_ValidCode": code,
        "btnSubmit": "? ?",
    }
    if response.meta['type'] == 'zz':
        callback = self.parse_zz
    else:
        callback = self.parse_bid
    yield scrapy.FormRequest(response.meta['last_url'], meta={"cookiejar": response.meta["cookiejar"]},
                             formdata=form, callback=callback, dont_filter=True)
def lohin_after_captcha(self, response):
    """Write the captcha image to disk, prompt the operator for its text,
    then post the Zhihu phone-number login form with the captcha filled in."""
    with open("captcha.jpg", "wb") as f:
        f.write(response.body)
    code = input('???????')
    form = response.meta.get('post_data', {})
    form['captcha'] = code
    login_url = "https://www.zhihu.com/login/phone_num"
    return [scrapy.FormRequest(
        url=login_url,
        formdata=form,
        headers=self.headers,
        callback=self.check_login
    )]
def parse_(self, response):
    """Parse one page of the China Unicom bidding list; keep paging while
    the items on this page were published today."""
    detail = response.xpath('//table[@bordercolor="lightgray"]/tr')
    last_pubtime = None
    # The final row is the pager, so skip it.
    for temp in detail[:-1]:
        item = SiteItem()
        item['title'] = temp.xpath('td/span/@title').extract_first().strip()
        if temp.xpath('td/span/@onclick').extract_first():
            # The link lives in an onclick handler like open('url',...);
            # strip the call wrapper and quotes.
            item['link'] = 'http://www.chinaunicombidding.cn' + \
                (temp.xpath('td/span/@onclick').extract_first()).split(',')[0].split(
                    '(')[1][1:-1].strip()
        item['pubtime'] = temp.xpath('td[@width="15%"]/text()').extract_first().strip()
        last_pubtime = item['pubtime']
        yield item
    nowPage = str(int(response.xpath('//span[@id="nowPage"]/text()').extract_first()) + 1)
    print('nowpage======================================' + str(nowPage))
    # Use a tracked variable: the original referenced the loop variable
    # `item` after the loop, a NameError when the row list was empty.
    if last_pubtime == date.get_curdate():
        yield scrapy.FormRequest(
            "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=" + nowPage,
            formdata={
                "type": "",
                "province": "",
                "city": "",
                "notice": "",
                "time1": "",
                "time2": ""
            }, callback=self.parse_)
def parse(self, response):
    """Parse the zycg.gov.cn announcement list; follow the next page while
    today's items keep appearing."""
    detail = response.xpath('//ul[@class="lby-list"]//li')
    pubtime = None
    # Only the first 20 <li> entries are real announcements.
    for temp in detail[:20]:
        item = SiteItem()
        # Date text looks like '[YYYY-MM-DD ...'; chars 1..10 are the date.
        temp_pubtime = temp.xpath('span/text()').extract_first().strip()[1:11]
        if temp_pubtime:
            item['pubtime'] = temp_pubtime
            pubtime = item['pubtime']
        item['title'] = temp.xpath('a//text()').extract_first()
        # print() function instead of the original Python-2 print statement
        # (a SyntaxError under Python 3, and inconsistent with this file).
        print("------------------------------{}----".format(item['title']))
        if temp.xpath('a/@href').extract_first():
            item['link'] = "http://www.zycg.gov.cn" + temp.xpath('a//@href').extract_first()
        yield item
    # Follow the pager only while this page still contains today's items.
    if pubtime == date.get_curdate():
        next_page_href = "http://www.zycg.gov.cn" + (
            str(response.xpath('//a[@class="next_page"]//@href').extract_first()))
        yield scrapy.FormRequest(next_page_href, callback=self.parse)
def parse(self, response):
    """Parse one page of gdgpo.gov.cn announcements and POST for the next
    page while today's items are still present."""
    detail = response.xpath('//ul[@class="m_m_c_list"]/li')
    last_pubtime = None
    for temp in detail:
        item = SiteItem()
        item['title'] = temp.xpath('a/text()').extract_first().strip()
        item['link'] = "http://www.gdgpo.gov.cn" + temp.xpath('a/@href').extract_first().strip()
        item['pubtime'] = temp.xpath('em/text()').extract_first().strip()[0:10]
        last_pubtime = item['pubtime']
        print("------------------------------------------------------------------------------")
        yield item
    # Tracked variable instead of referencing the loop's `item` after the
    # loop (NameError on an empty list). The unused `pageindex` lookup from
    # the original was dropped.
    if date.get_curdate() == last_pubtime:
        self.iipage += 1
        # The "last page" link looks like javascript:gotoPage(N); pull N out.
        last_page = response.xpath(
            u'//a/span[contains(text(),"? ?")]/../@href').extract_first()
        total_pagenum = last_page.split('(')[1][:-1]
        if int(self.iipage) < int(total_pagenum):
            yield scrapy.FormRequest("http://www.gdgpo.gov.cn/queryMoreInfoList.do",
                                     formdata={
                                         "sitewebId": "4028889705bebb510105bec068b00003",
                                         "channelCode": '0005',
                                         'pageIndex': str(self.iipage),
                                         'pageSize': "15",
                                         'pointPageIndexId': "1"
                                     }, callback=self.parse)
def login_after_captcha(self, response):
    """Save the captcha, best-effort display it, prompt the operator, and
    post the Zhihu phone-number login form."""
    with open('captcha.jpg', 'wb') as f:
        f.write(response.body)
    # Displaying the image is best-effort (headless hosts have no display);
    # failures are deliberately ignored, but the bare `except:` is narrowed
    # so KeyboardInterrupt/SystemExit still propagate.
    from PIL import Image
    try:
        img = Image.open('captcha.jpg')
        img.show()
        img.close()
    except Exception:
        pass
    captcha = input('??????')
    post_data = response.meta.get('post_data', {})
    post_url = 'https://www.zhihu.com/login/phone_num'
    post_data['captcha'] = captcha
    return scrapy.FormRequest(post_url, formdata=post_data, headers=self.headers, callback=self.check_login)
def parse_single_song(self, response):
    """Fill the song loader (singer, _id, comment_id) and fire the two
    follow-up POSTs: the comment API and the source-url API."""
    loader = response.meta['loader']
    song_id = response.meta['song_id']
    title_texts = Selector(response).xpath('//title/text()').extract()
    loader.add_value('singer', title_texts)
    loader.add_value('_id', song_id)
    comment_form, comment_api = api_comment(song_id, 0, 100)
    source_form, source_api = api_song_url(song_id)
    cid = generate_comment_index()['comment_index']
    loader.add_value('comment_id', cid)
    yield scrapy.FormRequest(url=comment_api, method='POST', headers=self.headers,
                             formdata=comment_form, callback=self.parse_comments,
                             meta={'comment_id': cid})
    yield scrapy.FormRequest(url=source_api, method='POST', headers=self.headers,
                             formdata=source_form, meta={'loader': loader}, callback=self.get_source_url)
def parse_single_song(self, response):
    """Populate the item loader for one song, then request its comments and
    its source URL via POST APIs.

    Expects ``response.meta`` to carry ``loader`` (an item loader) and
    ``song_id``. Yields two FormRequests: one handled by
    ``parse_comments``, one by ``get_source_url``.
    """
    loader = response.meta['loader']
    selector = Selector(response)
    # The page <title> text is stored as the singer value here.
    singer = selector.xpath('//title/text()').extract()
    loader.add_value('singer', singer)
    loader.add_value('_id', response.meta['song_id'])
    # api_comment / api_song_url presumably return (form-data, url) pairs —
    # their definitions are outside this file.
    comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
    source_data, source_url = api_song_url(response.meta['song_id'])
    comment_id = generate_comment_index()['comment_index']
    loader.add_value('comment_id', comment_id)
    yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                             formdata=comment_data, callback=self.parse_comments,
                             meta={'comment_id': comment_id})
    yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                             formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
def generate_firm_susong(self, response):
    """Record a firm's litigation page, then fan out: one POST per court
    document listed on the page, plus a request for the next result page."""
    # Near-empty bodies mean no (more) results — stop recursing.
    if len(response.body) < 10:
        return
    qitem = response.meta["item"]
    page_n = response.meta["page_n"]
    self.append_susong_detail({"????": self.clean_content(response.body)}, qitem._id)
    anjian_list = response.xpath("//table[@class='m_changeList']//a[@class='c_a']/@onclick").extract()
    anjian_name = response.xpath("//table[@class='m_changeList']//tr//td[2]//a[@class='c_a']/text()").extract()
    # Pair each onclick handler with its case name instead of indexing.
    for onclick, case_name in zip(anjian_list, anjian_name):
        yield scrapy.FormRequest(
            "http://www.qichacha.com/company_wenshuView",
            callback=self.generate_firm_anjian,
            cookies=self.qicha_cookie,
            method='POST',
            # dont_filter takes a bool; the original passed the string
            # "true" (truthy, but misleading).
            dont_filter=True,
            formdata={"id": self.generate_anjian_id(onclick)},
            meta={"item_id": qitem._id, "anjian_name": case_name}
        )
    # Queue the next page of litigation results.
    yield scrapy.Request(
        response.meta["chacha_url_pre"] + '&tab=susong&box=wenshu&p=' + str(page_n),
        encoding='utf-8',
        callback=self.generate_firm_susong,
        cookies=self.qicha_cookie,
        meta={"item": qitem, "chacha_url_pre": response.meta["chacha_url_pre"], "page_n": int(page_n)+1}
    )
def start_requests(self):
    """Kick off the crawl by selecting legislature 12 on congreso.es."""
    url = ("http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/"
           "Indice%20de%20Iniciativas?_piref73_1335505_73_1335500_1335500.next_page=/wc/cambioLegislatura")
    return [scrapy.FormRequest(url, formdata={'idLegislatura': '12'}, callback=self.parse)]
def parse(self, response):
    """Find every "more" topic link on the landing page and POST for its
    first page of articles.

    The topic endpoints take form-encoded POST data with fields
    ``inslider``, ``page``, ``pagesize``
    (Content-Type: application/x-www-form-urlencoded).
    """
    # Name the parser explicitly ("lxml", as used elsewhere in this file)
    # so BeautifulSoup does not warn / pick an arbitrary installed parser.
    soup = BeautifulSoup(response.body, "lxml")
    menu = soup.find_all("a", class_="ui-more")  # per-topic "more" links
    if menu:
        for topic in menu:
            topic_name = topic.text.replace(u"??", "")
            topic_url = topic.get("href")
            # Initialize per-topic pagination state.
            self.flag.setdefault(topic_url, 0)
            page = "1"
            post_data = {
                "inslider": "0",
                "page": page,
                "pagesize": "10"
            }
            yield scrapy.FormRequest(
                url=topic_url,
                formdata=post_data,
                callback=self.parse_topic,
                meta={"page": page, "topic_name": topic_name}
            )
def start_requests(self):
    """Seed the crawl with the ctcnn.com homepage."""
    seeds = [scrapy.Request("http://www.ctcnn.com/", callback=self.parse)]
    # A direct FormRequest(self.start_url, formdata={'page': '1'}, ...) was
    # disabled here (TODO: something wrong with it); parse() issues it instead.
    return seeds
# Parse the homepage (also queues the first "newest" listing page)
def parse(self, response):
    """Parse the ctcnn.com homepage: queue the first "newest" listing page,
    then follow each headline in the index-first list."""
    yield scrapy.FormRequest(self.start_url, formdata={'page': '1'}, callback=self.parse_newest)
    soup = BeautifulSoup(response.body, "lxml")
    container = soup.find(class_="index-first-list")
    # Guard: the original set index_list to None when the block was missing
    # and then iterated it, raising TypeError.
    index_list = container("li") if container else []
    for news in index_list:
        title = news.h2.a.string if news.h2.a else None
        abstract = news.p.string if news.p else None
        news_url = self.domain + news.a.get("href", None) if news.a else None
        item = NewsItem(title=title, abstract=abstract, news_url=news_url, catalogue=u"????")
        request = scrapy.Request(news_url, self.parse_news, dont_filter=True)
        request.meta["item"] = item
        yield request
# Parse one page of the "newest" news listing
def parse_newest(self, response):
    """Handle one page of the "newest" news listing.

    The page was fetched with a form POST whose body is ``page=N``; each
    <li> becomes a NewsItem that is followed, and while ``self.flag`` stays
    unset another POST for page N+1 is queued.
    """
    soup = BeautifulSoup(response.body,"lxml")
    # Recover the page number N from the request body ('page=N').
    # NOTE(review): on Python 3 ``response.request.body`` is bytes, so
    # ``split('=')`` would raise TypeError — presumably this ran under
    # Python 2; confirm before porting.
    page =response.request.body.split('=')[-1]
    li = soup.find_all('li')
    if li:
        for news in li :
            # The visible date text has a 2-char prefix; drop it before parsing.
            news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
            struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find(class_="title").string if news.find(class_="title") else None
            news_url = self.domain+news.find(class_="title").a.get("href",None) if news.find(class_="title") else None
            abstract = news.find(class_="info").string if news.find(class_="info") else None
            pic = self.domain+news.find('img').get('src',None) if news.find('img') else None
            topic = news.find(class_="type").string if news.find(class_="type") else None
            item = NewsItem(catalogue=u"????",
                            title=title,
                            news_url=news_url,
                            abstract=abstract,
                            pic=pic,
                            topic=topic,
                            news_date=news_date)
            # judge_news_crawl presumably returns the item while it is inside
            # the crawl window and a falsy value otherwise (defined elsewhere).
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,callback=self.parse_news,dont_filter=True)
                request.meta["item"] = item
                yield request
            else:
                # Out of the crawl window: remember where we stopped.
                self.flag=page
    else:
        logger.info("can't find news list")
    # Keep paginating until self.flag is set.
    if not self.flag:
        new_request = scrapy.FormRequest(self.start_url,formdata={'page':str(int(page)+1)},callback=self.parse_newest)
        yield new_request
def start_requests(self):
    """Log in to bbs.byr.cn (HTTPS) before crawling anything else."""
    login = scrapy.FormRequest("https://bbs.byr.cn/user/ajax_login.json",
                               formdata=LOGIN_FORMDATA,
                               meta={'cookiejar': 1},
                               headers=HEADERS,
                               callback=self.logged_in)
    return [login]
# NOTE: original comment was mojibake; it mentioned "hour" — presumably about an hourly crawl schedule (verify)
def start_requests(self):
    """Log in to bbs.byr.cn before crawling.

    Uses HTTPS like the sibling spider above: the original submitted
    LOGIN_FORMDATA (credentials) over plain http, exposing them in transit.
    """
    return [scrapy.FormRequest("https://bbs.byr.cn/user/ajax_login.json",
                               formdata=LOGIN_FORMDATA,
                               meta={'cookiejar': 1},
                               headers=HEADERS,
                               callback=self.logged_in)]