Example source code for the Python FormRequest() class
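
The snippets below are collected from real Scrapy projects and show typical uses of scrapy.FormRequest, a Request subclass that accepts a formdata dict and URL-encodes it into the request body (for POST, the default method when formdata is given) or into the query string (for GET). As a minimal sketch of the basic pattern, with a hypothetical URL and credentials:

import scrapy

class LoginExampleSpider(scrapy.Spider):
    name = 'formrequest_example'
    start_urls = ['https://example.com/login']  # hypothetical login page

    def parse(self, response):
        # formdata is URL-encoded and sent as the POST body
        yield scrapy.FormRequest(
            'https://example.com/login',
            formdata={'user': 'alice', 'password': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('login response status: %s', response.status)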

weixin_ergeng.py (project: multimedia_crawler, author: JFluo2011)
def parse_video_or_audio(self, response):
        item = response.meta['item']
        item['media_type'], result = self.__video_or_audio(response.body)
        item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
        self.logger.info('type: {}, result: {} url: {}'.format(item['media_type'], result, response.url))
        if item['media_type'] == 'video':
            url = 'https://v.qq.com/x/page/{}.html'.format(result)
            meta = {
                'item': item,
                'vid': result,
            }
            yield scrapy.FormRequest(url, method='GET', meta=meta, callback=self.parse_info)
        elif item['media_type'] == 'audio':
            item['media_urls'] = [result]
            t = urlparse(result).path.split('.')
            item['file_name'] += ('.' + t[1]) if ((len(t) >= 2) and t[1]) else '.mp3'
            yield item
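
The item is threaded to the next callback through response.meta here, and a FormRequest with method='GET' and no formdata behaves exactly like a plain scrapy.Request. Since Scrapy 1.7 the same hand-off can be written with cb_kwargs, which delivers the values as keyword arguments; a sketch under that assumption, with a hypothetical vid:

def parse_media(self, response):
    item = {'media_type': 'video'}  # stand-in for the real item
    yield scrapy.Request('https://v.qq.com/x/page/abc123.html',  # hypothetical vid
                         cb_kwargs={'item': item, 'vid': 'abc123'},
                         callback=self.parse_info)

def parse_info(self, response, item, vid):
    # item and vid arrive as keyword arguments instead of via response.meta
    self.logger.info('fetched info for vid %s', vid)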
youku.py (project: multimedia_crawler, author: JFluo2011)
def parse(self, response):
        user = response.meta['user']
        count = int(response.xpath('//h3[@node-type="hdTitle"]/following-sibling::span/text()'
                                   ).extract()[0][1:-1].replace(',', ''))

        params = {
            'spm': 'a2hzp.8253869.0.0',
            'order': '1',
            'last_item': '',
            # 'last_vid': re.search(r'last_vid=(\d+)', response.body),
        }
        page, current, num = 1, 0, 50
        while current < count:
            params['page'] = str(page)
            # params['last_pn'] = i
            yield scrapy.FormRequest(url=response.url.split('?')[0], method='GET', meta={'user': user},
                                     formdata=params, callback=self.parse_items)
            current = num * page
            page += 1
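
With method='GET', FormRequest URL-encodes formdata into the query string rather than the body, so each paging request above is equivalent to a plain Request with the parameters appended to the URL. A small sketch of that equivalence, with a hypothetical listing URL:

from urllib.parse import urlencode
import scrapy

params = {'order': '1', 'page': '1'}
base_url = 'https://i.youku.com/u/videos'  # hypothetical listing URL

# both requests resolve to .../videos?order=1&page=1
req_a = scrapy.FormRequest(base_url, method='GET', formdata=params)
req_b = scrapy.Request(base_url + '?' + urlencode(params))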
youku.py (project: multimedia_crawler, author: JFluo2011)
def parse_video_url(self, response):
        item = response.meta['item']
        # the '|$' alternative guarantees a match, so group(1) is None when no vid is present
        vid = re.search(r'id_(.*?)\.html|$', response.url).group(1)
        if vid is None:
            self.logger.error('url: {}, error: failed to find vid'.format(response.url))
            return
        params = {
            'vid': vid,
            'ccode': '0401',
            'client_ip': '192.168.1.1',
            'utid': 'tB2PEWHIKgECAbaWLjUeiFyE',
            'client_ts': str(round(time.time())),
        }
        url = 'https://ups.youku.com/ups/get.json'
        yield scrapy.FormRequest(url, method='GET', meta={'item': item}, formdata=params,
                                 callback=self.parse_download_url)
LoginSpider.py (project: scrapy-spider1, author: thuzhangjw)
def pass_valid(self, response):
        print("?????")
        i = Image.open(BytesIO(response.body))
        i.save("yz.png")
        validcode_value = input("?? yz.png,??????")

        data = {
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": response.meta['view_state'],
            "__EVENTVALIDATION": response.meta['event_validation'],
            "txt_ValidCode": validcode_value,
            "btnSubmit": "? ?"
        }
        func = self.parse_zz if response.meta['type'] == 'zz' else self.parse_bid
        yield scrapy.FormRequest(response.meta['last_url'], meta={"cookiejar": response.meta["cookiejar"]},
                                 formdata=data, callback=func, dont_filter=True)
Zhihu.py (project: zhihu_spider, author: pujinxiao)
def lohin_after_captcha(self, response):
        '''Log in after the captcha has been entered.'''
        with open("captcha.jpg","wb") as f:
            f.write(response.body)
            f.close()
        # from PIL import Image
        # try:
        #     im=Image.open('captcha.jpg')
        #     im.show()
        # except:
        #     pass
        captcha = input('Enter the captcha: ')
        post_data = response.meta.get('post_data', {})  # default to an empty dict if the login data is missing
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data['captcha'] = captcha
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]
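
Both Zhihu login snippets assemble the form dict by hand. When the login page itself has already been fetched, FormRequest.from_response can pre-fill the form's hidden fields (such as a CSRF token) automatically; a sketch under that assumption, with hypothetical field names and credentials:

def parse_login_page(self, response):
    # hidden <input> values from the page's form are copied into the request
    return scrapy.FormRequest.from_response(
        response,
        formdata={'phone_num': '13800000000', 'password': 'secret'},  # hypothetical
        callback=self.check_login,
    )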
chinaunicombidding_spider.py (project: scrapy_site, author: hl10502)
def parse_(self, response):
        detail = response.xpath('//table[@bordercolor="lightgray"]/tr')
        # the last row is skipped: it is not a data row
        for temp in detail[:-1]:
            item = SiteItem()
            item['title'] = temp.xpath('td/span/@title').extract_first().strip()
            if temp.xpath('td/span/@onclick').extract_first():
                item['link'] = 'http://www.chinaunicombidding.cn' + \
                               (temp.xpath('td/span/@onclick').extract_first()).split(',')[0].split(
                                   '(')[1][1:-1].strip()
            item['pubtime'] = temp.xpath('td[@width="15%"]/text()').extract_first().strip()
            yield item
        nowPage = str(int(response.xpath('//span[@id="nowPage"]/text()').extract_first()) + 1)
        print('nowpage======================================' + nowPage)
        if item['pubtime'] == date.get_curdate():
            yield scrapy.FormRequest(
                "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=" + nowPage,
                formdata={
                    "type": "",
                    "province": "",
                    "city": "",
                    "notice": "",
                    "time1": "",
                    "time2": ""
                }, callback=self.parse_)
zycggov_spider.py (project: scrapy_site, author: hl10502)
def parse(self, response):
        detail = response.xpath('//ul[@class="lby-list"]//li')
        pubtime = None
        for temp in detail[:20]:
            item = SiteItem()
            temp_pubtime = temp.xpath('span/text()').extract_first().strip()[1:11]
            if temp_pubtime:
                item['pubtime'] = temp_pubtime
                pubtime = item['pubtime']
            item['title'] = temp.xpath('a//text()').extract_first()
            print "------------------------------{}----".format(item['title'])
            if temp.xpath('a/@href').extract_first():
                item['link'] = "http://www.zycg.gov.cn" + temp.xpath('a/@href').extract_first()
            yield item
        # if the newest entry was published today, request the next page
        if pubtime == date.get_curdate():
            next_page_href = "http://www.zycg.gov.cn" + (
                str(response.xpath('//a[@class="next_page"]//@href').extract_first()))
            yield scrapy.FormRequest(next_page_href, callback=self.parse)
gdgpogov_spider.py (project: scrapy_site, author: hl10502)
def parse(self, response):
        detail = response.xpath('//ul[@class="m_m_c_list"]/li')
        for temp in detail:
            item = SiteItem()
            item['title'] = temp.xpath('a/text()').extract_first().strip()
            item['link'] = "http://www.gdgpo.gov.cn" + temp.xpath('a/@href').extract_first().strip()
            item['pubtime'] = temp.xpath('em/text()').extract_first().strip()[0:10]
            print("------------------------------------------------------------------------------")
            yield item
        if date.get_curdate() == item['pubtime']:
            pageindex = response.xpath('//input[@id="pointPageIndexId"]/@value').extract_first()
            self.iipage += 1
            # the garbled text literal should match the site's "last page" link label
            last_page = response.xpath(
                u'//a/span[contains(text(),"?  ?")]/../@href').extract_first()
            total_pagenum = last_page.split('(')[1][:-1]
            if int(self.iipage) < int(total_pagenum):
                yield scrapy.FormRequest("http://www.gdgpo.gov.cn/queryMoreInfoList.do",
                                         formdata={
                                             "sitewebId": "4028889705bebb510105bec068b00003",
                                             "channelCode": '0005',
                                             'pageIndex': str(self.iipage),
                                             'pageSize': "15",
                                             'pointPageIndexId': "1"
                                         }, callback=self.parse)
zhihu.py (project: ZhihuSpider, author: ShayChris)
def login_after_captcha(self, response):
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)

        from PIL import Image
        try:
            img = Image.open('captcha.jpg')
            img.show()
            img.close()
        except Exception:
            pass  # showing the image is best-effort; the file is still on disk

        captcha = input('Enter the captcha: ')
        post_data = response.meta.get('post_data', {})
        post_url = 'https://www.zhihu.com/login/phone_num'
        post_data['captcha'] = captcha
        return scrapy.FormRequest(post_url, formdata=post_data, headers=self.headers, callback=self.check_login)
song.py (project: cloudmusic_api, author: yqh231)
def parse_single_song(self, response):
        loader = response.meta['loader']
        selector = Selector(response)
        singer = selector.xpath('//title/text()').extract()
        loader.add_value('singer', singer)
        loader.add_value('_id', response.meta['song_id'])

        comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
        source_data, source_url = api_song_url(response.meta['song_id'])
        comment_id = generate_comment_index()['comment_index']
        loader.add_value('comment_id', comment_id)


        yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                                 formdata=comment_data, callback=self.parse_comments,
                                 meta={'comment_id': comment_id})

        yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                                 formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
playlist.py (project: cloudmusic_api, author: yqh231)
def parse_single_song(self, response):
        loader = response.meta['loader']
        selector = Selector(response)
        singer = selector.xpath('//title/text()').extract()
        loader.add_value('singer', singer)
        loader.add_value('_id', response.meta['song_id'])

        comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
        source_data, source_url = api_song_url(response.meta['song_id'])
        comment_id = generate_comment_index()['comment_index']
        loader.add_value('comment_id', comment_id)

        yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                                 formdata=comment_data, callback=self.parse_comments,
                                 meta={'comment_id': comment_id})

        yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                                 formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
qichacha.py (project: lichking, author: melonrun)
def generate_firm_susong(self, response):
        if len(response.body) < 10:
            return
        qitem = response.meta["item"]
        page_n = response.meta["page_n"]

        # the dict key is a Chinese label for the litigation summary (garbled here)
        self.append_susong_detail({"????": self.clean_content(response.body)}, qitem._id)

        anjian_list = response.xpath("//table[@class='m_changeList']//a[@class='c_a']/@onclick").extract()
        anjian_name = response.xpath("//table[@class='m_changeList']//tr//td[2]//a[@class='c_a']/text()").extract()
        for i in range(0, len(anjian_list)):
            yield scrapy.FormRequest(
                "http://www.qichacha.com/company_wenshuView",
                callback=self.generate_firm_anjian,
                cookies=self.qicha_cookie,
                method='POST',
                dont_filter="true",
                formdata={"id": self.generate_anjian_id(anjian_list[i])},
                meta={"item_id": qitem._id, "anjian_name": anjian_name[i]}
            )
        # request the next page of litigation records
        yield scrapy.Request(
            response.meta["chacha_url_pre"] + '&tab=susong&box=wenshu&p=' + str(page_n),
            encoding='utf-8',
            callback=self.generate_firm_susong,
            cookies=self.qicha_cookie,
            meta={"item": qitem, "chacha_url_pre": response.meta["chacha_url_pre"], "page_n": int(page_n)+1}
        )
initiatives.py (project: tipi-engine, author: CIECODE-Madrid)
def start_requests(self):
        return [scrapy.FormRequest("http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335505_73_1335500_1335500.next_page=/wc/cambioLegislatura",
                                   formdata = {'idLegislatura':'12'} , callback = self.parse)]
wshang_spider.py (project: NewsScrapy, author: yinzishao)
def parse(self, response):
        """

        :param response:
        :return:???????post??

                post???
                    inslider
                    page
                    pagesize
                Content-Type:application/x-www-form-urlencoded
        """
        soup = BeautifulSoup(response.body, "lxml")
        menu = soup.find_all("a", class_="ui-more")  # the "more" links for each topic channel
        if menu:
            for topic in menu:
                topic_name = topic.text.replace(u"??", "")  # strip the "more" suffix from the link text
                topic_url = topic.get("href")
                self.flag.setdefault(topic_url,0)
                page="1"
                # form parameters expected by the topic listing endpoint
                post_data = {
                    "inslider":"0",
                    "page":page,
                    "pagesize":"10"
                }
                # yield scrapy.Request(topic_url,
                #                      callback=self.parse_topic,
                #                      method="POST",
                #                      headers={"Content-Type":"application/x-www-form-urlencoded"},
                #                      body=json.dumps(post_data)
                #                      )
                yield scrapy.FormRequest(
                    url=topic_url,
                    formdata=post_data,
                    callback=self.parse_topic,
                    meta={"page":page,"topic_name":topic_name}
                )
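
The commented-out Request above would have sent a JSON body, which is not interchangeable with FormRequest's URL-encoded body: an endpoint expecting application/x-www-form-urlencoded will generally not parse JSON. A sketch of the two encodings, with a hypothetical endpoint:

import json
import scrapy

post_data = {'inslider': '0', 'page': '1', 'pagesize': '10'}
url = 'https://example.com/api/articles'  # hypothetical endpoint

# body: inslider=0&page=1&pagesize=10 (Content-Type header set automatically)
form_req = scrapy.FormRequest(url, formdata=post_data)

# body: {"inslider": "0", ...} with an explicit JSON Content-Type
json_req = scrapy.Request(url, method='POST',
                          headers={'Content-Type': 'application/json'},
                          body=json.dumps(post_data))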
ctcnn_spider.py (project: NewsScrapy, author: yinzishao)
def start_requests(self):
        return [
            scrapy.Request("http://www.ctcnn.com/",callback=self.parse),
            # scrapy.FormRequest(self.start_url,formdata={'page':'1'},callback=self.parse_newest),  #TODO something wrong

        ]
    # parse the home page
ctcnn_spider.py (project: NewsScrapy, author: yinzishao)
def parse(self,response):
        yield scrapy.FormRequest(self.start_url,formdata={'page':'1'},callback=self.parse_newest)
        soup = BeautifulSoup(response.body,"lxml")

        first_list = soup.find(class_="index-first-list")
        index_list = first_list("li") if first_list else []
        for news in index_list:
            title = news.h2.a.string if news.h2.a else None
            abstract = news.p.string if news.p else None
            news_url = self.domain+news.a.get("href",None) if news.a else None
            item = NewsItem(title=title,abstract=abstract,news_url=news_url,catalogue=u"????")
            request = scrapy.Request(news_url,self.parse_news,dont_filter=True)
            request.meta["item"] = item
            yield request

    # parse the newest-news list
ctcnn_spider.py (project: NewsScrapy, author: yinzishao)
def parse_newest(self, response):
        soup = BeautifulSoup(response.body,"lxml")
        page = response.request.body.decode().split('=')[-1]  # the request body is bytes under Python 3
        li = soup.find_all('li')
        if li:
            for news in li :
                news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
                struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                title = news.find(class_="title").string if news.find(class_="title") else None
                news_url = self.domain+news.find(class_="title").a.get("href",None) if news.find(class_="title") else None
                abstract = news.find(class_="info").string if news.find(class_="info") else None
                pic = self.domain+news.find('img').get('src',None) if news.find('img') else None
                topic = news.find(class_="type").string if news.find(class_="type") else None
                item = NewsItem(catalogue=u"????",
                                title=title,
                                news_url=news_url,
                                abstract=abstract,
                                pic=pic,
                                topic=topic,
                                news_date=news_date)
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url,callback=self.parse_news,dont_filter=True)
                    request.meta["item"] = item
                    yield request
                else:
                    self.flag = page
        else:
            logger.info("can't find news list")


        # request the next page unless the stop flag is set
        if not self.flag:
            new_request = scrapy.FormRequest(self.start_url,formdata={'page':str(int(page)+1)},callback=self.parse_newest)
            yield new_request
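
The pagination above keeps re-posting the form with page+1 until judge_news_crawl stops yielding items and self.flag is set. Condensed to its skeleton, with hypothetical extract_items and is_too_old helpers:

def parse_page(self, response):
    page = int(response.request.body.decode().split('=')[-1])
    for item in self.extract_items(response):  # hypothetical item extractor
        if self.is_too_old(item):              # hypothetical date-cutoff test
            self.flag = page                   # mark the crawl as finished
            break
        yield item
    if not self.flag:
        yield scrapy.FormRequest(self.start_url,
                                 formdata={'page': str(page + 1)},
                                 callback=self.parse_page)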
byrbbs_article_hour.py (project: byrbbs-py3, author: ryderchan)
def start_requests(self):
        return [scrapy.FormRequest("https://bbs.byr.cn/user/ajax_login.json",
                                   formdata=LOGIN_FORMDATA,
                                   meta={'cookiejar': 1},
                                   headers=HEADERS,
                                   callback=self.logged_in)]

    # the "hour" suffix marks this spider's hourly incremental crawl
byrbbs_section.py (project: byrbbs-py3, author: ryderchan)
def start_requests(self):
            return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json",
                                       formdata=LOGIN_FORMDATA,
                                       meta={'cookiejar': 1},
                                       headers=HEADERS,
                                       callback=self.logged_in)]
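
Both byrbbs spiders tag the login request with meta={'cookiejar': 1}: the cookies middleware keeps that session in a named jar, and any later request passing the same cookiejar value rides on the logged-in session. A sketch of the follow-up request, with a hypothetical board URL:

def logged_in(self, response):
    # reuse the jar created by the login request so the session cookies persist
    yield scrapy.Request('https://bbs.byr.cn/board/Python',  # hypothetical board page
                         meta={'cookiejar': response.meta['cookiejar']},
                         headers=HEADERS,
                         callback=self.parse_board)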

