def parse_data(self, response):
item = AfscrapyItem()
datas = json.loads(response.body.decode('utf-8'))
for data in datas['msg']:
item['goods_id'] = data['id']
item['shop_name'] = "??"
item['category_name'] = response.meta["cat"]
item['title'] = data['product_name']
item['sales_num'] = 0
item['unit'] = data['volume']
item['price'] = data['price']
item['location'] = ""
yield item
    # Only request the next page while the API keeps returning results;
    # otherwise this pagination never terminates.
    if datas['msg']:
        next_page = int(response.meta['page']) + 1
        yield FormRequest(response.url,
                          formdata={"class_id": response.meta["class_id"], "curr_page": str(next_page)},
                          callback=self.parse_data,
                          meta={"cat": response.meta["cat"],
                                "class_id": response.meta["class_id"], "page": next_page})
def login_verify(self, response):
if response.url == self.login_verify_url:
self.is_login = True
        # headers['Date'] is bytes in Scrapy; decode it, parse the GMT timestamp,
        # then shift it to UTC+8.
        self.login_time = time.mktime(time.strptime(
            response.headers['Date'].decode(),
            '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
time.sleep(1)
return [FormRequest(self.submit_url,
formdata = {
'problem_id': self.problem_id,
'language': LANGUAGE.get(self.language, '0'),
'source': self.source,
'submit': 'Submit',
'encoded': '1'
},
callback = self.after_submit,
dont_filter = True
)]
else:
return Request(self.start_urls[0], callback=self.parse_start_url)
def post_get_playlist(self, response):
collection = self.db.playlist
    result = json.loads(response.text)['result']  # json.loads() no longer takes an ``encoding`` argument; response.text decodes the body
    # inserted = collection.update({'id': result['id']}, result, upsert=True)  # upsert=True means insert-or-update
    # logger.info('Update or Insert to playlist database[%s]' % (str(inserted),))
if result['id'] not in self.playlist_id_buffer:
        collection.insert_one(result)  # insert_one() replaces pymongo's deprecated insert()
for song in result['tracks']:
artists = []
for detail in song['artists']:
artists.append(detail['name'])
comment_url = 'http://music.163.com/weapi/v1/resource/comments/%s/?csrf_token=' % (song['commentThreadId'],)
        # Use FormRequest for a form-encoded POST; to POST a JSON body instead, use
        # Request(url, method='POST', body=json.dumps(data))
yield FormRequest(comment_url, formdata=self.post_data, callback=self.parse,
meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})
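
# A minimal sketch of the JSON-body alternative mentioned in the comment above.
# The endpoint and payload here are placeholders, not values from the original
# spider:
import json
from scrapy import Request

def post_json(self, url, payload):
    # Request with an explicit method/body sends raw JSON, whereas FormRequest
    # always form-encodes its formdata.
    return Request(url, method='POST', body=json.dumps(payload),
                   headers={'Content-Type': 'application/json'},
                   callback=self.parse)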
def parse(self, response):
    # extract_first() returns a single string; extract() returns a list, which
    # would be submitted as a repeated form field.
    EVENTVALIDATION = response.xpath("//*[@id='__EVENTVALIDATION']/@value").extract_first()
    VIEWSTATE = response.xpath("//*[@id='__VIEWSTATE']/@value").extract_first()
for i in range(1, 5):
yield FormRequest(
'http://environmentclearance.nic.in/Search.aspx',
headers = {'user-agent': 'Mozilla/5.0'},
formdata = {
'ww': 'rr|GridView1',
'__LASTFOCUS': '',
'__EVENTTARGET': 'GridView1',
'__EVENTARGUMENT': 'Page${}'.format(i),
'__VIEWSTATE': VIEWSTATE,
'__EVENTVALIDATION': EVENTVALIDATION,
'a': 'rb1',
'dd1status': 'UPEChome',
'ddlyear': '-All Years-',
'ddlcategory': '-All Category-',
'ddlstate': '-All State-',
'textbox2': '',
'DropDownList1': 'UPEC'
},
callback = self.parse_item
)
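
# Hand-copying __VIEWSTATE and __EVENTVALIDATION works, but Scrapy's
# FormRequest.from_response() pre-fills every hidden <input> from the page.
# A sketch of the same pagination under that approach (same overrides as above):
def paginate(self, response):
    for i in range(1, 5):
        yield FormRequest.from_response(
            response,
            formdata={
                '__EVENTTARGET': 'GridView1',
                '__EVENTARGUMENT': 'Page${}'.format(i),
            },
            callback=self.parse_item,
        )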
def parse(self, response):
all_urls = response.xpath('//div[@class="tit_sort"]//dl')
if len(all_urls):
for url in all_urls:
category_name = url.xpath('./dt/a/text()').extract()[0]
next_urls = url.xpath('.//em//a/@href').extract()
for next_url in next_urls:
class_id = re.search("list-(\d+)-(\d+)-(\d+)", next_url)
c1 = class_id.group(1)
c2 = class_id.group(2)
c3 = class_id.group(3)
next_url = "http://www.benlai.com/NewCategory/GetLuceneProduct"
yield FormRequest(next_url, formdata={"c1": c1, "c2": c2, "c3": c3, "page": "1"},
callback=self.parse_data,
meta={"cat": category_name, "c1": c1, "c2": c2, "c3": c3, "page": "1"})
def parse_data(self, response):
item = AfscrapyItem()
datas = json.loads(response.body.decode('utf-8'))
for data in datas['ProductList']:
item['goods_id'] = data['ProductSysNo']
item['shop_name'] = "??"
item['category_name'] = response.meta["cat"]
item['title'] = data['ProductName']
item['sales_num'] = 0
item['unit'] = ""
item['price'] = data['ProductNowPrice']
item['location'] = ""
yield item
if len(datas['ProductList']):
next_page = int(response.meta["page"]) + 1
yield FormRequest(response.url,
formdata={"c1": response.meta['c1'], "c2": response.meta['c2'], "c3": response.meta['c3'],
"page": str(next_page)},
callback=self.parse_data,
meta={"cat": response.meta["cat"], "c1": response.meta['c1'], "c2": response.meta['c2'],
"c3": response.meta['c3'], "page": str(next_page)})
def start_login(self, response):
    xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
    return [FormRequest('https://www.zhihu.com/login/phone_num',
                        method='POST',
                        meta={'cookiejar': response.meta['cookiejar']},
                        formdata={
                            '_xsrf': xsrf,  # the CSRF token extracted above; omitting it leaves that extraction as dead code
                            'password': 'feifengwind',
                            'remember_me': "true",
                            'phone_num': '18983848805'},
                        callback=self.after_login
                        )]
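
# For login forms like this one, FormRequest.from_response() is usually simpler:
# it reads the <form> from the page and fills hidden fields such as _xsrf
# automatically. A sketch under that assumption:
def start_login(self, response):
    return [FormRequest.from_response(
        response,
        meta={'cookiejar': response.meta['cookiejar']},
        formdata={'phone_num': '18983848805',
                  'password': 'feifengwind',
                  'remember_me': 'true'},
        callback=self.after_login,
    )]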
def start_requests(self):
return [FormRequest(self.login_url,
formdata = {
'user_id1': self.username,
'password1': self.password,
'B1': 'login',
},
callback = self.after_login,
)]
def parse(self, response):
return [FormRequest("https://m.facebook.com/login.php",
formdata={
'email': self.email,
'pass': self.password
}, callback=self.parse_post_login)
]
def start_requests(self):
url = "http://per.spdb.com.cn/was5/web/search"
for i in range(1, 3):
formdata = {
"page": str(i),
"metadata": "finance_state|finance_no|finance_allname|finance_anticipate_rate|finance_limittime|finance_lmttime_info|finance_type|docpuburl|finance_ipo_enddate|finance_indi_ipominamnt|finance_indi_applminamnt",
"channelid": "266906",
"searchword": "(product_type=3)*finance_limittime = %*(finance_currency = 01)*(finance_state='???')"
}
yield FormRequest(url, callback=self.parse_model, formdata=formdata)
def start_requests(self):
self.logger.info('Login')
self.cookies['m-login'] = '0'
for one in self.start_urls:
yield FormRequest(one, cookies=self.cookies, formdata=self.frmdata, callback=self.parse, headers={
'Referer': 'https://www.quora.com/'}, dont_filter=True)
def start_requests(self):
for one in self.start_urls:
yield FormRequest(one, cookies=self.cookies, formdata=self.frmdata, callback=self.parse, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Referer': 'https://www.quora.com/'})
def start_requests(self):
for i, url in enumerate(self.start_urls):
yield FormRequest(url,
meta = {'cookiejar': i},
headers = self.headers,
cookies = self.cookies,
callback = self.parse,
                          dont_filter = True)  # jump to the login page
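# The 'cookiejar' meta key gives each start URL its own cookie session, so
# parallel logins do not leak cookies into one another.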
def login(self, response):
print('post_login')
    # FormRequest.from_response() is the helper Scrapy provides for submitting
    # forms; here the POST request is assembled by hand instead.
self.headers["X-Requested-With"] = "XMLHttpRequest"
self.headers["Referer"] = self.index_url
return [FormRequest(
url=self.login_url,
formdata=self.login_formdata,
headers=self.headers,
callback=self.check_login_status,
)]
def start_requests(self):
for i, url in enumerate(self.start_urls):
yield FormRequest(
url = url,
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Host': 'www.assetstore.unity3d.com',
'Referer': 'https://www.assetstore.unity3d.com/en/',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
'Firefox/50.0',
'X-Kharma-Version': '0',
'X-Requested-With': 'UnityAssetStore',
'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
},
method = 'POST',
formdata = {
'current_package_id': '',
'hardware_hash': '',
'language_code': 'en',
'license_hash': '',
},
meta = {
'download_timeout': 20,
'is_proxy': False,
},
callback = self.get_unity_version,
)
# Query the Unity Asset Store API for the current store version before
# requesting any package data.
def parse_item(self, response):
loader = ItemLoader(GaokaopaiZhiyeItem(), response)
loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'-([^-]+)\.html')  # ur'' literals are invalid in Python 3
loader.add_css('name', u'.modTitle>h1::text')
def parse_category():
for e in response.css(u'.catType>a'):
yield {
'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(r'-([^-]+)\.html'),
'name': e.css('::text').extract_first(),
}
loader.add_value('category', list(parse_category()))
loader.add_css('detail', u'.zhiyeShow')
item = loader.load_item()
return FormRequest(
url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
formdata={'code': item['code'][0]},
meta={'item': item},
dont_filter=True,
callback=self.parse_majors
)
def start_requests(self):
return [ FormRequest("http://spys.ru/en/free-proxy-list/",
formdata={'xpp': '3', 'xf1': '0', 'xf2' : '0'},
callback=self.parse) ]
# Helper function to process the abstraction
def login(self, response):
self.log('Logging in...')
try:
        # fill_login_form() (from the ``loginform`` library, or a fork of it)
        # locates the login <form> in the page body and fills in the credentials.
        full_args, args, url, method, params = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
validated_url = self.url_valid(url, response.url)
real_url = urlsplit(validated_url)
result_db.add_to_result(method.upper(), real_url.scheme + "://" + real_url.hostname + real_url.path,
list(dict(full_args).keys()))
yield FormRequest(validated_url,
method=method,
formdata=args,
callback=self.confirm_login,
dont_filter=True)
except Exception as e:
print(e)
self.log('Login failed')
for start_url in self.start_urls:
if (";" in start_url):
split_arr = start_url.split(';')
validated_url = split_arr[0]
yield Request(url=validated_url, dont_filter=True, callback=self.parse_res)
            # NOTE: time.sleep() blocks Scrapy's reactor; DOWNLOAD_DELAY is the
            # usual way to throttle requests.
            time.sleep(int(split_arr[1]))
else:
validated_url = start_url
yield Request(url=validated_url, dont_filter=True, callback=self.parse_res)
real_url = urlsplit(validated_url)
if len(real_url.query) > 0 and self.get_ext(real_url.path) not in self.not_allowed:
# only add to result if have parameters
param_dict = parse_qs(real_url.query, keep_blank_values=True)
result_db.add_to_result("GET", real_url.scheme + "://" + real_url.hostname + real_url.path, list(param_dict.keys()))
if self.ignore_params:
tag_url = real_url.scheme + "://" + real_url.hostname + real_url.path
else:
tag_url = validated_url
for param in self.ignore_fields:
if param in real_url.query:
tag_url = real_url.path
if tag_url not in self.urls_visited and self.get_ext(real_url.path) not in self.not_allowed:
self.urls_visited.append(tag_url)
def start_requests(self):
url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8"
requests = []
for i in range(1,60):
formdata = {"q":"",
"viewFlag":"A",
"sortType":"default",
"searchStyle":"",
"searchRegion":"city:",
"searchFansNum":"",
"currentPage":str(i),
"pageSize":"100"}
        request = FormRequest(url, callback=self.parse_model, formdata=formdata)
requests.append(request)
return requests
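
# start_requests() can equally be written as a generator so requests are
# scheduled as they are produced; an abridged sketch of the loop above:
def start_requests(self):
    url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8"
    for i in range(1, 60):
        yield FormRequest(url, callback=self.parse_model,
                          formdata={"currentPage": str(i), "pageSize": "100"})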
def start_requests(self):
url = "http://pluto.babyun.com.cn/pluto/api/user/signin"
return [FormRequest(url,meta={'cookiejar':1},
formdata = {
'password':'3aF9Ac3R4M76e',
'username':'admin',
'remember':'true',
},
callback = self.after_login,
)]
def start_requests(self):
# used for checking that a ticker isn't downloaded twice
self.requested_tickers = set()
for category in self.categories:
self.logger.info('POST request for category "' + category['name'] + '"')
# return a POST request for getting the index list in this category group
yield FormRequest(url="https://indexes.nasdaqomx.com/Index/DirectoryData",
formdata={'categoryID': str(category['id'])},
meta={'exchange': category['name']},
callback=self.parse_categories)
def parse_categories(self, response):
# unpack meta values
exchange = response.meta['exchange']
    # get a dict with the json data (in newer Scrapy, response.text replaces
    # the deprecated body_as_unicode())
    data = json.loads(response.body_as_unicode())
# for all instruments in the list
for instrument in data['aaData']:
ticker = instrument['Symbol']
name = instrument['Name']
paper_type = instrument['AssetType']
if ticker in self.requested_tickers:
self.logger.warning('Ticker "' + ticker + '" has already been requested. Skipping')
continue
# POST request for historical data for this ticker
self.logger.info('Sending POST request for ticker "' + ticker + '"')
yield FormRequest(url="https://indexes.nasdaqomx.com/Index/HistoryData",
formdata={
'id': ticker,
'startDate': '1950-09-03T00:00:00.000',
'endDate': '2050-09-03T00:00:00.000',
'timeOfDay': 'EOD'},
meta={'ticker': ticker,
'name': name,
'paper_type': paper_type,
'exchange': exchange},
callback=self.parse_historical_data)
# parse the POST response containing the ticker data
def parse(self, response):
parent_path = response.xpath('//section[@id="m-category"]')
for i in range(1, 9):
category_name = parent_path.xpath("./ul/li["+str(i)+"]/a/text()").extract()[0]
all_urls = parent_path.xpath(".//div/div["+str(i)+"]/ul/li/a/@href").extract()
for url in all_urls:
            class_id = re.search(r'\d+', url).group()  # raw string avoids invalid escape-sequence warnings
next_url = "http://m.fruitday.com/ajax/prolist/index"
yield FormRequest(next_url, formdata={"class_id": class_id, "curr_page": "0"},
callback=self.parse_data,
meta={"cat": category_name, "class_id": class_id, 'page': "0"})
def process_pagination_form(self, form, page=None, product_id=None):
action = form.xpath('@action').extract_first()
names = form.xpath('input/@name').extract()
values = form.xpath('input/@value').extract()
formdata = dict(zip(names, values))
meta = dict(prev_page=page, product_id=product_id)
return FormRequest(
url=action,
method='GET',
formdata=formdata,
callback=self.parse,
meta=meta
)
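
# With method='GET', FormRequest encodes its formdata into the URL query string
# rather than the request body, which is exactly what an HTML pagination form
# with method="get" expects.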
def parse_product(self, response):
# Circumvent age selection form.
if '/agecheck/app' in response.url:
logger.debug(f"Form-type age check triggered for {response.url}.")
form = response.css('#agegate_box form')
action = form.xpath('@action').extract_first()
name = form.xpath('input/@name').extract_first()
value = form.xpath('input/@value').extract_first()
formdata = {
name: value,
'ageDay': '1',
'ageMonth': '1',
'ageYear': '1955'
}
yield FormRequest(
url=action,
method='POST',
formdata=formdata,
callback=self.parse_product
)
else:
yield load_product(response)
def start_requests(self):
count = self.sql.get_proxy_count(self.name)
count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
ids = self.sql.get_proxy_ids(self.name)
ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
for i in range(0, count + count_httpbin):
table = self.name if (i < count) else config.httpbin_table
        proxy_id = ids[i] if i < count else ids_httpbin[i - len(ids)]  # renamed to avoid shadowing the builtin ``id``
        proxy = self.sql.get_proxy_with_id(table, proxy_id)
        if proxy is None:
            continue
for url in self.urls:
cur_time = time.time()
yield FormRequest(
url = url,
headers = self.headers,
method = 'POST',
meta = {
'cur_time': cur_time,
'download_timeout': self.timeout,
'proxy_info': proxy,
'table': table,
'id': proxy.get('id'),
'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
'vali_count': proxy.get('vali_count', 0),
},
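                # NOTE: these hard-coded Lagou session cookies were captured from
                # a live browser session; values like JSESSIONID expire quickly.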
cookies = {
'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
'_ga': 'GA1.2.40497390.1488937014',
'TG-TRACK-CODE': 'search_code',
'index_location_city': '%E5%8C%97%E4%BA%AC',
'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
},
formdata = {
'first': 'true',
'kd': 'ios',
'pn': '1',
},
dont_filter = True,
callback = self.success_parse,
errback = self.error_parse,
)
def start_requests(self):
for url in self.start_urls:
        yield FormRequest(url, headers=self.header, callback=self.parse_item)
def start_requests(self):
count = self.sql.get_proxy_count(self.name)
count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
ids = self.sql.get_proxy_ids(self.name)
ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
for i in range(0, count + count_httpbin):
table = self.name if (i < count) else config.httpbin_table
        proxy_id = ids[i] if i < count else ids_httpbin[i - len(ids)]  # renamed to avoid shadowing the builtin ``id``
        proxy = self.sql.get_proxy_with_id(table, proxy_id)
        if proxy is None:
            continue
for url in self.urls:
cur_time = time.time()
yield FormRequest(
url = url,
headers = self.headers,
method = 'POST',
meta = {
'cur_time': cur_time,
'download_timeout': self.timeout,
'proxy_info': proxy,
'table': table,
'id': proxy.id,
'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
'vali_count': proxy.vali_count,
},
cookies = {
'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
'_ga': 'GA1.2.40497390.1488937014',
'TG-TRACK-CODE': 'search_code',
'index_location_city': '%E5%8C%97%E4%BA%AC',
'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
},
formdata = {
'first': 'true',
'kd': 'ios',
'pn': '1',
},
dont_filter = True,
callback = self.success_parse,
errback = self.error_parse,
)
def getherproxy_req(self):
"""get proxy from gatherproxy.com"""
    # toggle between the non-blocking (Twisted) path and the blocking
    # (requests) path below
    block = True
    if not block:
# method1-nonblock
url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
settings = Settings()
@defer.inlineCallbacks
        def getpage(request, page):
            try:
                print("Request {}, page number: {}".format(request, page))
                response = yield HTTP11DownloadHandler(settings).download_request(request, spider=None)
                if response.status == 200:
                    self._get_proxy(response.body.decode(), country=self.country)
            except Exception as e:
                print(e)
                print("[!] Failed: request {} of page: {}".format(request, page))
        def iter_page():
            work = (
                getpage(FormRequest(url=url,
                                    headers=self.headers,
                                    formdata={'Type': 'elite', 'PageIdx': str(page), 'Uptime': '0'},
                                    meta={'download_timeout': 60}), page=page)
                for page in range(1, self.maxpage + 1)
            )
            coop = task.Cooperator()
            # DeferredList expects a real sequence (older Twisted calls len() on it)
            join = defer.DeferredList([coop.coiterate(work) for i in range(self.concurrent)])
            join.addBoth(lambda _: reactor.stop())
        iter_page()
        reactor.run()
    else:
        # method 2 - blocking, via requests
        url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
        for pagenum in range(1, self.maxpage + 1):
            try:
                data = {'Type': 'elite', 'PageIdx': str(pagenum), 'Uptime': '0'}
                headers = copy.copy(self.headers)
                r = requests.post(url, headers=headers, data=data)
            except Exception as e:
                print(str(e))
                print('[!] Failed: %s' % url)
                # give up with an empty list on the first failed page
                return []
            self._get_proxy(r.text, country=self.country)