def parse_detail(self, response):
content = response.css('#work span::text').extract()
    reg = r"^(http|https|ftp)://.*(\.com|\.cn|\.html|\.htm|\.asp|\.jsp)"
url = response.url
    reg_url_name = r".*?(\d+)"
get_url = re.match(reg_url_name, url)
if get_url:
self.get_name = get_url.group(1)
reference_url_list = []
for each_line in content:
get_reference_url = re.match(reg, each_line)
if get_reference_url:
reference_url_list.append(get_reference_url.group(0))
self.count = 0
if reference_url_list:
for each_url in reference_url_list:
yield Request(url=each_url, dont_filter=True, callback=self.parse_reference)
self.count += 1
else:
pass
Example source code for Python's Request() class
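
The snippets below all revolve around Scrapy's Request object. For orientation, here is a minimal self-contained spider sketch combining the constructor arguments they use (url, callback, meta, dont_filter); the spider name and URLs are placeholders, not taken from any snippet.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'request_example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # url is the only required argument; callback defaults to parse,
        # meta travels along with the request, dont_filter skips the dupefilter
        yield scrapy.Request(url='http://example.com/page/2',
                             callback=self.parse_detail,
                             meta={'page': 2},
                             dont_filter=True)

    def parse_detail(self, response):
        self.logger.info('visited %s (page %s)', response.url, response.meta['page'])
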
def post_get_playlist(self, response):
collection = self.db.playlist
result = json.loads(response.body, encoding='utf-8')['result']
    # inserted = collection.update({'id': result['id']}, result, upsert=True)  # upsert=True means insert-or-update
# logger.info('Update or Insert to playlist database[%s]' % (str(inserted),))
if result['id'] not in self.playlist_id_buffer:
collection.insert(result)
for song in result['tracks']:
artists = []
for detail in song['artists']:
artists.append(detail['name'])
comment_url = 'http://music.163.com/weapi/v1/resource/comments/%s/?csrf_token=' % (song['commentThreadId'],)
        # Use FormRequest to issue the POST request; an equivalent alternative:
# Request(url, method='POST', body=json.dumps(data))
yield FormRequest(comment_url, formdata=self.post_data, callback=self.parse,
meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})
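
The commented-out lines above sketch the same POST with a plain Request instead of FormRequest. Below is a hedged equivalent of the yield above written that way; it hand-encodes the payload the way FormRequest does (form-urlencoded), whereas the original comment shows a json.dumps body, which only works if the endpoint accepts JSON.

        # Same POST without FormRequest: encode self.post_data by hand and set
        # method/body/headers explicitly (FormRequest does this for you).
        from urllib.parse import urlencode  # Python 2: from urllib import urlencode
        yield Request(comment_url,
                      method='POST',
                      body=urlencode(self.post_data),
                      headers={'Content-Type': 'application/x-www-form-urlencoded'},
                      callback=self.parse,
                      meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})
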
def parse_list(self, response):
url = response.meta['splash']['args']['url']
pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')
    if pattern.match(url):
page = int(pattern.split(url)[1])
url = pattern.findall(url)[0]
page += 1
url = url + str(page)
else:
url = url + '/2'
    print('+++++++++++++++++++++++++ Next url: %s' % url)
req = SplashRequest(url = url, callback = self.parse_list)
yield req
pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
for item_url in pattern_detail.findall(response.body):
req = Request(url = item_url, callback = self.parse_item)
yield req
def default(self, o):
if isinstance(o, datetime.datetime):
return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
elif isinstance(o, datetime.date):
return o.strftime(self.DATE_FORMAT)
elif isinstance(o, datetime.time):
return o.strftime(self.TIME_FORMAT)
elif isinstance(o, decimal.Decimal):
return str(o)
elif isinstance(o, defer.Deferred):
return str(o)
elif isinstance(o, BaseItem):
return dict(o)
elif isinstance(o, Request):
return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
elif isinstance(o, Response):
return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
elif isinstance(o, Crawler):
return o.stats.get_stats()
else:
return super(ScrapyJSONEncoder, self).default(o)
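
Since default() above belongs to a json.JSONEncoder subclass (that is what the super() call implies), here is a small usage sketch; Scrapy ships an encoder with this behaviour as scrapy.utils.serialize.ScrapyJSONEncoder, which is assumed here.

import json

from scrapy.http import Request
from scrapy.utils.serialize import ScrapyJSONEncoder

# Requests and Responses are rendered as short descriptive strings
# instead of raising TypeError.
print(json.dumps({'req': Request('http://example.com')}, cls=ScrapyJSONEncoder))
# -> {"req": "<Request GET http://example.com>"}
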
def get_all_page(self, response):
    all_page = 0  # total number of pages
    current_page = 1  # current page number
body = str(response.body)
regex_str = ".*?PAGE.pager = ({.*?});.*"
pager = re.match(regex_str, body)
if pager:
pager_data = pager.group(1).replace('\\n', '').replace('\\r', '').replace(" ", "")
        regex_str = r'.*count:"(\d+)".*'
all_page = int(re.match(regex_str, pager_data).group(1))
print("all_page :" + str(all_page))
    # Hand every list-page URL over to Scrapy for crawling
while current_page <= all_page:
url = apiconstants.get_douyu_list_url(current_page)
print(url)
current_page = current_page + 1
yield Request(url=url, callback=self.parse)
print("????")
def get_torrent(self, response):
sel = Selector(response)
cl_title = sel.xpath('//td[@class="h"]/text()[2]').extract_first()
cl_bankuai = sel.xpath('//div[@class="t3"]/table/tr/td/b/a[2]/text()').extract_first()
cl_url = response.url
    torrent = re.search(r'rmdown\.com(.+?)</a>', response.body)
torrent_url = 'http://www.' + torrent.group()[:-4]
posted = sel.xpath('//div[@class="tipad"]/text()').extract()[1]
posted = posted.encode('utf-8')[9:-7]
yield Request(
url=torrent_url,
meta={
'cl_title': cl_title,
'cl_bankuai': cl_bankuai,
'cl_url': cl_url,
'posted': posted,
},
callback=self.parse_item,
dont_filter=True)
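
The meta dict above is how get_torrent hands its scraped fields to parse_item; below is a minimal sketch of the receiving callback under that assumption (the plain dict item is a stand-in for whatever Item class the original project defines).

def parse_item(self, response):
    # response.meta returns whatever get_torrent attached to the Request
    item = {
        'title': response.meta['cl_title'],
        'bankuai': response.meta['cl_bankuai'],
        'url': response.meta['cl_url'],
        'posted': response.meta['posted'],
        # response.body here is the rmdown.com page that hosts the torrent
    }
    yield item
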
def init_request(self):
"""This function is called before crawling starts."""
# Do not start a request on error,
# simply return nothing and quit scrapy
if self.abort:
return
logging.info('All set, start crawling with depth: ' + str(self.max_depth))
# Do a login
if self.config['login']['enabled']:
# Start with login first
logging.info('Login required')
return Request(url=self.login_url, callback=self.login)
else:
        # Start with the parse function
        logging.info('No login required')
return Request(url=self.base_url, callback=self.parse)
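
The login Request above targets a login callback that is not included here; a minimal sketch of what such a callback could look like with FormRequest.from_response follows. The config keys (user_field, user, pass_field, password) and the after_login name are assumptions, not part of the original spider.

def login(self, response):
    # Fill in the login form found on self.login_url; the field names are
    # read from the crawler config and are assumptions here.
    return FormRequest.from_response(
        response,
        formdata={
            self.config['login']['user_field']: self.config['login']['user'],
            self.config['login']['pass_field']: self.config['login']['password'],
        },
        callback=self.after_login)

def after_login(self, response):
    # Once logged in, start the normal crawl from the base URL.
    return Request(url=self.base_url, callback=self.parse, dont_filter=True)
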
#----------------------------------------------------------------------
def parse_followers(self, response):
nametoken = response.meta['nametoken']
api_followees_url = self.base_url + '/api/v4/members/' + response.url.split('/')[-2] + '/followees'
api_followers_url = self.base_url + '/api/v4/members/' + response.url.split('/')[-2] + '/followers'
yield scrapy.Request(url=api_followees_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER,
cookies=ZHIHU_COOKIE, meta={
'nametoken': nametoken
})
yield scrapy.Request(url=api_followers_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER,
cookies=ZHIHU_COOKIE, meta={
'nametoken': nametoken
})
# parse the JSON
def parse(self, response):
item = DoubanspiderItem()
selector = Selector(response)
Movies = selector.xpath('//div[@class="info"]')
for eachMovie in Movies:
title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
item['title'] = title
item['movieInfo'] = ';'.join(movieInfo)
item['star'] = star
item['quote'] = quote
        # yield the item
yield item
nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
if nextLink:
nextLink = nextLink[0]
print(nextLink)
        yield Request(self.url + nextLink, callback=self.parse)
def parse(self, response):
for i in range(10):
self.current += 1
if self.current >= self.rule.max_page:
break
yield Request(self.rule.url_fmt.format(self.current))
if response.status != 200:
return None
ip_list = response.xpath(self.rule.row_xpath)[1:]
for ip_item in ip_list:
l = ProxyItemLoader(item=ProxyItem(), selector=ip_item)
l.add_xpath('proxy', self.rule.host_xpath)
l.add_xpath('proxy', self.rule.port_xpath)
l.add_xpath('ip', self.rule.host_xpath)
l.add_xpath('port', self.rule.port_xpath)
l.add_xpath('addr', self.rule.addr_xpath)
l.add_xpath('mode', self.rule.mode_xpath)
l.add_xpath('protocol', self.rule.proto_xpath)
l.add_xpath('validation_time', self.rule.vt_xpath)
l.add_value('src_rule', self.rule.name)
yield l.load_item()
def parse_ph_key(self, response):
selector = Selector(response)
logging.debug('request url:------>' + response.url)
# logging.info(selector)
divs = selector.xpath('//div[@class="phimage"]')
for div in divs:
viewkey = re.findall('viewkey=(.*?)"', div.extract())
# logging.debug(viewkey)
yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
callback=self.parse_ph_info)
url_next = selector.xpath(
'//a[@class="orangeButton" and text()="Next "]/@href').extract()
logging.debug(url_next)
if url_next:
# if self.test:
logging.debug(' next page:---------->' + self.host + url_next[0])
yield Request(url=self.host + url_next[0],
callback=self.parse_ph_key)
# self.test = False
def parse_articles(self, response):
    article_ptn = r"http://www.theglobeandmail.com/opinion/(.*?)/article(\d+)/"
resp_url = response.url
article_m = re.match(article_ptn, resp_url)
article_id = ''
    if article_m is not None:
article_id = article_m.group(2)
if article_id == '32753320':
print('***URL***', resp_url)
soup = BeautifulSoup(response.text, 'html.parser')
text = Selector(text=response.text).xpath('//*[@id="content"]/div[1]/article/div/div[3]/div[2]').extract()
if text:
print("*****in Spider text*****", soup.title.string)
yield {article_id: {"title": soup.title.string, "link": resp_url, "article_text": text}}
comments_link = response.url + r'comments/'
if comments_link == 'http://www.theglobeandmail.com/opinion/a-fascists-win-americas-moral-loss/article32753320/comments/':
yield Request(comments_link, callback=self.parse_comments)
def parse_follows(self, response):
''' parse the follows '''
url = response.url
_id = url.split('=')[-1]
item = response.meta['item']
driver = response.meta['driver']
try:
driver.switch_to.default_content()
g_iframe = driver.find_elements_by_tag_name('iframe')[0]
driver.switch_to.frame(g_iframe)
lis = driver.find_elements_by_xpath('//*[@id="main-box"]/li')
follows = {}
for li in lis:
a = li.find_element_by_tag_name('a')
title = a.get_attribute('title')
href = a.get_attribute('href')
uid = href.split('=')[-1]
follows[uid] = title
item['follows'] = follows
except Exception as e:
item['follows'] = None
        print(e)
# driver.close()
request = Request(url='http://music.163.com/user/fans?id=' + _id, callback=self.parse_fans)
request.meta['item'] = copy.deepcopy(item)
yield request
# TODO
def parse_fans(self, response):
''' parse the follows '''
url = response.url
_id = url.split('=')[-1]
item = response.meta['item']
driver = response.meta['driver']
try:
driver.switch_to.default_content()
g_iframe = driver.find_elements_by_tag_name('iframe')[0]
driver.switch_to.frame(g_iframe)
lis = driver.find_elements_by_xpath('//*[@id="main-box"]/li')
fans = {}
for li in lis:
a = li.find_element_by_tag_name('a')
title = a.get_attribute('title')
href = a.get_attribute('href')
uid = href.split('=')[-1]
fans[uid] = title
item['fans'] = fans
except Exception as e:
item['fans'] = None
        print(e)
# driver.close()
request = Request(url='http://music.163.com/user/songs/rank?id=' + _id, callback=self.parse_songs_rank)
request.meta['item'] = copy.deepcopy(item)
yield request
def start_requests(self):
for u in self.start_urls:
        yield Request(u, callback=self.parse,
                      errback=self.errback)
def parse(self, response):
yield self.parse_item(response)
for a in response.css('a::attr(href)').extract():
if not a:
continue
next_url = response.urljoin(a)
        yield Request(next_url, callback=self.parse)
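
start_requests above wires an errback that is not shown; here is a minimal sketch of one that simply logs the failure (spiders get self.logger from Scrapy).

def errback(self, failure):
    # Invoked when a request errors out (DNS failure, timeout, and so on)
    # instead of reaching a parse callback.
    self.logger.error('Request failed: %s', repr(failure))
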
def pop(self, timeout=0):
"""Pop a request"""
if timeout > 0:
data = self.server.brpop(self.key, timeout=timeout)
if isinstance(data, tuple):
data = data[1]
else:
data = self.server.rpop(self.key)
if data:
cb, url = data.split('--', 1)
try:
cb = getattr(self.spider, str(cb))
return Request(url=url, callback=cb)
except AttributeError:
raise ValueError("Method %r not found in: %s" % (cb, self.spider))
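
pop() above assumes each queued entry is '<callback name>--<url>'; the producer side is not shown, so here is a sketch of a matching push under that assumption.

def push(self, request):
    """Queue a request as '<callback name>--<url>' so pop() can rebuild it."""
    cb_name = request.callback.__name__ if request.callback else 'parse'
    self.server.lpush(self.key, '%s--%s' % (cb_name, request.url))
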
def parse(self, response):
article_nodes = response.css('#block-content-article .mainer .item a.title')
for article_node in article_nodes:
article_url = urlparse.urljoin(response.url, str(article_node.css("::attr(href)").extract_first(
""))) # "http://www.acfun.cn" + str(article_node.css("::attr(href)").extract_first(""))
yield Request(url=article_url, callback=self.parse_detail, dont_filter=True)
next_nodes = response.css(".pager")
next_node = next_nodes[len(next_nodes) - 1]
next_url = str(next_node.css("::attr(href)").extract_first(""))
if next_url:
next_url = urlparse.urljoin(response.url, next_url)
yield Request(url=next_url, callback=self.parse, dont_filter=True)
def start_requests(self):
for i, url in enumerate(self.urls):
yield Request(
url = url,
headers = self.headers,
meta = self.meta,
dont_filter = True,
callback = self.parse_page,
errback = self.error_parse,
)