def rank_checker(url, hatebu_url):
    """Return the Hatena Bookmark rank of *url* on the page *hatebu_url*.

    Fetches *hatebu_url*, looks for the anchor whose ``href`` equals *url*,
    and returns its ``data-entryrank`` attribute.  Returns ``None`` when the
    URL is not listed or the page could not be fetched.
    """
    # HTTPError/URLError live in urllib.error — the original referenced
    # request.HTTPError, which does not exist and would raise AttributeError
    # inside the except clause.
    import urllib.error
    try:
        html = request.urlopen(hatebu_url)
    except urllib.error.HTTPError as e:
        print(e.reason)
        return None  # without returning, `html` would be unbound below
    except urllib.error.URLError as e:
        print(e.reason)
        return None
    soup = BeautifulSoup(html, "lxml")
    a = soup.find("a", href=url)
    # PEP 8: compare to None with `is`, not `==`.
    return None if a is None else a.get("data-entryrank")
# NOTE: scraper artifact — the original heading here was garbled/non-English.
# Translated: "Python URLError() example source code."
def prepare(self, first_url):
    """Open ``self.logout_url`` to collect session cookies into ``self.my_cookie``.

    Returns a ``(code, None)`` tuple: ``(0, None)`` on success, ``-5`` for an
    HTTP error, ``-10`` for a network/URL error, ``-100`` for anything else.
    *first_url* is currently unused; it is kept for interface compatibility.
    """
    tmp_cookie = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(tmp_cookie))
    try:
        response = opener.open(self.logout_url, timeout=3)
    # HTTPError/URLError live in urllib.error; `urllib.HTTPError` does not
    # exist and would itself raise AttributeError in the except clause.
    except urllib.error.HTTPError as e:
        logging.warning("server process request error: err_code=%s", e.code)
        return -5, None
    except urllib.error.URLError as e:
        logging.warning("reach server error: reason=%s", e.reason)
        return -10, None
    except Exception as e:
        # Python 3 exceptions have no .message attribute; log the exception.
        logging.warning("other exception: msg=%s", e)
        return -100, None
    # Serialize every collected cookie into a Cookie request-header value.
    for item in tmp_cookie:
        self.my_cookie += item.name + "=" + item.value + ";"
    return 0, None
########## post data to request_url ##############
def get_category(url):
    """Return the Hatena Bookmark category name for *url*, or ``None`` on error."""
    # HTTPError/URLError live in urllib.error — request.HTTPError does not
    # exist and would raise AttributeError inside the except clause.
    import urllib.error
    try:
        html = request.urlopen("http://b.hatena.ne.jp/entry/{}".format(url))
        soup = BeautifulSoup(html, "lxml")
        return soup.find("html").get("data-category-name")
    except urllib.error.HTTPError as e:
        print(e.reason)
    except urllib.error.URLError as e:
        print(e.reason)
    return None  # explicit: errors fall through to "no category"
# NOTE: the original comment here was garbled in extraction; from context it
# described the top-page membership check implemented just below.
def is_hatenatop(url):
    """Return ``True`` if *url* is linked from the Hatena Blog top page."""
    # HTTPError/URLError live in urllib.error; `urllib.HTTPError` does not
    # exist in Python 3.
    import urllib.error
    try:
        html = request.urlopen("http://hatenablog.com/")
    except urllib.error.HTTPError as e:
        print(e.reason)
        return False  # without returning, `html` would be unbound below
    except urllib.error.URLError as e:
        print(e.reason)
        return False
    soup = BeautifulSoup(html, "lxml")
    a = soup.find("a", href=url)
    if a is None:
        return False
    return url == a.get("href")
def crawl_detail(self):
    """Fetch and parse the detail page for every IPO in ``self.ipo_list``.

    Each page is attempted up to 3 times; success/failure is reported via
    ``attr_report``.  Returns a negative ``(code, None)`` tuple immediately
    when a request fails at the urllib level, mirroring the sibling
    ``post_to_url``/``get_to_url`` error codes.
    """
    for ipo in self.ipo_list:
        attr_report(446076)
        url = self.detail_url % ipo["code"]
        req = urllib.request.Request(url=url)
        req.add_header('User-agent', user_agent)
        retry_cnt = 0
        while retry_cnt < 3:
            try:
                try:
                    resp = urllib.request.urlopen(req, timeout=3)
                # HTTPError/URLError live in urllib.error; the original
                # `urllib.HTTPError` raised AttributeError in the except
                # clause, which was then masked by the outer handler.
                except urllib.error.HTTPError as e:
                    logging.warning("server process request error: err_code=%s", e.code)
                    return -5, None
                except urllib.error.URLError as e:
                    logging.warning("reach server error: reason=%s", e.reason)
                    return -10, None
                except Exception as e:
                    # Python 3 exceptions have no .message attribute.
                    logging.warning("other exception: msg=%s", e)
                    return -100, None
                html_text = resp.read().decode("gbk")
                resp.close()
                if self._parse_detail(html_text, ipo):
                    break
                retry_cnt += 1  # parse failed: retry
            except Exception as e:
                # Decode/parse errors count as a retry, not a hard failure.
                retry_cnt += 1
                logging.warning('cn craw {0} detail ex:{1}, {2}'.format(ipo["code"], e, traceback.format_exc()))
        if retry_cnt >= 3:
            attr_report(441846)
            logging.info("cn craw {0} detail fail".format(ipo["code"]))
        else:
            attr_report(441845)
def post_to_url(self, request_url, post_data):
    """POST *post_data* (a mapping) to *request_url* with the session cookie.

    Returns ``(0, body_bytes)`` on success, ``(-5, None)`` on HTTP error,
    ``(-10, None)`` on network/URL error, ``(-100, None)`` on anything else.
    """
    post_encode = urlencode(post_data).encode()
    req = urllib.request.Request(
        url=request_url,
        data=post_encode
    )
    req.add_header('Cookie', self.my_cookie)
    try:
        resp = urllib.request.urlopen(req, timeout=3)
    except urllib.error.HTTPError as e:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("server process request error: err_code=%s", e.code)
        return -5, None
    except urllib.error.URLError as e:
        logging.warning("reach server error: reason=%s", e.reason)
        return -10, None
    except Exception as e:
        logging.warning("other exception: msg=%s", e)
        return -100, None
    htm = resp.read()
    resp.close()  # don't leak the connection
    return 0, htm
########## get data to request_url ###############
def get_to_url(self, request_url, get_data):
    """GET *request_url*, appending *get_data* as the query string when non-empty.

    Returns ``(0, body_bytes)`` on success, ``(-5, None)`` on HTTP error,
    ``(-10, None)`` on network/URL error, ``(-100, None)`` on anything else.
    """
    if get_data == "":
        tmp_url = request_url
    else:
        tmp_url = request_url + "?" + get_data
    # BUG FIX: the original passed request_url here, so get_data was
    # silently discarded; use the assembled tmp_url instead.
    req = urllib.request.Request(
        url=tmp_url,
    )
    req.add_header('Cookie', self.my_cookie)
    try:
        resp = urllib.request.urlopen(req, timeout=3)
    # HTTPError/URLError live in urllib.error; `urllib.HTTPError` does not
    # exist (consistent with the sibling post_to_url, which gets this right).
    except urllib.error.HTTPError as e:
        logging.warning("server process request error: err_code=%s", e.code)
        return -5, None
    except urllib.error.URLError as e:
        logging.warning("reach server error: reason=%s", e.reason)
        return -10, None
    except Exception as e:
        # Python 3 exceptions have no .message attribute.
        logging.warning("other exception: msg=%s", e)
        return -100, None
    htm = resp.read()
    resp.close()  # don't leak the connection
    return 0, htm
def _fetch_url(self):
    # NOTE(review): this body appears truncated by the extraction.
    # `r = urllib` binds the module object itself, so `r.read()` will raise
    # AttributeError (modules have no read()) — presumably a
    # urlopen(...) call was lost here; confirm against the original project.
    # `urllib.URLError` also does not exist in Python 3 (it is
    # urllib.error.URLError), so the except clause itself would raise
    # AttributeError rather than catch anything, and `xio` is not a
    # standard-library name — verify what it aliases upstream.
    try:
        r = urllib
        data = r.read()
        self.f = xio.StringIO(data)
        self._fetched = True
    except urllib.URLError:
        # Intended fallback: try the next configured server on fetch failure.
        return self.fetch_next_server()