def get_html_by_urllib(url, code='utf-8', headers=None, proxies=None):
    """Download *url* with urllib and return its decoded HTML body.

    Skips executable URLs (``.exe`` / ``.EXE``), arms a watchdog ``Timer``
    that closes the response if ``read()`` blocks longer than ``TIMER_TIME``,
    and rejects bodies of 1 MiB or more.

    :param url: page URL to fetch (quoted with ``safe='/:?=&'`` before use)
    :param code: charset used to decode the body (decode errors ignored)
    :param headers: unused here; kept for interface compatibility
    :param proxies: optional proxy mapping for ``request.ProxyHandler``
    :return: decoded HTML text, or ``None`` on error, timeout, empty or
             oversize (>= 1 MiB) body
    """
    html = None
    if not url.endswith(('.exe', '.EXE')):
        page = None
        is_timeout = False
        try:
            def timeout_handler(response):
                # BUG FIX: the original assignment created a new local, so
                # the enclosing scope never saw the timeout flag and the
                # finally clause could close an already-closed response.
                nonlocal is_timeout
                is_timeout = True
                if response:
                    response.close()

            safe_url = quote(url, safe='/:?=&')
            if proxies:
                opener = request.build_opener(request.ProxyHandler(proxies))
                page = opener.open(safe_url, timeout=TIME_OUT)
            else:
                page = request.urlopen(safe_url, timeout=TIME_OUT)

            # Watchdog: if read() hangs past TIMER_TIME seconds, close the
            # response so the blocked read aborts.  (Original comment was
            # mojibake — presumably "guard the read against hanging".)
            t = Timer(TIMER_TIME, timeout_handler, [page])
            t.start()
            html = page.read().decode(code, 'ignore')
            t.cancel()
        except Exception as e:
            log.error(e)
        finally:
            # Only close here when the watchdog did not already close it.
            if page and not is_timeout:
                page.close()
    # Clear ternary instead of the fragile `cond and a or b` idiom;
    # empty bodies and bodies >= 1 MiB both yield None, as before.
    return html if html and len(html) < 1024 * 1024 else None
# Scraped-page residue, not code: "评论列表" (comment list), "文章目录" (article TOC)