tools.py 文件源码-python代码片段

tools.py 文件源码

python

阅读 33 收藏 0 点赞 0 评论 0

项目：internet-content-detection 作者: liubo0621 项目源码文件源码

def get_html_by_urllib(url, code = 'utf-8', headers = {}, proxies = {}):
    html = None
    if not url.endswith('.exe') and not url.endswith('.EXE'):
        page = None
        is_timeout = False
        try:
            def timeout_handler(response):
                is_timeout = True
                if response:
                    response.close()

            if proxies:
                proxy_support = request.ProxyHandler(proxies)
                opener = request.build_opener(proxy_support)
                page = opener.open(quote(url,safe='/:?=&'), timeout = TIME_OUT)

            else:
                page = request.urlopen(quote(url,safe='/:?=&'), timeout = TIME_OUT)
            # ????? ???read???
            t = Timer(TIMER_TIME, timeout_handler, [page])
            t.start()
            # charset = chardet.detect(page.read())['encoding']
            html = page.read().decode(code,'ignore')
            t.cancel()

        except Exception as e:
            log.error(e)
        finally:
            # page and page.close()
            if page and not is_timeout:
                page.close()

    return html and len(html) < 1024 * 1024 and html or None