worker_filter.py 文件源码-python代码片段

worker_filter.py 文件源码

python

阅读 29 收藏 0 点赞 0 评论 0

项目：crawler_old 作者: salmonx 项目源码文件源码

def filter_encoding(self, seed, headers, content):
    """Decide whether a fetched page should be kept as Chinese content.

    The decision is made in three steps:

    1. If the charset reported by the response headers is GBK/GB2312, the
       page is accepted immediately.
    2. If the headers give no usable charset (missing, or something other
       than GBK/GB2312/UTF-8), the charsets declared inside the content
       are examined in order; the first GBK/GB2312 entry accepts the page,
       the first UTF-8 entry moves on to step 3.
    3. For UTF-8 pages the body itself is scanned, and the page is accepted
       only if it contains at least one character in U+4E00..U+9FA5.

    Args:
        seed: Identifier of the page being filtered (presumably its URL --
            confirm against the caller). Only used in the diagnostic
            message printed when decoding fails.
        headers: Response headers, passed to
            ``utils.get_encoding_from_headers``.
        content: Page body. Byte strings are decoded as UTF-8; text that is
            already decoded is scanned as-is.

    Returns:
        True if the page is considered Chinese, False otherwise (including
        when a UTF-8 body cannot be decoded).
    """
    gbk_names = ('gbk', 'gb2312')
    utf8_names = ('utf-8', 'utf8')

    is_utf8 = False
    header_code = utils.get_encoding_from_headers(headers)
    if header_code:
        header_code = header_code.lower()
        if header_code in gbk_names:
            return True
        # Any other header charset (e.g. 'ISO-8859-1') is not trusted and
        # falls through to content-based detection below.
        is_utf8 = header_code in utf8_names

    # Chinese sites may omit the charset header, so look at the charsets
    # declared in the content itself. Order matters: the first recognised
    # entry decides.
    if not is_utf8:
        for candidate in utils.get_encodings_from_content(content) or ():
            candidate = candidate.lower()
            if candidate in gbk_names:
                return True
            if candidate in utf8_names:
                is_utf8 = True
                break

    if not is_utf8:
        return False

    # UTF-8 says nothing about the language, so inspect the body.
    if not content:
        return False
    if isinstance(content, bytes):
        try:
            text = content.decode('utf8')
        except UnicodeError as e:
            # The original handler printed an undefined name `url`, which
            # raised NameError here; `seed` is the identifier in scope.
            print("%s %s" % (seed, e))
            return False
    else:
        text = content

    # Same code-point range as the original check for Chinese characters.
    return any(0x4e00 <= ord(ch) <= 0x9fa5 for ch in text)