Example source code for Python's UnicodeDammit() class
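
All of the snippets below lean on bs4's UnicodeDammit, which guesses the encoding of a byte string and exposes the decoded text. As a primer, a minimal sketch of the attributes the snippets use (the byte string is illustrative):

from bs4 import UnicodeDammit

raw = b"Sacr\xe9 bleu! \x93Fancy quotes\x94"  # windows-1252 bytes
dammit = UnicodeDammit(raw, ["utf-8", "windows-1252"], smart_quotes_to="ascii")
print(dammit.unicode_markup)     # decoded text; None if detection failed
print(dammit.original_encoding)  # e.g. 'windows-1252'
print(dammit.tried_encodings)    # (encoding, errors) pairs that were attempted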

rsc.py (project: ChemDataExtractor, author: mcs07)
from bs4 import UnicodeDammit
from lxml.html import Element, HTMLParser, fromstring
# BLOCK_ELEMENTS is a module-level constant in ChemDataExtractor listing block-level tags


def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating a selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
        raise ValueError('Failed to detect encoding, tried [%s]' % ', '.join(converted.tried_encodings))
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
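
A quick way to exercise the parser on a locally saved page (the filename is hypothetical, and the page is assumed to contain the usual div#wrapper):

with open('rsc_article.html', 'rb') as f:  # hypothetical saved copy of an RSC article
    root = parse_rsc_html(f.read())
print(root.xpath('count(//p[@class="otherpara"])'))  # orphan text is now wrapped
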
serializer.py (project: maoyan, author: Eastwu5788)
from bs4 import BeautifulSoup, UnicodeDammit


def __init__(self, data, encoding=None):
        """
        Initialize the serializer class
        :param data: original data
        :param encoding: encoding of the original data, if known
        """
        self.data = data

        if not self.data:
            raise ValueError("You must pass the original data to this class")

        # if no encoding is supplied, let UnicodeDammit (chardet under the hood) detect it
        self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
        self.encoding = None if self.encoding == "utf-8" else self.encoding

        # initialize beautiful soup
        # only_content_div = SoupStrainer("body")
        self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
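
For context, bs4 skips its own detection whenever from_encoding is given, which is why utf-8 is mapped to None above so BeautifulSoup falls back to its default handling. A small sketch of the difference (the byte string is illustrative):

from bs4 import BeautifulSoup

gbk_bytes = '猫眼电影'.encode('gbk')                  # illustrative GBK payload
auto = BeautifulSoup(gbk_bytes, features="lxml")      # bs4 detects the encoding itself
forced = BeautifulSoup(gbk_bytes, features="lxml", from_encoding="gbk")  # detection skipped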
clean.py (project: presswork, author: hangtwenty)
def simplify_quotes(text):
    """ Even though UnicodeDammit smart_quotes_to="ascii" takes care of many cases, some crap can still be left...

    In addition to the smart-quotes, on *output* we also want to catch the case of `` -> " and '' -> "
    (NLTK has some tokenizers that convert like that).

    So, this can be used in the input cleaners chain, AFTER UnicodeDammit; it can also be used from OutputProofreader.

        >>> text = b'Have some ``weird" “quotes” and curlies,”  won’t you please. Quotes are ‘fun’'.decode('utf8')
        >>> print simplify_quotes(text)
        Have some "weird" "quotes" and curlies,"  won't you please. Quotes are 'fun'
        >>> print simplify_quotes(unichr(8220) + u"foo" + unichr(8221) + unichr(8216) + u"bar" + unichr(8217))
        "foo"'bar'
        >>> text = b'``weird" “quotes” aren’t very ‘fun’ I don’t think'.decode('utf8')
        >>> print simplify_quotes(text)
        "weird" "quotes" aren't very 'fun' I don't think
    """
    return (text
            .replace(u"``", u'"')
            .replace(u"''", u'"')
            .replace(u'“', u'"')
            .replace(u'”', u'"')
            .replace(u'’', u"'")
            .replace(u'‘', u"'"))
LinkedinSpider.py (project: spiders, author: poodarchu)
import urllib

from bs4 import UnicodeDammit
from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
# HtmlParser here is the project's own helper for URL cleanup and profile extraction


def parse(self, response):
        """
        Default parse method; the crawl rule is not used for now.
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).unicode_markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).unicode_markup
                yield personProfile
getEncoding.py (project: simplestock, author: destinym)
from bs4 import UnicodeDammit
# get_encoding is a fallback detector defined elsewhere in this project


def decode_html(html_string):
    """Detect the encoding with bs4's UnicodeDammit and decode to unicode.

    Detection is not 100% reliable, so one known misdetection is special-cased below.
    """
    dammit = UnicodeDammit(html_string, ['GB2312', 'GBK', 'GB18030'], smart_quotes_to="html", is_html=True)
    doc = dammit.unicode_markup
    # print("dammit:", dammit.original_encoding)
    # FIXME: some pages are misdetected as 'ISO-8859-2'; re-detect and, as a stopgap, force utf-8
    if dammit.original_encoding == 'ISO-8859-2':
        enc = get_encoding(html_string)
        print(enc)
        enc = "utf-8"
        doc = html_string.decode(enc)
    elif not dammit.unicode_markup:
        # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
        raise ValueError("Failed to detect encoding, tried [%s]" % ', '.join(dammit.tried_encodings))
    # print(doc.encode('utf-8'))
    return doc
str_clean.py (project: lichking, author: melonrun)
from bs4 import UnicodeDammit


def clean_unicode(comment_str):
        # strip newlines, collapse runs of whitespace, then coerce to unicode
        comment_str = comment_str.replace('\n', '').replace('\r', '').strip()
        comment_str = ' '.join(comment_str.split())
        return UnicodeDammit(comment_str).unicode_markup
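
A doctest-style illustration of the normalization (assuming clean_unicode is exposed as a plain function or staticmethod):

>>> clean_unicode('  Great\r\n   product,   5/5  ')
'Great product, 5/5'
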
selector.py (project: ChemDataExtractor, author: mcs07)
def _get_encoding(cls, input_string, encoding):
        converted = UnicodeDammit(input_string, [encoding] if encoding else [])
        # Not worth raising exception? lxml will raise if parse fails.
        # if not converted.unicode_markup:
        #     raise UnicodeDecodeError('Failed to detect encoding')
        return converted.original_encoding
__init__.py (project: ChemDataExtractor, author: mcs07)
from bs4 import UnicodeDammit


def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    # guesses is already a list, so pass it through rather than nesting it in another list
    converted = UnicodeDammit(input_string, override_encodings=guesses if guesses else [], is_html=is_html)
    return converted.original_encoding
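
A doctest-style illustration; on short inputs chardet's guess can vary, so prioritizing the right encoding is what makes this deterministic (the sample text is illustrative):

>>> get_encoding('实例源码'.encode('gbk'), guesses=['gbk'])
'gbk'
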
utils.py (project: htmltab, author: flother)
import lxml.html
from bs4 import UnicodeDammit


def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode
    using Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or
    parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html",
                                 is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
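
Basic usage, sketched with an inline byte string standing in for the file contents:

doc = parse_html(b'<table><tr><td>1,234</td></tr></table>')
print(doc.xpath('//td/text()'))  # ['1,234']
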
clean.py (project: presswork, author: hangtwenty)
from bs4 import UnicodeDammit


def unicode_dammit(s, override_encodings=('utf-8', 'windows-1252', 'iso-8859-1', 'latin-1'), smart_quotes_to="ascii"):
    """ using bs4.UnicodeDammit, "coerce" text to unicode. replaces (some) 'smart quotes'. fixes (some) mixed encodings

    What's it do under the hood? The docs explain some, the source explains even more of course.
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit

        >>> with_smart_quotes = b"I just \x93love\x94 your word processor\x92s smart quotes"
        >>> assert unicode_dammit(with_smart_quotes) == 'I just "love" your word processor\\'s smart quotes'

    :param override_encodings: why these defaults - in short, they are commonly seen in input texts I've played with.
        whether they are mixed or not. someday-maybe this can be configured with better control if needed.
    """

    cleaned = UnicodeDammit(s, smart_quotes_to=smart_quotes_to, override_encodings=override_encodings).unicode_markup
    return cleaned
addon_old.py (project: service.subtitles.brokensubs, author: iamninja)
from bs4 import UnicodeDammit


def decode_html(html_string):
  converted = UnicodeDammit(html_string)
  if not converted.unicode_markup:
    # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
    raise ValueError(
      "Failed to detect encoding, tried [%s]" %
      ', '.join(converted.tried_encodings))
  # print converted.original_encoding
  return converted.unicode_markup
markdownhtml.py (project: transformer, author: zapier)
import bs4


def to_unicode_or_bust(self, obj, encoding='utf-8'):
        try:
            if isinstance(obj, basestring):
                if not isinstance(obj, unicode):
                    obj = unicode(obj, encoding)
            return obj
        except (UnicodeDecodeError, LookupError):
            # fall back to letting UnicodeDammit guess the encoding
            return bs4.UnicodeDammit(obj, is_html=False).unicode_markup
one.py (project: falsy, author: pingf)
import re

from bs4 import BeautifulSoup, UnicodeDammit
# get_title, get_links, get_links2, get_metas, get_images, get_scripts and get_text
# are soup helpers defined elsewhere in the falsy project


def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
chromeboy.py (project: falsy, author: pingf)
from bs4 import UnicodeDammit


def beautify(self, data, charset):
        # try the declared charset first, then common CJK encodings
        dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        data = dammit.unicode_markup
        return data
utils.py (project: idealoom, author: conversence)
import re

from bs4 import UnicodeDammit


def normalize_email_name(name):
    name = UnicodeDammit(name).unicode_markup
    # sanitize, keep only words, spaces and minimal punctuation
    # includes unicode apostrophes, though.
    name = re.sub(
        r"[^-\w\s'\u2019\u2032\u00b4\.\(\)]", '', name, 0, re.UNICODE)
    return name
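
A doctest-style illustration (the name is made up): angle brackets and stray symbols are dropped, while the unicode apostrophe survives:

>>> normalize_email_name("D’Arcy Wentworth <sender!>")
'D’Arcy Wentworth sender'
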
request.py (project: falsy, author: pingf)
import re
from io import BytesIO

import aiohttp
import pycurl
from bs4 import BeautifulSoup, UnicodeDammit
# CurlLoop, setup_curl_for_get, setup_curl_for_post, load and the get_* soup helpers
# are project-local to falsy


async def get_request(payload, share=None):  # must be async: the body awaits CurlLoop
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            charset = None
            if 'content-type' in headers:
                content_type = headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    charset = match.group(1)
                    print('Decoding using %s' % charset)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                charset = 'utf-8'
            else:
                if charset is None:
                    dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
                    data = dammit.unicode_markup
                    charset = dammit.original_encoding
                else:
                    data = body.decode(charset, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup_lxml = BeautifulSoup(data, 'lxml')
            soup_html = BeautifulSoup(data, 'html.parser')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': get_title(soup_lxml),
                'links': get_links(soup_lxml),
                'links2': get_links2(soup_lxml),
                'metas': get_metas(soup_lxml),
                'images': get_images(soup_lxml),
                'scripts': get_scripts(soup_lxml),
                'text': get_text(soup_html),
                'data': data,
                'headers': headers,
                'charset': charset,
                'spider': 'pycurl',
                'payload': payload,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close()
request.py (project: falsy, author: pingf)
async def post_request(payload, share=None):  # must be async: the body awaits CurlLoop
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            # encoding = None
            # if 'content-type' in headers:
            #     content_type = headers['content-type'].lower()
            #     match = re.search('charset=(\S+)', content_type)
            #     if match:
            #         encoding = match.group(1)
            #         print('Decoding using %s' % encoding)
            body = data_buf.getvalue()
            encoding = 'utf-8'
            data = body.decode(encoding, 'ignore') if len(body) > 0 else ''

            # if encoding is None:
            #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            #     data = dammit.unicode_markup
            #     encoding = dammit.original_encoding
            # else:
            #     data = body.decode(encoding, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]

            resp.update({
                # 'url': payload.get('url'),
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if isinstance(post_func, str):
                post_func = load(post_func)
            if post_func:
                resp = post_func(payload, resp)
            # post_func = payload.get('post_func')
            # if post_func:
            #     post_func = load(post_func)
            #     resp = post_func(payload, resp)
            return resp
    finally:
        c.close()

