Example source code for Python's UnicodeDammit() class
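
All of the snippets below lean on bs4's UnicodeDammit, which guesses the encoding of a byte string and exposes the decoded text. As a primer, a minimal sketch of the attributes the snippets use (the byte string is illustrative):

from bs4 import UnicodeDammit

raw = b"Sacr\xe9 bleu! \x93Fancy quotes\x94"  # windows-1252 bytes
dammit = UnicodeDammit(raw, ["utf-8", "windows-1252"], smart_quotes_to="ascii")
print(dammit.unicode_markup)     # decoded text; None if detection failed
print(dammit.original_encoding)  # e.g. 'windows-1252'
print(dammit.tried_encodings)    # (encoding, errors) pairs that were attempted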

rsc.py (project: ChemDataExtractor, author: mcs07)
from bs4 import UnicodeDammit
from lxml.html import Element, HTMLParser, fromstring
# BLOCK_ELEMENTS is a module-level constant in ChemDataExtractor listing block-level tags


def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating a selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
        raise ValueError('Failed to detect encoding, tried [%s]' % ', '.join(converted.tried_encodings))
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
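
A quick way to exercise the parser on a locally saved page (the filename is hypothetical, and the page is assumed to contain the usual div#wrapper):

with open('rsc_article.html', 'rb') as f:  # hypothetical saved copy of an RSC article
    root = parse_rsc_html(f.read())
print(root.xpath('count(//p[@class="otherpara"])'))  # orphan text is now wrapped
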
serializer.py (project: maoyan, author: Eastwu5788)
from bs4 import BeautifulSoup, UnicodeDammit


def __init__(self, data, encoding=None):
        """
        Initialize the serializer class
        :param data: original data
        :param encoding: encoding of the original data, if known
        """
        self.data = data

        if not self.data:
            raise ValueError("You must pass the original data to this class")

        # if no encoding is supplied, let UnicodeDammit (chardet under the hood) detect it
        self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
        self.encoding = None if self.encoding == "utf-8" else self.encoding

        # initialize beautiful soup
        # only_content_div = SoupStrainer("body")
        self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
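
For context, bs4 skips its own detection whenever from_encoding is given, which is why utf-8 is mapped to None above so BeautifulSoup falls back to its default handling. A small sketch of the difference (the byte string is illustrative):

from bs4 import BeautifulSoup

gbk_bytes = '猫眼电影'.encode('gbk')                  # illustrative GBK payload
auto = BeautifulSoup(gbk_bytes, features="lxml")      # bs4 detects the encoding itself
forced = BeautifulSoup(gbk_bytes, features="lxml", from_encoding="gbk")  # detection skipped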
clean.py (project: presswork, author: hangtwenty)
def simplify_quotes(text):
    """ Even though UnicodeDammit smart_quotes_to="ascii" takes care of many cases, some crap can still be left...

    In addition to the smart-quotes, on *output* we also want to catch the case of `` -> " and '' -> "
    (NLTK has some tokenizers that convert like that).

    So, this can be used in the input cleaners chain, AFTER UnicodeDammit; it can also be used from OutputProofreader.

        >>> text = b'Have some ``weird" “quotes” and curlies,”  won’t you please. Quotes are ‘fun’'.decode('utf8')
        >>> print simplify_quotes(text)
        Have some "weird" "quotes" and curlies,"  won't you please. Quotes are 'fun'
        >>> print simplify_quotes(unichr(8220) + u"foo" + unichr(8221) + unichr(8216) + u"bar" + unichr(8217))
        "foo"'bar'
        >>> text = b'``weird" “quotes” aren’t very ‘fun’ I don’t think'.decode('utf8')
        >>> print simplify_quotes(text)
        "weird" "quotes" aren't very 'fun' I don't think
    """
    return (text
            .replace(u"``", u'"')
            .replace(u"''", u'"')
            .replace(u'“', u'"')
            .replace(u'”', u'"')
            .replace(u'’', u"'")
            .replace(u'‘', u"'"))
LinkedinSpider.py (project: spiders, author: poodarchu)
import urllib

from bs4 import UnicodeDammit
from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
# HtmlParser here is the project's own helper for URL cleanup and profile extraction


def parse(self, response):
        """
        Default parse method; the crawl rule is not used for now.
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).unicode_markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).unicode_markup
                yield personProfile
getEncoding.py (project: simplestock, author: destinym)
from bs4 import UnicodeDammit
# get_encoding is a fallback detector defined elsewhere in this project


def decode_html(html_string):
    """Detect the encoding with bs4's UnicodeDammit and decode to unicode.

    Detection is not 100% reliable, so one known misdetection is special-cased below.
    """
    dammit = UnicodeDammit(html_string, ['GB2312', 'GBK', 'GB18030'], smart_quotes_to="html", is_html=True)
    doc = dammit.unicode_markup
    # print("dammit:", dammit.original_encoding)
    # FIXME: some pages are misdetected as 'ISO-8859-2'; re-detect and, as a stopgap, force utf-8
    if dammit.original_encoding == 'ISO-8859-2':
        enc = get_encoding(html_string)
        print(enc)
        enc = "utf-8"
        doc = html_string.decode(enc)
    elif not dammit.unicode_markup:
        # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
        raise ValueError("Failed to detect encoding, tried [%s]" % ', '.join(dammit.tried_encodings))
    # print(doc.encode('utf-8'))
    return doc
str_clean.py (project: lichking, author: melonrun)
from bs4 import UnicodeDammit


def clean_unicode(comment_str):
        # strip newlines, collapse runs of whitespace, then coerce to unicode
        comment_str = comment_str.replace('\n', '').replace('\r', '').strip()
        comment_str = ' '.join(comment_str.split())
        return UnicodeDammit(comment_str).unicode_markup
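
A doctest-style illustration of the normalization (assuming clean_unicode is exposed as a plain function or staticmethod):

>>> clean_unicode('  Great\r\n   product,   5/5  ')
'Great product, 5/5'
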
selector.py (project: ChemDataExtractor, author: mcs07)
def _get_encoding(cls, input_string, encoding):
        converted = UnicodeDammit(input_string, [encoding] if encoding else [])
        # Not worth raising exception? lxml will raise if parse fails.
        # if not converted.unicode_markup:
        #     raise UnicodeDecodeError('Failed to detect encoding')
        return converted.original_encoding
__init__.py (project: ChemDataExtractor, author: mcs07)
from bs4 import UnicodeDammit


def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    # guesses is already a list, so pass it through rather than nesting it in another list
    converted = UnicodeDammit(input_string, override_encodings=guesses if guesses else [], is_html=is_html)
    return converted.original_encoding
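
A doctest-style illustration; on short inputs chardet's guess can vary, so prioritizing the right encoding is what makes this deterministic (the sample text is illustrative):

>>> get_encoding('实例源码'.encode('gbk'), guesses=['gbk'])
'gbk'
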
utils.py (project: htmltab, author: flother)
import lxml.html
from bs4 import UnicodeDammit


def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode
    using Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or
    parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html",
                                 is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
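
Basic usage, sketched with an inline byte string standing in for the file contents:

doc = parse_html(b'<table><tr><td>1,234</td></tr></table>')
print(doc.xpath('//td/text()'))  # ['1,234']
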
clean.py (project: presswork, author: hangtwenty)
from bs4 import UnicodeDammit


def unicode_dammit(s, override_encodings=('utf-8', 'windows-1252', 'iso-8859-1', 'latin-1'), smart_quotes_to="ascii"):
    """ using bs4.UnicodeDammit, "coerce" text to unicode. replaces (some) 'smart quotes'. fixes (some) mixed encodings

    What's it do under the hood? The docs explain some, the source explains even more of course.
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit

        >>> with_smart_quotes = b"I just \x93love\x94 your word processor\x92s smart quotes"
        >>> assert unicode_dammit(with_smart_quotes) == 'I just "love" your word processor\\'s smart quotes'

    :param override_encodings: why these defaults - in short, they are commonly seen in input texts I've played with.
        whether they are mixed or not. someday-maybe this can be configured with better control if needed.
    """

    cleaned = UnicodeDammit(s, smart_quotes_to=smart_quotes_to, override_encodings=override_encodings).unicode_markup
    return cleaned
addon_old.py (project: service.subtitles.brokensubs, author: iamninja)
from bs4 import UnicodeDammit


def decode_html(html_string):
  converted = UnicodeDammit(html_string)
  if not converted.unicode_markup:
    # UnicodeDecodeError needs five arguments, so raise a ValueError with the same message
    raise ValueError(
      "Failed to detect encoding, tried [%s]" %
      ', '.join(converted.tried_encodings))
  # print converted.original_encoding
  return converted.unicode_markup
markdownhtml.py (project: transformer, author: zapier)
import bs4


def to_unicode_or_bust(self, obj, encoding='utf-8'):
        try:
            if isinstance(obj, basestring):
                if not isinstance(obj, unicode):
                    obj = unicode(obj, encoding)
            return obj
        except (UnicodeDecodeError, LookupError):
            # fall back to letting UnicodeDammit guess the encoding
            return bs4.UnicodeDammit(obj, is_html=False).unicode_markup
one.py (project: falsy, author: pingf)
import re

from bs4 import BeautifulSoup, UnicodeDammit
# get_title, get_links, get_links2, get_metas, get_images, get_scripts and get_text
# are soup helpers defined elsewhere in the falsy project


def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
chromeboy.py (project: falsy, author: pingf)
from bs4 import UnicodeDammit


def beautify(self, data, charset):
        # try the declared charset first, then common CJK encodings
        dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        data = dammit.unicode_markup
        return data
utils.py (project: idealoom, author: conversence)
import re

from bs4 import UnicodeDammit


def normalize_email_name(name):
    name = UnicodeDammit(name).unicode_markup
    # sanitize, keep only words, spaces and minimal punctuation
    # includes unicode apostrophes, though.
    name = re.sub(
        r"[^-\w\s'\u2019\u2032\u00b4\.\(\)]", '', name, 0, re.UNICODE)
    return name
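
A doctest-style illustration (the name is made up): angle brackets and stray symbols are dropped, while the unicode apostrophe survives:

>>> normalize_email_name("D’Arcy Wentworth <sender!>")
'D’Arcy Wentworth sender'
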
request.py (project: falsy, author: pingf)
import re
from io import BytesIO

import aiohttp
import pycurl
from bs4 import BeautifulSoup, UnicodeDammit
# CurlLoop, setup_curl_for_get, setup_curl_for_post, load and the get_* soup helpers
# are project-local to falsy


async def get_request(payload, share=None):  # must be async: the body awaits CurlLoop
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            charset = None
            if 'content-type' in headers:
                content_type = headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    charset = match.group(1)
                    print('Decoding using %s' % charset)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                charset = 'utf-8'
            else:
                if charset is None:
                    dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
                    data = dammit.unicode_markup
                    charset = dammit.original_encoding
                else:
                    data = body.decode(charset, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup_lxml = BeautifulSoup(data, 'lxml')
            soup_html = BeautifulSoup(data, 'html.parser')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': get_title(soup_lxml),
                'links': get_links(soup_lxml),
                'links2': get_links2(soup_lxml),
                'metas': get_metas(soup_lxml),
                'images': get_images(soup_lxml),
                'scripts': get_scripts(soup_lxml),
                'text': get_text(soup_html),
                'data': data,
                'headers': headers,
                'charset': charset,
                'spider': 'pycurl',
                'payload': payload,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close()
request.py (project: falsy, author: pingf)
async def post_request(payload, share=None):  # must be async: the body awaits CurlLoop
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            # encoding = None
            # if 'content-type' in headers:
            #     content_type = headers['content-type'].lower()
            #     match = re.search('charset=(\S+)', content_type)
            #     if match:
            #         encoding = match.group(1)
            #         print('Decoding using %s' % encoding)
            body = data_buf.getvalue()
            encoding = 'utf-8'
            data = body.decode(encoding, 'ignore') if len(body) > 0 else ''

            # if encoding is None:
            #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            #     data = dammit.unicode_markup
            #     encoding = dammit.original_encoding
            # else:
            #     data = body.decode(encoding, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]

            resp.update({
                # 'url': payload.get('url'),
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if isinstance(post_func, str):
                post_func = load(post_func)
            if post_func:
                resp = post_func(payload, resp)
            # post_func = payload.get('post_func')
            # if post_func:
            #     post_func = load(post_func)
            #     resp = post_func(payload, resp)
            return resp
    finally:
        c.close()

