import requests


def get_random_dois(n):
    """Fetch ``n`` random DOIs published since 2006 from the Crossref REST API."""
    url = u"http://api.crossref.org/works?filter=from-pub-date:2006-01-01&sample={}".format(n)
    r = requests.get(url)
    items = r.json()["message"]["items"]
    dois = [item["DOI"] for item in items]
    print(dois)
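# A minimal usage sketch, assuming network access to api.crossref.org;
# note that Crossref caps the `sample` filter at 100 works per request.
get_random_dois(5)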
# from https://github.com/elastic/elasticsearch-py/issues/374
# to work around unicode problem
# import elasticsearch
# class JSONSerializerPython2(elasticsearch.serializer.JSONSerializer):
#     """Override elasticsearch library serializer to ensure it encodes utf characters during json dump.
#     See original at: https://github.com/elastic/elasticsearch-py/blob/master/elasticsearch/serializer.py#L42
#     A description of how ensure_ascii encodes unicode characters to ensure they can be sent across the wire
#     as ascii can be found here: https://docs.python.org/2/library/json.html#basic-usage
#     """
#     def dumps(self, data):
#         # don't serialize strings
#         if isinstance(data, elasticsearch.compat.string_types):
#             return data
#         try:
#             return json.dumps(data, default=self.default, ensure_ascii=True)
#         except (ValueError, TypeError) as e:
#             raise elasticsearch.exceptions.SerializationError(data, e)
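# A minimal sketch of how the workaround above would be wired in if it were
# re-enabled: elasticsearch-py accepts a `serializer` argument on the client
# constructor. The host value is a placeholder.
# es = elasticsearch.Elasticsearch(
#     hosts=["localhost:9200"],
#     serializer=JSONSerializerPython2(),
# )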
from lxml import etree, html


def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print(u"not parsing, because etree error in get_tree: {}".format(e))
        tree = None
    return tree
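# A quick illustration (hypothetical markup) of what the "&nbsp;" replacement
# guards against: a non-breaking space inside a text node keeps an XPath
# starts-with() test from matching the plain-space version.
tree = get_tree("<p>Free&nbsp;to read</p>")
if tree is not None:
    matches = tree.xpath("//p[starts-with(text(), 'Free to')]")
    print(len(matches))  # 1 with the replacement in place; 0 without it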
import os

from elasticsearch_dsl import Q, Search


def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit
    # filter clauses on the parent "domain" document
    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)
    limit = max_result_limit if context["more"] else result_limit  # note: not applied to the [start:end] slice below
    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    if context['phrase']:
        query = Search().filter(has_parent_query).query(Q("match_phrase", body_stripped=context['search']))
    else:
        query = Search().filter(has_parent_query).query(Q("match", body_stripped=context['search']))
    query = query.highlight_options(order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at', 'visited_at']).params(request_cache=True)
    if context["sort"] == "onion":
        query = query.sort("_parent")
    elif context["sort"] == "visited_at":
        query = query.sort("-visited_at")
    elif context["sort"] == "created_at":
        query = query.sort("-created_at")
    elif context["sort"] == "last_seen":
        # alias for the visited_at ordering
        query = query.sort("-visited_at")
    return query.execute()
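# A usage sketch under stated assumptions: RESULT_LIMIT and MAX_RESULT_LIMIT
# are set, and elasticsearch_dsl's default connection points at an index with
# the parent/child domain->page mapping this query expects. The host and the
# context values below are placeholders.
from elasticsearch_dsl.connections import connections

os.environ.setdefault('RESULT_LIMIT', '20')
os.environ.setdefault('MAX_RESULT_LIMIT', '100')
connections.create_connection(hosts=['localhost:9200'])
context = {
    "is_up": True, "show_fh_default": False, "show_subdomains": False,
    "rep": "", "more": False, "phrase": False,
    "search": "example query", "sort": "visited_at",
}
response = elasticsearch_pages(context, context["sort"], page=1)
for hit in response:
    print(hit.title)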
import json
import pickle


def main():
    # load files
    # TODO: json loading is different every time, use object_pairs_hook?
    # https://docs.python.org/3/library/json.html#json.load
    with open('../slack-data/users.json', 'r', encoding='utf-8') as users_json:
        users = json.load(users_json)
    with open('../slack-data/channels.json', 'r', encoding='utf-8') as channels_json:
        channels = json.load(channels_json)
    with open('../slack-data/privateChannels.json', 'r', encoding='utf-8') as private_channels_json:
        private_channels = json.load(private_channels_json)
    # merge channels with private channels
    channels = channels + private_channels
    # reshape the message collection from per-channel to per-user
    # (flatten_messages and the helpers below are defined elsewhere in this module)
    users_messages = flatten_messages(channels)
    # drop users with too few messages, since over-sampling them can lead to overfitting
    users_messages = discard_insufficient_data_users(users_messages, users)
    # stem words in messages
    users_messages = stem_messages(users_messages)
    # give every remaining user an equal number of messages
    users_messages = balance_messages(users_messages)
    messages_output = []
    authors_output = []
    for user_id, messages in users_messages.items():
        for message in messages:
            authors_output.append(user_index_by_id(user_id, users))
            messages_output.append(message)
    with open('messages.pkl', 'wb') as f:
        pickle.dump(messages_output, f)
    with open('authors.pkl', 'wb') as f:
        pickle.dump(authors_output, f)
    print('Saved a total of ' + str(len(messages_output)) + ' processed messages')
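# A short companion sketch: reload the artifacts written by main(). The two
# lists are parallel, so authors[i] is the label for messages[i].
with open('messages.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('authors.pkl', 'rb') as f:
    authors = pickle.load(f)
assert len(messages) == len(authors)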
import json
import os


def json_config(jfile, jobj_hook=None, jwrite_obj=None, jappend=None):
    """
    Simple interface to json library functions. Reads JSON data into an
    object dictionary or appends json data to an existing file.
    See the json library documentation for more info:
    `json <https://docs.python.org/3/library/json.html>`_

    Parameters
    ----------
    jfile : str
        json file path.
    jobj_hook : function (default: None)
        Decoder. If None, decodes to dict.
    jwrite_obj : obj (default: None)
        Obj to write to a new json file ``jfile``.
        Evaluated before ``jappend``.
    jappend : obj (default: None)
        New data to append to existing json file ``jfile``.
    """
    if jwrite_obj is not None:
        # Write `jwrite_obj` only if the file does not already exist.
        if not any([os.path.isfile(jfile),
                    os.path.isfile(os.path.abspath(jfile))]):
            print('writing `jwrite_obj` to new json `jfile`.')
            with open(jfile, 'w') as f:
                json.dump(jwrite_obj, f, sort_keys=True, ensure_ascii=False)
        else:
            print('json file already exists in path provided; not overwriting.')
        return
    if jappend is not None:
        with open(jfile, 'r+') as f:
            json_dict = json.load(f, object_hook=None)
            json_dict.update(jappend)
            f.seek(0)
            f.truncate()  # todo: Improve to only truncate if needed.
            json.dump(json_dict, f, sort_keys=True, indent=4)
        return
    with open(jfile) as f:
        if jobj_hook is not None:
            return json.load(f, object_hook=jobj_hook)
        return json.load(f)
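# A short sketch exercising the three modes of json_config; the file name is
# a placeholder.
json_config('settings.json', jwrite_obj={'debug': False})  # create a new file
json_config('settings.json', jappend={'verbose': True})    # merge in new keys
cfg = json_config('settings.json')                         # read back as a dict
print(cfg)  # {'debug': False, 'verbose': True}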
# Note: this is an instance method (it uses ``self``); ``json`` and
# ``constants`` come from the enclosing module.
def to_cjson(self, buf=None, **kwargs):
    """Write a cjson file or return a dictionary.

    The cjson format is specified
    `here <https://github.com/OpenChemistry/chemicaljson>`_.

    Args:
        buf (str): If it is a filepath, the data is written to
            filepath. If it is None, a dictionary with the cjson
            information is returned.
        kwargs: The keyword arguments are passed into the
            ``dump`` function of the
            `json library <https://docs.python.org/3/library/json.html>`_.

    Returns:
        dict:
    """
    cjson_dict = {'chemical json': 0}
    atomic_number = constants.elements['atomic_number'].to_dict()
    cjson_dict['atoms'] = {'elements': {}}
    cjson_dict['atoms']['elements']['number'] = [
        int(atomic_number[x]) for x in self['atom']]
    cjson_dict['atoms']['coords'] = {}
    coords = self.loc[:, ['x', 'y', 'z']].values.reshape(len(self) * 3)
    cjson_dict['atoms']['coords']['3d'] = [float(x) for x in coords]
    bonds = []
    bond_dict = self.get_bonds()
    for i in bond_dict:
        for b in bond_dict[i]:
            bonds += [int(i), int(b)]
            bond_dict[b].remove(i)  # drop the reverse edge so each bond is listed once
    cjson_dict['bonds'] = {'connections': {}}
    cjson_dict['bonds']['connections']['index'] = bonds
    if buf is not None:
        with open(buf, mode='w') as f:
            f.write(json.dumps(cjson_dict, **kwargs))
    else:
        return cjson_dict
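# A hedged usage sketch: `molecule` stands in for an instance of the class
# that defines to_cjson (constructed elsewhere); keyword arguments are
# forwarded to json.dumps when writing to disk.
cjson = molecule.to_cjson()
print(cjson['atoms']['elements']['number'][:5])  # first few atomic numbers
molecule.to_cjson(buf='molecule.cjson', indent=2)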