import requests


def get_random_dois(n):
    """Fetch ``n`` random DOIs published since 2006 from the Crossref REST API."""
    url = u"http://api.crossref.org/works?filter=from-pub-date:2006-01-01&sample={}".format(n)
    r = requests.get(url)
    items = r.json()["message"]["items"]
    dois = [item["DOI"] for item in items]
    print(dois)
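# A minimal usage sketch, assuming network access to api.crossref.org;
# note that Crossref caps the `sample` filter at 100 works per request.
get_random_dois(5)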
# from https://github.com/elastic/elasticsearch-py/issues/374
# to work around unicode problem
# import elasticsearch
# class JSONSerializerPython2(elasticsearch.serializer.JSONSerializer):
#     """Override elasticsearch library serializer to ensure it encodes utf characters during json dump.
#     See original at: https://github.com/elastic/elasticsearch-py/blob/master/elasticsearch/serializer.py#L42
#     A description of how ensure_ascii encodes unicode characters to ensure they can be sent across the wire
#     as ascii can be found here: https://docs.python.org/2/library/json.html#basic-usage
#     """
#     def dumps(self, data):
#         # don't serialize strings
#         if isinstance(data, elasticsearch.compat.string_types):
#             return data
#         try:
#             return json.dumps(data, default=self.default, ensure_ascii=True)
#         except (ValueError, TypeError) as e:
#             raise elasticsearch.exceptions.SerializationError(data, e)
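# A minimal sketch of how the workaround above would be wired in if it were
# re-enabled: elasticsearch-py accepts a `serializer` argument on the client
# constructor. The host value is a placeholder.
# es = elasticsearch.Elasticsearch(
#     hosts=["localhost:9200"],
#     serializer=JSONSerializerPython2(),
# )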
from lxml import etree, html


def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print(u"not parsing, because etree error in get_tree: {}".format(e))
        tree = None
    return tree
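# A quick illustration (hypothetical markup) of what the "&nbsp;" replacement
# guards against: a non-breaking space inside a text node keeps an XPath
# starts-with() test from matching the plain-space version.
tree = get_tree("<p>Free&nbsp;to read</p>")
if tree is not None:
    matches = tree.xpath("//p[starts-with(text(), 'Free to')]")
    print(len(matches))  # 1 with the replacement in place; 0 without it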
import os

from elasticsearch_dsl import Q, Search


def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit
    # filter clauses on the parent "domain" document
    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)
    limit = max_result_limit if context["more"] else result_limit  # note: not applied to the [start:end] slice below
    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    if context['phrase']:
        query = Search().filter(has_parent_query).query(Q("match_phrase", body_stripped=context['search']))
    else:
        query = Search().filter(has_parent_query).query(Q("match", body_stripped=context['search']))
    query = query.highlight_options(order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at', 'visited_at']).params(request_cache=True)
    if context["sort"] == "onion":
        query = query.sort("_parent")
    elif context["sort"] == "visited_at":
        query = query.sort("-visited_at")
    elif context["sort"] == "created_at":
        query = query.sort("-created_at")
    elif context["sort"] == "last_seen":
        # alias for the visited_at ordering
        query = query.sort("-visited_at")
    return query.execute()
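# A usage sketch under stated assumptions: RESULT_LIMIT and MAX_RESULT_LIMIT
# are set, and elasticsearch_dsl's default connection points at an index with
# the parent/child domain->page mapping this query expects. The host and the
# context values below are placeholders.
from elasticsearch_dsl.connections import connections

os.environ.setdefault('RESULT_LIMIT', '20')
os.environ.setdefault('MAX_RESULT_LIMIT', '100')
connections.create_connection(hosts=['localhost:9200'])
context = {
    "is_up": True, "show_fh_default": False, "show_subdomains": False,
    "rep": "", "more": False, "phrase": False,
    "search": "example query", "sort": "visited_at",
}
response = elasticsearch_pages(context, context["sort"], page=1)
for hit in response:
    print(hit.title)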
import json
import pickle


def main():
    # load files
    # TODO: json loading is different every time, use object_pairs_hook?
    # https://docs.python.org/3/library/json.html#json.load
    with open('../slack-data/users.json', 'r', encoding='utf-8') as users_json:
        users = json.load(users_json)
    with open('../slack-data/channels.json', 'r', encoding='utf-8') as channels_json:
        channels = json.load(channels_json)
    with open('../slack-data/privateChannels.json', 'r', encoding='utf-8') as private_channels_json:
        private_channels = json.load(private_channels_json)
    # merge channels with private channels
    channels = channels + private_channels
    # reshape the message collection from per-channel to per-user
    # (flatten_messages and the helpers below are defined elsewhere in this module)
    users_messages = flatten_messages(channels)
    # drop users with too few messages, since over-sampling them can lead to overfitting
    users_messages = discard_insufficient_data_users(users_messages, users)
    # stem words in messages
    users_messages = stem_messages(users_messages)
    # give every remaining user an equal number of messages
    users_messages = balance_messages(users_messages)
    messages_output = []
    authors_output = []
    for user_id, messages in users_messages.items():
        for message in messages:
            authors_output.append(user_index_by_id(user_id, users))
            messages_output.append(message)
    with open('messages.pkl', 'wb') as f:
        pickle.dump(messages_output, f)
    with open('authors.pkl', 'wb') as f:
        pickle.dump(authors_output, f)
    print('Saved a total of ' + str(len(messages_output)) + ' processed messages')
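# A short companion sketch: reload the artifacts written by main(). The two
# lists are parallel, so authors[i] is the label for messages[i].
with open('messages.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('authors.pkl', 'rb') as f:
    authors = pickle.load(f)
assert len(messages) == len(authors)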
import json
import os


def json_config(jfile, jobj_hook=None, jwrite_obj=None, jappend=None):
    """
    Simple interface to json library functions. Reads JSON data into an
    object dictionary or appends json data to an existing file.
    See the json library documentation for more info:
    `json <https://docs.python.org/3/library/json.html>`_

    Parameters
    ----------
    jfile : str
        json file path.
    jobj_hook : function (default: None)
        Decoder. If None, decodes to dict.
    jwrite_obj : obj (default: None)
        Obj to write to a new json file ``jfile``.
        Evaluated before ``jappend``.
    jappend : obj (default: None)
        New data to append to existing json file ``jfile``.
    """
    if jwrite_obj is not None:
        # Write `jwrite_obj` only if the file does not already exist.
        if not any([os.path.isfile(jfile),
                    os.path.isfile(os.path.abspath(jfile))]):
            print('writing `jwrite_obj` to new json `jfile`.')
            with open(jfile, 'w') as f:
                json.dump(jwrite_obj, f, sort_keys=True, ensure_ascii=False)
        else:
            print('json file already exists in path provided; not overwriting.')
        return
    if jappend is not None:
        with open(jfile, 'r+') as f:
            json_dict = json.load(f, object_hook=None)
            json_dict.update(jappend)
            f.seek(0)
            f.truncate()  # todo: Improve to only truncate if needed.
            json.dump(json_dict, f, sort_keys=True, indent=4)
        return
    with open(jfile) as f:
        if jobj_hook is not None:
            return json.load(f, object_hook=jobj_hook)
        return json.load(f)
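# A short sketch exercising the three modes of json_config; the file name is
# a placeholder.
json_config('settings.json', jwrite_obj={'debug': False})  # create a new file
json_config('settings.json', jappend={'verbose': True})    # merge in new keys
cfg = json_config('settings.json')                         # read back as a dict
print(cfg)  # {'debug': False, 'verbose': True}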
# Note: this is an instance method (it uses ``self``); ``json`` and
# ``constants`` come from the enclosing module.
def to_cjson(self, buf=None, **kwargs):
    """Write a cjson file or return a dictionary.

    The cjson format is specified
    `here <https://github.com/OpenChemistry/chemicaljson>`_.

    Args:
        buf (str): If it is a filepath, the data is written to
            filepath. If it is None, a dictionary with the cjson
            information is returned.
        kwargs: The keyword arguments are passed into the
            ``dump`` function of the
            `json library <https://docs.python.org/3/library/json.html>`_.

    Returns:
        dict:
    """
    cjson_dict = {'chemical json': 0}
    atomic_number = constants.elements['atomic_number'].to_dict()
    cjson_dict['atoms'] = {'elements': {}}
    cjson_dict['atoms']['elements']['number'] = [
        int(atomic_number[x]) for x in self['atom']]
    cjson_dict['atoms']['coords'] = {}
    coords = self.loc[:, ['x', 'y', 'z']].values.reshape(len(self) * 3)
    cjson_dict['atoms']['coords']['3d'] = [float(x) for x in coords]
    bonds = []
    bond_dict = self.get_bonds()
    for i in bond_dict:
        for b in bond_dict[i]:
            bonds += [int(i), int(b)]
            bond_dict[b].remove(i)  # drop the reverse edge so each bond is listed once
    cjson_dict['bonds'] = {'connections': {}}
    cjson_dict['bonds']['connections']['index'] = bonds
    if buf is not None:
        with open(buf, mode='w') as f:
            f.write(json.dumps(cjson_dict, **kwargs))
    else:
        return cjson_dict
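# A hedged usage sketch: `molecule` stands in for an instance of the class
# that defines to_cjson (constructed elsewhere); keyword arguments are
# forwarded to json.dumps when writing to disk.
cjson = molecule.to_cjson()
print(cjson['atoms']['elements']['number'][:5])  # first few atomic numbers
molecule.to_cjson(buf='molecule.cjson', indent=2)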