def main(file, table_file):
    questions = read_file(file)
    tables = read_file(table_file)
    # Index the tables by map_id so each question can be joined
    # with its table in O(1).
    table_dict = dict()
    for t in tables:
        table_dict[t["map_id"]] = t
    result = list()
    for q in questions:
        try:
            result.append(json.dumps(process_question(q, table_dict), default=json_util))
        except Exception as e:
            print(e)
            continue
    # Write the processed questions next to the input file,
    # one JSON object per line.
    file_base_name = os.path.basename(file)
    dirname = os.path.dirname(file)
    with open(os.path.join(dirname, "preprocessed_" + file_base_name), "w") as f:
        f.write('\n'.join(result))
    print(len(result))
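
# In this snippet json_util is passed directly as the default= hook of
# json.dumps, i.e. a callable that converts otherwise unserializable
# objects. A minimal sketch of such a hook (hypothetical; the project's
# own json_util may differ):
import datetime
import json

def json_util(obj):
    # Fallback encoder: render dates as ISO strings, reject the rest.
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError("%r is not JSON serializable" % obj)

print(json.dumps({"t": datetime.datetime(2020, 1, 1)}, default=json_util))
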
def process_questions(file):
    questions = read_file(file)
    result = list()
    for question in questions:
        # Stem and lemmatize every token of the question sentence.
        sentence = question["tokenized_sentence"]
        stemmed = list()
        lemmatized = list()
        for word in sentence:
            stemmed.append(stem(word))
            lemmatized.append(lemma(word))
        question["stem"] = stemmed
        question["lemma"] = lemmatized
        result.append(json.dumps(question, default=json_util))
    # Save the enriched questions next to the input file.
    file_base_name = os.path.basename(file)
    dirname = os.path.dirname(file)
    save_file = os.path.join(dirname, "lemmatized_" + file_base_name)
    save(save_file, result)
    print(len(result))
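
# stem and lemma are project-local helpers. A common way to implement
# them is with NLTK (a sketch under that assumption; nltk and its
# wordnet data must be installed):
from nltk.stem import PorterStemmer, WordNetLemmatizer

_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()

def stem(word):
    return _stemmer.stem(word)

def lemma(word):
    return _lemmatizer.lemmatize(word)

print(stem("running"), lemma("geese"))  # run goose
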
def dump_json(self, d):
    """
    Encodes the object *d* as a JSON string.

    This method *can be overridden* if you want to use your own JSON
    serializer. The default implementation uses the :mod:`json` module
    of the standard library and (if available) the :mod:`bson` json utils.

    :arg d:
    :rtype: str
    """
    indent = 1 if self.debug else None
    if bson:
        return json.dumps(d, default=bson.json_util.default, indent=indent)
    else:
        return json.dumps(d, indent=indent)
def load_json(self, s):
    """
    Decodes the JSON string *s*.

    This method *can be overridden* if you want to use your own JSON
    serializer. The default implementation uses the :mod:`json` module
    of the standard library and (if available) the :mod:`bson` json utils.

    :arg str s:
    """
    if bson:
        return json.loads(s, object_hook=bson.json_util.object_hook)
    else:
        return json.loads(s)
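
# dump_json and load_json pair bson.json_util.default with
# bson.json_util.object_hook so BSON types survive a JSON round trip.
# A minimal standalone sketch of that round trip (assumes pymongo's
# bson package is installed):
import json
from bson import json_util
from bson.objectid import ObjectId

doc = {"_id": ObjectId(), "name": "example"}

# Encode: the ObjectId becomes {"$oid": "..."} in the JSON text.
text = json.dumps(doc, default=json_util.default)

# Decode: the {"$oid": ...} wrapper is turned back into an ObjectId.
restored = json.loads(text, object_hook=json_util.object_hook)
assert restored["_id"] == doc["_id"]
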
def transform_value(self, value):
    if isinstance(value, dict):
        return self.format_document(value)
    elif isinstance(value, list):
        return [self.transform_value(v) for v in value]
    elif isinstance(value, numbers.Number):
        # NaN and infinity are valid floats but not valid JSON; reject them.
        if math.isnan(value):
            raise ValueError("nan")
        elif math.isinf(value):
            raise ValueError("inf")
        return value
    elif isinstance(value, bson.binary.Binary):
        return bson.json_util.default(value)
    elif compat.is_string(value) or isinstance(value, bool) or value is None:
        return value
    return bson.json_util.default(value)
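
# transform_value recursively walks a value and routes anything JSON
# cannot express (Binary, ObjectId, datetime, ...) through
# bson.json_util.default, while rejecting non-finite numbers. A
# self-contained sketch of the same idea, without the class plumbing
# (the transform name and the sample document are illustrative):
import math
import numbers
from bson import json_util
from bson.objectid import ObjectId

def transform(value):
    if isinstance(value, dict):
        return {k: transform(v) for k, v in value.items()}
    if isinstance(value, list):
        return [transform(v) for v in value]
    if isinstance(value, bool) or value is None or isinstance(value, str):
        return value
    if isinstance(value, numbers.Number):
        if math.isnan(value) or math.isinf(value):
            raise ValueError("non-finite number")
        return value
    # Fall back to bson's encoder for ObjectId, Binary, datetime, ...
    return json_util.default(value)

print(transform({"_id": ObjectId(), "tags": ["a", "b"], "n": 3}))
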
def hoplite_dumps(obj, *args, **kwargs):
    """
    Serializes a dictionary into unicode (unless specified otherwise).

    bson.json_util.dumps does exactly what hoplite needs, so that's
    what we call.

    :param obj: Python dictionary to be serialized
    :param args: Please refer to the online documentation for
        bson.json_util.dumps and json.dumps
    :param kwargs: Please refer to the online documentation for
        bson.json_util.dumps and json.dumps
    :return: serialized obj in unicode
    """
    return dumps(obj, *args, **kwargs)
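
# hoplite_dumps is a thin wrapper over bson.json_util.dumps, which
# accepts the same keyword arguments as json.dumps. A quick sketch of
# what the wrapped call does (the document is illustrative):
from bson.json_util import dumps, loads
from bson.objectid import ObjectId

text = dumps({"_id": ObjectId(), "score": 1.5}, indent=2)
print(text)         # the ObjectId is serialized as {"$oid": "..."}
print(loads(text))  # ...and restored on the way back
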
def jsonify(*args, **kwargs):
    """ jsonify with support for MongoDB ObjectId
    See https://gist.github.com/akhenakh/2954605
    """
    return Response(json.dumps(dict(*args, **kwargs),
                               default=json_util.default,
                               indent=2,
                               cls=MongoJsonEncoder),
                    mimetype='application/json')
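
# A hedged usage sketch in a Flask view. The database handle, collection
# and route are illustrative; Response, json_util and MongoJsonEncoder
# are assumed to be imported/defined as in the snippet above.
from flask import Flask
from pymongo import MongoClient
from bson.objectid import ObjectId

app = Flask(__name__)
db = MongoClient()['mydb']  # connection details are illustrative

@app.route('/users/<user_id>')
def get_user(user_id):
    # find_one may return ObjectId/datetime values that the stock Flask
    # jsonify cannot serialize; the custom jsonify above handles them.
    user = db.users.find_one({'_id': ObjectId(user_id)})
    return jsonify(user or {})
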
def humanify(obj, status_code=200):
    """ Gets an obj and possibly a status code and returns a Flask Response
    with a jsonified obj; despite the name, not meant for human consumption

    >>> humanify({"a": 1})
    <Response 8 bytes [200 OK]>
    >>> humanify({"a": 1}, 404)
    <Response 8 bytes [404 NOT FOUND]>
    >>> humanify({"a": 1}).get_data()
    '{"a": 1}'
    >>> humanify([1,2,3]).get_data()
    '[1, 2, 3]'
    """
    # TODO: refactor the name to `response`
    # jsonify function doesn't work with lists
    if type(obj) == list:
        data = json.dumps(obj, default=json_util.default)
    elif type(obj) == pymongo.cursor.Cursor:
        # A pymongo Cursor is not JSON-serializable: dump each document
        # separately and join them into a JSON array by hand.
        rv = []
        for doc in obj:
            doc['_id'] = str(doc['_id'])
            rv.append(dumps(doc))
        data = '[' + ',\n'.join(rv) + ']' + '\n'
    else:
        data = dumps(obj,
                     default=json_util.default,
                     cls=MongoJsonEncoder)
    resp = Response(data, mimetype='application/json')
    resp.status_code = status_code
    return resp
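
# Hedged usage sketch (the db handle, collection and query are
# illustrative):
cursor = db.articles.find({'published': True})
resp = humanify(cursor)                       # 200 OK, JSON array body
resp = humanify({'error': 'not found'}, 404)  # plain dict, explicit status
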
def process_tables(file):
    tables = read_file(file)
    result = list()
    for table in tables:
        # Table name: tokenize, then lemmatize and stem each token.
        lemma_table_name = list()
        stem_table_name = list()
        table_name = word_tokenize(table["table_name"])
        for word in table_name:
            lemma_table_name.append(lemma(word))
            stem_table_name.append(stem(word))
        # Column names: one token list per column header.
        lemma_column_name = list()
        stem_column_name = list()
        for cn in table["column_name"]:
            tokenized = word_tokenize(cn)
            l_temp = list()
            s_temp = list()
            for word in tokenized:
                l_temp.append(lemma(word))
                s_temp.append(stem(word))
            lemma_column_name.append(l_temp)
            stem_column_name.append(s_temp)
        # Column cells: one token list per cell, grouped by column.
        lemma_columns = list()
        stem_columns = list()
        for column in table["columns"]:
            l_temp = list()
            s_temp = list()
            for cell in column:
                l = list()
                s = list()
                tokenized = word_tokenize(cell)
                for word in tokenized:
                    l.append(lemma(word))
                    s.append(stem(word))
                l_temp.append(l)
                s_temp.append(s)
            lemma_columns.append(l_temp)
            stem_columns.append(s_temp)
        table.update({
            "lemma_table_name": lemma_table_name,
            "stem_table_name": stem_table_name,
            "lemma_column_name": lemma_column_name,
            "stem_column_name": stem_column_name,
            "lemma_columns": lemma_columns,
            "stem_columns": stem_columns
        })
        result.append(json.dumps(table, default=json_util))
    # Save the enriched tables next to the input file.
    file_base_name = os.path.basename(file)
    dirname = os.path.dirname(file)
    save_file = os.path.join(dirname, "lemmatized_" + file_base_name)
    save(save_file, result)
    print(len(result))
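
# For reference, a hedged sketch of the per-table record this expects
# and what it adds (field values are illustrative):
table = {
    "map_id": "t1",
    "table_name": "Olympic Games hosts",
    "column_name": ["Year", "Host city"],
    "columns": [["1896", "1900"], ["Athens", "Paris"]],
}
# After process_tables, each record additionally carries:
#   lemma_table_name / stem_table_name    -> token lists for the name
#   lemma_column_name / stem_column_name  -> token lists per header
#   lemma_columns / stem_columns          -> token lists per cell, per column
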