def run_query(query, cache_key, expire=3600, dialect='legacy'):
    memcached_client = memcached_discovery.get_client()
    if memcached_client is None:
        return _run(query, dialect=dialect)
    else:
        cached = memcached_client.get(cache_key)  # renamed from `json` to avoid shadowing the json module
        if cached is not None:
            df = pd.read_json(cached, orient='records')
        else:
            df = _run(query, dialect=dialect)
            memcached_client.set(cache_key, df.to_json(orient='records'), expire=expire)
        return df
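# A minimal, standalone sketch (no memcached) of the round trip run_query relies on:
# the cache stores df.to_json(orient='records'), which pd.read_json() restores.
# The sample frame below is invented for illustration.
import pandas as pd
from io import StringIO

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
payload = df.to_json(orient='records')                        # what would be stored in memcached
restored = pd.read_json(StringIO(payload), orient='records')
print(restored)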
def main():
    start_time = time.time()
    args = parse_args()
    logger.setLevel(getattr(logging, args.verbosity.upper()))
    logger.info("Started")
    build_constants()
    df = pd.read_json(path_or_buf=DATA_PATH, orient='records', encoding="UTF8")
    logger.debug("Loaded {} rows into df".format(len(df)))
    df = utils.get_data_subset.crop(df, None, None)
    df = utils.get_data_subset.filter_rows_by_string(df,
                                                     [TARGET_COL],
                                                     ['Rock', 'Hip Hop'])
    df = utils.clean_data.execute_cleaners(df)
    df = utils.normalize_data.normalize_genres(df, TARGET_COL)
    X, y = utils.get_data_subset.get_x_y(df, SAMPLE_COL, TARGET_COL)
    clf = model_pipeline.get_pipeline(SAMPLE_COL)
    utils.persistence.dump(DF_DUMP_NAME, df)
    utils.persistence.dump(CLF_DUMP_NAME, clf)
    if args.train:
        train_and_test.train_and_dump(X, y, clf)
    elif args.test:
        train_and_test.test_using_kfold(X, y, clf)
    logger.info("Finished in {0:.2f} seconds".format(time.time() - start_time))
def handle_dotio_url(wf_module, url, split_url, num_rows):
    """
    Process a response from enigma.io. An API key is assumed to be embedded in the URL,
    since the dataset endpoints do not appear to be accessible without one.
    """
    if num_rows > 500:
        wf_module.set_error("You can request a maximum of 500 rows.")
        return
    if "/limit/" not in url:
        if url.endswith('/'):
            url += "limit/{}".format(num_rows)
        else:
            url += "/limit/{}".format(num_rows)
    response = requests.get(url)
    if response.status_code != 200:
        error = json.loads(response.text)
        if "message" in error:
            message = error["message"]
        else:
            message = error["info"]["message"]
            if "additional" in error["info"]:
                message += ": " + error["info"]["additional"]["message"]
        wf_module.set_error("Unable to retrieve data from Enigma. Received {} status, with message {}"
                            .format(response.status_code, message))
        return
    try:
        json_text = json.loads(response.text)
        table = pd.read_json(json.dumps(json_text['result']))
        return table
    except Exception as ex:  # broad, but any parsing failure should surface as a module error
        wf_module.set_error("Unable to process request: {}".format(str(ex)))
        return
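# Hedged sketch: the 'result' field of an Enigma-style payload is assumed here to be a
# flat list of records, so it can also be turned into a DataFrame directly, skipping the
# json.dumps()/pd.read_json() round trip used above. Field names are illustrative.
import pandas as pd

payload = {"result": [{"state": "NY", "value": 10}, {"state": "CA", "value": 12}]}
table = pd.DataFrame(payload["result"])   # broadly equivalent for flat records
print(table)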
def _from_json(self, value, obj=None):
    if value is not None:
        df = pd.read_json(json.dumps(value), orient="split")
    else:
        df = pd.DataFrame()
    return df
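# Minimal sketch of the orient='split' round trip _from_json expects: the 'split'
# layout serialises index, columns and data separately, so the index survives.
# The widget-style value below is invented.
import json
import pandas as pd
from io import StringIO

df = pd.DataFrame([[1, 2], [3, 4]], index=['r1', 'r2'], columns=['a', 'b'])
value = json.loads(df.to_json(orient='split'))               # dict, as a widget might hand back
restored = pd.read_json(StringIO(json.dumps(value)), orient='split')
print(restored.index.tolist())                               # ['r1', 'r2']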
def pd_json_to_df(self, data_json, sorted_by_key="Date", in_ascending=True):
    import pandas as pd
    new_df = pd.read_json(data_json).sort_values(by=sorted_by_key, ascending=in_ascending)
    return new_df
# end of pd_json_to_df
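# Hedged usage sketch for pd_json_to_df: records-style JSON with a "Date" field.
# Because the column is named 'Date', pd.read_json() parses it as datetimes by default,
# so the sort is chronological. The data below is made up for illustration.
import pandas as pd
from io import StringIO

data_json = '[{"Date": "2021-01-03", "Close": 10.5}, {"Date": "2021-01-01", "Close": 9.8}]'
df = pd.read_json(StringIO(data_json)).sort_values(by="Date", ascending=True)
print(df)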
# Source: medium_posts_data_reader.py (project: Medium-crawler-with-data-analyzer, author: lifei96)
def read_posts():
    posts = list()
    file_in = open('./post_list.txt', 'r')
    post_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for post_id in post_list:
        if not post_id:
            continue
        if not os.path.exists('./data/Posts/%s.json' % post_id):
            continue
        try:
            file_in = open('./data/Posts/%s.json' % post_id, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            post = dict()
            post['post_id'] = post_id
            post['published_date'] = raw_data['published_date']
            post['recommends'] = raw_data['recommends']
            post['responses'] = raw_data['responses']
            posts.append(post)
        except:
            continue
        num += 1
        print(post_id)
        print(num)
    return pd.read_json(json.dumps(posts))
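# Hedged sketch: for a flat list of dicts like `posts`, pd.DataFrame(posts) builds
# essentially the same frame as the json.dumps()/pd.read_json() round trip above
# (read_json additionally attempts numeric coercion of string columns). The sample
# records below are invented.
import json
import pandas as pd
from io import StringIO

posts = [
    {"post_id": "abc123", "published_date": "2016-05-01", "recommends": 12, "responses": 3},
    {"post_id": "def456", "published_date": "2016-06-10", "recommends": 7, "responses": 1},
]
round_tripped = pd.read_json(StringIO(json.dumps(posts)))
direct = pd.DataFrame(posts)
print(round_tripped.dtypes)
print(direct.dtypes)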
# Source: medium_tags_data_reader.py (project: Medium-crawler-with-data-analyzer, author: lifei96)
def read_posts():
    posts = list()
    file_in = open('./post_list.txt', 'r')
    post_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for post_id in post_list:
        if not post_id:
            continue
        if not os.path.exists('./data/Posts/%s.json' % post_id):
            continue
        try:
            file_in = open('./data/Posts/%s.json' % post_id, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            for tag in raw_data['tags']:
                post = dict()
                post['post_id'] = post_id
                post['published_date'] = raw_data['published_date']
                post['recommends'] = raw_data['recommends']
                post['responses'] = raw_data['responses']
                post['tag'] = tag['name']
                posts.append(post)
                print(post)
        except:
            continue
        num += 1
        print(post_id)
        print(num)
    return pd.read_json(json.dumps(posts))
# Source: medium_users_data_reader.py (project: Medium-crawler-with-data-analyzer, author: lifei96)
def read_users():
    users = list()
    file_in = open('./username_list.txt', 'r')
    username_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for username in username_list:
        if not username:
            continue
        if not os.path.exists('./data/Users/%s.json' % username):
            continue
        try:
            file_in = open('./data/Users/%s.json' % username, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            user = dict()
            user['username'] = username
            user['reg_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['createdAt'] / 1000.0).isoformat()
            if not raw_data['profile']['user']['lastPostCreatedAt']:
                raw_data['profile']['user']['lastPostCreatedAt'] = raw_data['profile']['user']['createdAt']
            user['last_post_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['lastPostCreatedAt'] / 1000.0).isoformat()
            user['posts_count'] = raw_data['profile']['numberOfPostsPublished']
            user['following_count'] = raw_data['profile']['user']['socialStats']['usersFollowedCount']
            user['followers_count'] = raw_data['profile']['user']['socialStats']['usersFollowedByCount']
            users.append(user)
        except:
            continue
        num += 1
        print(username)
        print(num)
    return pd.read_json(json.dumps(users))
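# Minimal sketch of the timestamp handling above: Medium's createdAt values are in
# milliseconds, hence the division by 1000 before datetime.date.fromtimestamp().
# Note that fromtimestamp() uses the local timezone. The value below is illustrative.
import datetime

created_at_ms = 1462060800000   # 2016-05-01T00:00:00Z expressed in milliseconds
reg_date = datetime.date.fromtimestamp(created_at_ms / 1000.0).isoformat()
print(reg_date)                 # e.g. '2016-05-01' (may shift a day depending on timezone)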
def data_received(self, data):
    updateOZ_event.data = pd.read_json(data.decode())
    updateOZ_event.set()
def handle_OZServer(loop):
    reader, writer = yield from asyncio.open_connection('127.0.0.1', 2222, loop=loop)
    symbolList = list()
    while True:
        if updateOZ_event.is_set():
            print('In Server send')
            updateOZ_event.clear()
            for element in updateOZ_event.data:
                writer.write(('Add_' + element + '_End').encode())
            writer.write('Send'.encode())
            outputbuffer = StringIO()
            condition = True
            while condition:
                data = yield from reader.read(1024)
                message = data.decode()
                if message.find('!ENDMSG!') != -1:
                    message = message.replace('!ENDMSG!', '')
                    condition = False
                    print('End found')
                outputbuffer.write(message)
            outputbuffer.seek(0)
            DF = pd.read_json(outputbuffer)
            # print(DF)
            yield from updateOZ_queue.put(DF)
        yield None
    writer.close()  # StreamReader has no close() method; closing the writer is sufficient
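# Standalone sketch of the buffer handling above: response chunks are accumulated in a
# StringIO, the '!ENDMSG!' marker is stripped, the buffer is rewound, and pd.read_json()
# consumes it. The payload and chunk split below are invented.
import pandas as pd
from io import StringIO

chunks = ['[{"symbol": "AAPL", "px": 17', '2.5}, {"symbol": "MSFT", "px": 331.0}]!ENDMSG!']
outputbuffer = StringIO()
for chunk in chunks:
    outputbuffer.write(chunk.replace('!ENDMSG!', ''))
outputbuffer.seek(0)
print(pd.read_json(outputbuffer))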
def _load_data(filename, columns=None):
    data = pd.read_json(filename, lines=True)
    data = data.sort_values('validation_mrr', ascending=False)
    mrr_cols = ['validation_mrr', 'test_mrr']
    if columns is None:
        columns = [x for x in data.columns if
                   (x not in mrr_cols and x != 'hash')]
    cols = mrr_cols + columns
    return data[cols]
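# Hedged sketch of the input _load_data expects: a JSON Lines file (one JSON object per
# line, hence lines=True) with 'validation_mrr'/'test_mrr' scores plus hyperparameters.
# The records below are invented.
import pandas as pd
from io import StringIO

jsonl = (
    '{"validation_mrr": 0.41, "test_mrr": 0.39, "lr": 0.01, "hash": "a1"}\n'
    '{"validation_mrr": 0.45, "test_mrr": 0.44, "lr": 0.05, "hash": "b2"}\n'
)
data = pd.read_json(StringIO(jsonl), lines=True)
print(data.sort_values('validation_mrr', ascending=False))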
# Source: test_pandas.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_frame_double_encoded_labels(self):
    df = DataFrame([['a', 'b'], ['c', 'd']],
                   index=['index " 1', 'index / 2'],
                   columns=['a \\ b', 'y / z'])
    assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                     orient='split'))
    assert_frame_equal(df, read_json(df.to_json(orient='columns'),
                                     orient='columns'))
    assert_frame_equal(df, read_json(df.to_json(orient='index'),
                                     orient='index'))
    df_unser = read_json(df.to_json(orient='records'), orient='records')
    assert_index_equal(df.columns, df_unser.columns)
    np.testing.assert_equal(df.values, df_unser.values)
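# Quick illustration of why the 'records' case above only compares columns and values:
# orient='records' does not serialise the index, so it cannot round-trip it.
import pandas as pd
from io import StringIO

df = pd.DataFrame([['a', 'b']], index=['index " 1'], columns=['a \\ b', 'y / z'])
records = pd.read_json(StringIO(df.to_json(orient='records')), orient='records')
print(records.index.tolist())   # [0] -- the original string index is gone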
def test_frame_non_unique_index(self):
    df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
                   columns=['x', 'y'])
    self.assertRaises(ValueError, df.to_json, orient='index')
    self.assertRaises(ValueError, df.to_json, orient='columns')
    assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                     orient='split'))
    unser = read_json(df.to_json(orient='records'), orient='records')
    self.assertTrue(df.columns.equals(unser.columns))
    np.testing.assert_equal(df.values, unser.values)
    unser = read_json(df.to_json(orient='values'), orient='values')
    np.testing.assert_equal(df.values, unser.values)
def test_frame_non_unique_columns(self):
    df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
                   columns=['x', 'x'])
    self.assertRaises(ValueError, df.to_json, orient='index')
    self.assertRaises(ValueError, df.to_json, orient='columns')
    self.assertRaises(ValueError, df.to_json, orient='records')
    assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                     orient='split', dtype=False))
    unser = read_json(df.to_json(orient='values'), orient='values')
    np.testing.assert_equal(df.values, unser.values)

    # GH4377; duplicate columns not processing correctly
    df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
                   columns=['x', 'y'])
    result = read_json(df.to_json(orient='split'), orient='split')
    assert_frame_equal(result, df)

    def _check(df):
        result = read_json(df.to_json(orient='split'), orient='split',
                           convert_dates=['x'])
        assert_frame_equal(result, df)

    for o in [[['a', 'b'], ['c', 'd']],
              [[1.5, 2.5], [3.5, 4.5]],
              [[1, 2.5], [3, 4.5]],
              [[Timestamp('20130101'), 3.5],
               [Timestamp('20130102'), 4.5]]]:
        _check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
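# Hedged sketch of the convert_dates=['x'] path exercised by _check above (simplified
# to unique column names): to_json() writes Timestamps as epoch milliseconds by default,
# and naming the column in convert_dates turns them back into datetimes on read.
import pandas as pd
from io import StringIO
from pandas import DataFrame, Timestamp

df = DataFrame([[Timestamp('20130101'), 3.5], [Timestamp('20130102'), 4.5]],
               index=[1, 2], columns=['x', 'y'])
restored = pd.read_json(StringIO(df.to_json(orient='split')), orient='split',
                        convert_dates=['x'])
print(restored['x'].dtype)   # datetime64[ns]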
def test_frame_from_json_nones(self):
    df = DataFrame([[1, 2], [4, 5, 6]])
    unser = read_json(df.to_json())
    self.assertTrue(np.isnan(unser[2][0]))

    df = DataFrame([['1', '2'], ['4', '5', '6']])
    unser = read_json(df.to_json())
    self.assertTrue(np.isnan(unser[2][0]))
    unser = read_json(df.to_json(), dtype=False)
    self.assertTrue(unser[2][0] is None)
    unser = read_json(df.to_json(), convert_axes=False, dtype=False)
    self.assertTrue(unser['2']['0'] is None)

    unser = read_json(df.to_json(), numpy=False)
    self.assertTrue(np.isnan(unser[2][0]))
    unser = read_json(df.to_json(), numpy=False, dtype=False)
    self.assertTrue(unser[2][0] is None)
    unser = read_json(df.to_json(), numpy=False,
                      convert_axes=False, dtype=False)
    self.assertTrue(unser['2']['0'] is None)

    # infinities get mapped to nulls which get mapped to NaNs during
    # deserialisation
    df = DataFrame([[1, 2], [4, 5, 6]])
    df.loc[0, 2] = np.inf
    unser = read_json(df.to_json())
    self.assertTrue(np.isnan(unser[2][0]))
    unser = read_json(df.to_json(), dtype=False)
    self.assertTrue(np.isnan(unser[2][0]))

    df.loc[0, 2] = np.NINF
    unser = read_json(df.to_json())
    self.assertTrue(np.isnan(unser[2][0]))
    unser = read_json(df.to_json(), dtype=False)
    self.assertTrue(np.isnan(unser[2][0]))
def test_frame_empty_mixedtype(self):
    # mixed type
    df = DataFrame(columns=['jim', 'joe'])
    df['joe'] = df['joe'].astype('i8')
    self.assertTrue(df._is_mixed_type)
    assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
                       check_index_type=False)
def test_frame_mixedtype_orient(self):  # GH10289
    vals = [[10, 1, 'foo', .1, .01],
            [20, 2, 'bar', .2, .02],
            [30, 3, 'baz', .3, .03],
            [40, 4, 'qux', .4, .04]]
    df = DataFrame(vals, index=list('abcd'),
                   columns=['1st', '2nd', '3rd', '4th', '5th'])
    self.assertTrue(df._is_mixed_type)
    right = df.copy()

    for orient in ['split', 'index', 'columns']:
        inp = df.to_json(orient=orient)
        left = read_json(inp, orient=orient, convert_axes=False)
        assert_frame_equal(left, right)

    right.index = np.arange(len(df))
    inp = df.to_json(orient='records')
    left = read_json(inp, orient='records', convert_axes=False)
    assert_frame_equal(left, right)

    right.columns = np.arange(df.shape[1])
    inp = df.to_json(orient='values')
    left = read_json(inp, orient='values', convert_axes=False)
    assert_frame_equal(left, right)