import numpy as np
import pandas as pd


def app_activity_features():
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    train.drop(['gender', 'age', 'group'], axis=1, inplace=True)
    data = train.append(test)  # use pd.concat([train, test]) on pandas >= 2.0, where append was removed
    # Merge with the brand/model table
    device_table = pd.read_csv("phone_brand_device_model.csv")
    data = pd.merge(data, device_table, how='left', on='device_id')
    data = data.drop_duplicates()  # note: there is still one device associated with 2 brands/models
    del device_table
    print("data build")
    # Create a dataframe indicating, for each device_id, which apps are present and
    # how active they are:
    # - merge events and app_events on event_id
    # - group by device_id and app_id, and take the mean of is_active
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(list(data['device_id']))]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id', 'app_id', 'is_active']], events[['event_id', 'device_id']], on='event_id')
    apps = apps.groupby(['device_id', 'app_id'], as_index=False)['is_active'].mean()
    del events
    print("events build")
    # Reshape the dataframe so that each app becomes a new feature
    reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])), index=list(pd.unique(apps['device_id'])))
    reshaped[list(pd.unique(apps['app_id']))] = 0
    for app in list(pd.unique(apps['app_id'])):
        sliced = apps[apps['app_id'] == app]
        # assign via .loc on the frame (not chained reshaped[app].loc[...]) to avoid SettingWithCopy issues
        reshaped.loc[list(sliced['device_id']), app] = sliced['is_active'].values
    del apps
    return reshaped


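# A minimal alternative sketch (my addition, not part of the original snippet): the
# per-app loop above can be replaced by a single pivot_table call over the grouped
# `apps` frame (device_id / app_id / is_active columns).
def reshape_with_pivot(apps):
    # one row per device, one column per app; 0 where a device never recorded the app
    return apps.pivot_table(index='device_id', columns='app_id',
                            values='is_active', fill_value=0)

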
from queue import Queue        # on Python 2: from Queue import Queue
from threading import Thread


def _process_dataset(anno, sample_rate, n_samples, n_threads):
    """Processes and saves the MagnaTagATune dataset using multiple threads.

    Args:
        anno: Annotation DataFrame containing tags, mp3_path, split, and shard.
        sample_rate: Sampling rate of the audios. If it differs from an audio's
            original sampling rate, the audio is re-sampled.
        n_samples: Number of samples one segment contains.
        n_threads: Number of threads used to process the dataset.
    """
    # FLAGS and _process_audio_files are defined elsewhere in the original module.
    args_queue = Queue()
    split_and_shard_sets = pd.unique([tuple(x) for x in anno[['split', 'shard']].values])
    for split, shard in split_and_shard_sets:
        assigned_anno = anno[(anno['split'] == split) & (anno['shard'] == shard)]
        n_shards = anno[anno['split'] == split]['shard'].nunique()
        args = (assigned_anno, sample_rate, n_samples, split, shard, n_shards)
        args_queue.put(args)
    if FLAGS.n_threads > 1:
        threads = []
        for _ in range(FLAGS.n_threads):
            thread = Thread(target=_process_audio_files, args=[args_queue])
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
    else:
        _process_audio_files(args_queue)


def aggregate_regions(fp):
    # aggregate regions and supply portfolios
    # easier to do this with pandas by just reading the CSVs again
    sc = pd.read_csv(fp + '/shortage_cost.csv', index_col=0, parse_dates=True)
    sv = pd.read_csv(fp + '/shortage_volume.csv', index_col=0, parse_dates=True)
    flow = pd.read_csv(fp + '/flow.csv', index_col=0, parse_dates=True)
    demand_nodes = pd.read_csv('calvin/data/demand_nodes.csv', index_col=0)
    portfolio = pd.read_csv('calvin/data/portfolio.csv', index_col=0)
    for R in demand_nodes.region.unique():
        for t in demand_nodes.type.unique():
            ix = demand_nodes.index[(demand_nodes.region == R) &
                                    (demand_nodes.type == t)]
            sc['%s_%s' % (R, t)] = sc[ix].sum(axis=1)
            sv['%s_%s' % (R, t)] = sv[ix].sum(axis=1)
    for P in portfolio.region.unique():
        for k in portfolio.supplytype.unique():
            for t in portfolio.type.unique():
                ix = portfolio.index[(portfolio.region == P) &
                                     (portfolio.type == t) &
                                     (portfolio.supplytype == k)]
                flow['%s_%s_%s' % (P, k, t)] = flow[ix].sum(axis=1)
    sc.to_csv(fp + '/shortage_cost.csv')
    sv.to_csv(fp + '/shortage_volume.csv')
    flow.to_csv(fp + '/flow.csv')


def remove_debug_links(self):
    df = self.df
    ix = df.index[df.index.str.contains('DBUG')]
    df.drop(ix, inplace=True, axis=0)
    self.nodes = pd.unique(df[['i', 'j']].values.ravel()).tolist()
    self.links = list(zip(df.i, df.j, df.k))
    return df


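# Minimal illustration of the node-extraction idiom above (toy data, not from the
# original class): ravel() flattens the i/j endpoint columns, and pd.unique keeps
# every id once, in first-seen order.
#     links = pd.DataFrame({'i': ['A', 'B'], 'j': ['B', 'C'], 'k': [1, 2]})
#     pd.unique(links[['i', 'j']].values.ravel()).tolist()   # -> ['A', 'B', 'C']

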
def nominal_to_numeric(array):
    mapper = {name: i for i, name in enumerate(pd.unique(array))}
    return np.array([mapper[name] for name in array])


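# Example: pd.unique preserves first-appearance order, so each category gets the code
# of its first occurrence.
#     nominal_to_numeric(['red', 'blue', 'red', 'green'])   # -> array([0, 1, 0, 2])

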
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False):
    super().__init__(data_dir, work_dir)
    self.meta = pd.read_csv(data_dir + 'esc50.csv')
    self.train_folds = train_folds
    self.validation_folds = validation_folds
    self.test_folds = test_folds
    self.class_count = 50
    self.bands = 60
    self.segment_length = 101
    self.esc10 = esc10
    if self.esc10:
        self.class_count = 10
        self.meta = self.meta[self.meta['esc10']]
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
        self.meta['target'] = self.to_targets(self.meta['category'])
    else:
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
    self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
    self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
    self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]
    self._validation_size = len(self.validation_data.meta)
    self._test_size = len(self.test_data.meta)
    self._generate_spectrograms()
    self._populate(self.validation_data)
    self._populate(self.test_data)


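# Minimal illustration of the category-ordering idiom above (toy metadata, not the real
# esc50.csv): sorting by the numeric target first makes pd.unique return the category
# names in target order, so categories[target_id] maps an id back to its name.
#     meta = pd.DataFrame({'target': [2, 0, 1, 0],
#                          'category': ['dog', 'rain', 'wind', 'rain']})
#     pd.unique(meta.sort_values('target')['category'])  # -> array(['rain', 'wind', 'dog'], dtype=object)

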
def dataconf_eval_time_check(self, _wf_data_conf_node, _node_name):
    """
    When a data conf node exists, check whether it is the eval data node
    (the eval data conf is expected to be unique).
    :param _node_name: data conf node name (e.g. nn00001_1_dataconf_node)
    :return: True if the node is an eval data node, otherwise False
    """
    _value = False
    if ('evaldata' in _node_name):
        _value = True
    return _value


def set_dataconf_for_labels(self, df, label):
    """
    Extract the distinct label values from the csv-loaded dataframe.
    :param df: source dataframe
    :param label: name of the label column
    :return: list of distinct label values
    """
    # TODO: overlaps with set_default_dataconf_from_csv; needs cleanup
    label_values = pd.unique(df[label].values.ravel().astype('str')).tolist()
    return label_values


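# Minimal illustration of the label-extraction idiom above (toy data, not from the
# original project): values are cast to str before pd.unique, so mixed-type labels
# end up as distinct strings in first-seen order.
#     df = pd.DataFrame({'label': ['cat', 'dog', 'cat', 3]})
#     pd.unique(df['label'].values.ravel().astype('str')).tolist()   # -> ['cat', 'dog', '3']

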
def test_get_events(self, mock_query):
    urlread_sideeffect = ["""1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""]
    data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
    # assert only the first two events were successfully saved
    assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
    # AND the data to save has length 2:
    assert len(data) == 2
    # now download again, with a URL error:
    urlread_sideeffect = [413, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""", URLError('blabla23___')]
    data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
    # assert we got the same result as above:
    assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
    assert len(data) == 2
    # and since the first response is 413, the request is split into two and the
    # second response is our URLError (we could test this more thoroughly, anyway):
    assert "blabla23___" in self.log_msg()


def sort_eg_attributes(df, attributes=['doh', 'ldate'],
                       reverse_list=[0, 0],
                       add_columns=False):
    '''Sort master list attribute columns by employee group in preparation
    for list construction. The overall master list structure and order are
    unaffected; only the selected attribute columns are sorted (normally
    date-related columns such as doh or ldate).

    inputs
        df
            The master data dataframe (does not need to be sorted)
        attributes
            columns to sort by eg (inplace)
        reverse_list
            If an attribute is to be sorted in reverse order (descending),
            use a '1' in the list position corresponding to the position of
            the attribute within the attributes input
        add_columns
            If True, an additional column for each sorted attribute will be
            added to the resultant dataframe, with the suffix '_sort' added
            to it.
    '''
    date_cols = []
    for col in df:
        if (df[col]).dtype == 'datetime64[ns]':
            date_cols.append(col)
    try:
        df.sort_values(['eg', 'eg_number'], inplace=True)
    except LookupError:
        df.sort_values(['eg', 'eg_order'], inplace=True)
    egs = df.eg.values
    i = 0
    for measure in attributes:
        data = df[measure].values
        measure_col = np.empty_like(data)
        for eg in pd.unique(df.eg):
            measure_slice = data[egs == eg]
            measure_slice_index = np.where(egs == eg)[0]
            measure_slice_sorted = np.sort(measure_slice, axis=0)
            if reverse_list[i]:
                measure_slice_invert = measure_slice_sorted[::-1]
                measure_slice_sorted = measure_slice_invert
            np.put(measure_col, measure_slice_index, measure_slice_sorted)
        if add_columns:
            col_name = measure + '_sort'
        else:
            col_name = measure
        df[col_name] = measure_col
        if measure in date_cols:
            df[col_name] = pd.to_datetime(df[col_name].dt.date)
        i += 1
    return df


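# Hypothetical usage sketch (the csv path and the 'eg' / 'eg_number' columns are
# assumptions for illustration, not from the original project):
#     master = pd.read_csv('master_list.csv', parse_dates=['doh', 'ldate'])
#     master = sort_eg_attributes(master, attributes=['doh', 'ldate'],
#                                 reverse_list=[0, 0], add_columns=True)
#     # master now has 'doh_sort' and 'ldate_sort' columns, each sorted within every
#     # employee group while the overall row order is preserved.

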
def unique(lst):
    """
    Return unique elements

    :func:`pandas.unique` and :func:`numpy.unique` cast
    mixed-type lists to the same type. They are faster, but
    sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items

    Returns
    -------
    out : list
        Unique items in the order that they appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype=object)
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string, and numpy does not
    even maintain the order.
    """
    seen = set()

    def make_seen(x):
        seen.add(x)
        return x

    return [make_seen(x) for x in lst if x not in seen]


# test_chamber_of_deputies_dataset.py (project: serenata-toolbox, author: datasciencebr)
def test_fetch_translate_clean_integration(self):
    self.subject.fetch()
    files = ["Ano-{}.csv".format(n) for n in [2017]]
    files.append('datasets-format.html')
    for name in files:
        file_path = os.path.join(self.path, name)
        assert(os.path.exists(file_path))
    self.subject.translate()
    for name in ["reimbursements-{}.xz".format(n) for n in self.years]:
        file_path = os.path.join(self.path, name)
        assert(os.path.exists(file_path))
    self.subject.clean()
    file_path = os.path.join(self.path, 'reimbursements.xz')
    assert(os.path.exists(file_path))
    # test for subquota translation
    dataset = pd.read_csv(file_path, compression='xz')
    all_subquotas = ['Maintenance of office supporting parliamentary activity',
                     'Locomotion, meal and lodging',
                     'Fuels and lubricants',
                     'Consultancy, research and technical work',
                     'Publicity of parliamentary activity',
                     'Purchase of office supplies',
                     'Software purchase or renting; Postal services; Subscriptions',
                     'Security service provided by specialized company',
                     'Flight tickets',
                     'Telecommunication',
                     'Postal services',
                     'Publication subscriptions',
                     'Congressperson meal',
                     'Lodging, except for congressperson from Distrito Federal',
                     'Automotive vehicle renting or watercraft charter',
                     'Aircraft renting or charter of aircraft',
                     'Automotive vehicle renting or charter',
                     'Watercraft renting or charter',
                     'Taxi, toll and parking',
                     'Terrestrial, maritime and fluvial tickets',
                     'Participation in course, talk or similar event',
                     'Flight ticket issue']
    present_subquotas = pd.unique(dataset['subquota_description'])
    for subquota in present_subquotas:
        assert(subquota in all_subquotas)


def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False,
             downsample=True):
    super().__init__(data_dir, work_dir)
    self.meta = pd.read_csv(data_dir + 'esc50.csv')
    self.train_folds = train_folds
    self.validation_folds = validation_folds
    self.test_folds = test_folds
    self.class_count = 50
    self.DOWNSAMPLE = downsample
    self.SEGMENT_LENGTH = 300
    self.BANDS = 180
    self.WITH_DELTA = False
    self.FMAX = 16000
    self.FFT = 2205
    self.HOP = 441
    self.esc10 = esc10
    if self.esc10:
        self.class_count = 10
        self.meta = self.meta[self.meta['esc10']]
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
        self.meta['target'] = self.to_targets(self.meta['category'])
    else:
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
    self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
    self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
    self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]
    self._validation_size = len(self.validation_data.meta)
    self._test_size = len(self.test_data.meta)
    self._generate_spectrograms()
    if self.DOWNSAMPLE:
        self.SEGMENT_LENGTH //= 2
        self.BANDS //= 3
    self._populate(self.validation_data)
    self._populate(self.test_data)


def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
    """
    Infer each csv column's type and store it in data_conf (only filled in when
    data_conf is missing). For categorical columns, the unique values are collected
    into cell_feature_unique (used by Keras).
    :param df: source dataframe loaded from csv
    :param node_id: data node id
    :param data_dfconf_list: existing data conf entries for the node
    :return: (data_conf_json, data_conf_unique_json)
    """
    # wf_data_conf, utils, json and logging are provided by the original module.
    try:
        # TODO: overlaps with set_default_dataconf_from_csv; needs cleanup
        data_conf = dict()
        data_conf_unique_v = dict()
        data_conf_col_unique_v = dict()
        data_conf_col_type = dict()
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        # For Wdnn, if a data_dfconf already exists, fetch the previously stored
        # unique values so they can be merged with the newly observed ones.
        if len(data_dfconf_list) > 0:
            _wf_data_conf = wf_data_conf(data_dfconf_list)
            _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf,
                                                                                'cell_feature_unique') else list()  # unique values of previously seen features
            for i, v in df.dtypes.iteritems():  # .items() on pandas >= 2.0
                # label
                column_dtypes = dict()
                column_unique_value = dict()
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()  # fill nulls with ''
                column_dtypes['column_type'] = col_type
                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (i in _cell_feature_unique) else list()
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list  # merge previous and new unique values
                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value
        data_conf['cell_feature'] = data_conf_col_type
        data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
        data_conf_json_str = json.dumps(data_conf)  # serialize to JSON
        data_conf_json = json.loads(data_conf_json_str)
        data_conf_unique_json_str = json.dumps(data_conf_unique_v)
        data_conf_unique_json = json.loads(data_conf_unique_json_str)
        return data_conf_json, data_conf_unique_json
    except Exception as e:
        logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))
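

# A minimal standalone sketch of the per-column type check above (my own simplification,
# not part of the original project): numeric dtypes are tagged CONTINUOUS, everything
# else CATEGORICAL with its NaN-filled unique values collected via pd.unique.
def sketch_column_types(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    out = {}
    for col, dtype in df.dtypes.items():
        if str(dtype) in numerics:
            out[col] = {'column_type': 'CONTINUOUS', 'column_u_values': []}
        else:
            out[col] = {'column_type': 'CATEGORICAL',
                        'column_u_values': pd.unique(df[col].fillna('').values.ravel()).tolist()}
    return out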