def ge_cmd_learn():
args = parse_arg_learn()
# prepare input to GE_learn
data = GE_data()
data.dat = util.load_data(args.data)
data.labeled_features = util.load_labeled_features(args.labeled_features)
init_model = GE_model()
param = GE_param()
if args.l2:
param.l2_regularization = args.l2
final_model_path = args.model
# print data
final_model = GE_learn(data, init_model, param)
util.save_model(final_model, final_model_path)
return
Python load_data() example source code
def describe(name):
"""
Describe the dataset
"""
df = load_data(name)
s = df.groupby(level=[0, 1]).size()
print('Dataset :', name)
print('Users :', len(s.groupby(level=0)))
print('Sessions/user :', s.groupby(level=0).size().mean())
print('Sample size :', s.mean(), '+/-', s.std())
print('Mean pp interval (ms) :',
df.groupby(level=[0, 1]).apply(lambda x: x['timepress'].diff().dropna().mean()).mean())
print('Mean duration (ms) :',
df.groupby(level=[0, 1]).apply(lambda x: (x['timerelease'] - x['timepress']).mean()).mean())
for target in TARGETS[1:]:
s = df.reset_index().groupby([target, 'session']).size().groupby(level=0).size()
print(target)
print(s / s.sum())
return
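The keystroke snippets on this page (describe above, obfuscate_keystrokes and preprocess_villani below) expect a DataFrame indexed by (user, session) with per-keystroke 'timepress' and 'timerelease' timestamps in milliseconds. A minimal sketch of a frame with that shape, using toy values that are not part of the original project:
import pandas as pd

def _toy_keystroke_frame():
    # Two keystrokes in session 1 and one keystroke in session 2, all for user 1.
    idx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (1, 2)],
                                    names=['user', 'session'])
    return pd.DataFrame({'timepress': [0, 120, 0],
                         'timerelease': [80, 210, 95]}, index=idx)
Calling _toy_keystroke_frame().groupby(level=[0, 1]).size() then yields the per-session keystroke counts that describe() summarizes.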
def obfuscate_keystrokes(name, strategy, param):
    """
    Load the named dataset, mask its keystroke timings with the given
    strategy ('delay' or 'interval') and parameter, and save the result.
    """
df = load_data(name)
df = df.groupby(level=[0, 1]).apply(keystrokes2events).reset_index(level=[2, 3], drop=True)
if strategy == 'delay':
df = df.groupby(level=[0, 1]).apply(lambda x: delay_mix(x, param))
elif strategy == 'interval':
df = df.groupby(level=[0, 1]).apply(lambda x: interval_mix(x, param))
else:
raise Exception('Unknown masking strategy')
df = df.groupby(level=[0, 1]).apply(events2keystrokes).reset_index(level=[2, 3], drop=True)
save_data(df, name, masking=(strategy, param))
return
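delay_mix and interval_mix are helpers from the same project and are not shown on this page. Purely as a hypothetical sketch of the delay strategy (the helper name, parameter meaning, and 'time' column are assumptions, not taken from the original code), each event could be shifted by a bounded random delay:
import numpy as np

def delay_mix_sketch(events, max_delay_ms):
    # Hypothetical: add a uniform random delay to every event time, then re-sort
    # so the event order matches the perturbed times.
    events = events.copy()
    events['time'] = events['time'] + np.random.uniform(0, max_delay_ms, size=len(events))
    return events.sort_values('time')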
def main():
# img_width, img_height = 48, 48
img_width, img_height = 200, 60
img_channels = 1
# batch_size = 1024
batch_size = 32
nb_epoch = 1000
post_correction = False
    save_dir = 'save_model/' + str(datetime.now()).split('.')[0].split()[0] + '/' # the model is saved under a directory named after the current date
train_data_dir = 'train_data/ip_train/'
# train_data_dir = 'train_data/single_1000000/'
val_data_dir = 'train_data/ip_val/'
    test_data_dir = 'test_data/'
weights_file_path = 'save_model/2016-10-27/weights.11-1.58.hdf5'
char_set, char2idx = get_char_set(train_data_dir)
nb_classes = len(char_set)
max_nb_char = get_maxnb_char(train_data_dir)
label_set = get_label_set(train_data_dir)
    # print('char_set:', char_set)
    print('nb_classes:', nb_classes)
    print('max_nb_char:', max_nb_char)
    print('size_label_set:', len(label_set))
model = build_shallow(img_channels, img_width, img_height, max_nb_char, nb_classes) # build CNN architecture
# model.load_weights(weights_file_path) # load trained model
val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
# val_data = None
train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
train(model, batch_size, nb_epoch, save_dir, train_data, val_data, char_set)
# train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
# test(model, train_data, char_set, label_set, post_correction)
# val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
# test(model, val_data, char_set, label_set, post_correction)
# test_data = load_data(test_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
# test(model, test_data, char_set, label_set, post_correction)
def ge_cmd_predict():
    # parse arguments, get data and model, output prediction
    args = parse_arg_predict()
    # prepare input to GE_predict
data = util.load_data(args.data)
model = util.load_model(args.model)
pred_path = args.output
pred = GE_predict(data, model)
util.write_prediction(pred, pred_path)
return
def load_data(x_path, y_path, shuffle=True):
    xs, ys = load_dataset_csv(x_path, y_path)
    n = len(xs)
    if shuffle:
        # Shuffle examples and labels with one permutation so they stay aligned
        # (assumes xs and ys are numpy arrays).
        perm = np.random.permutation(n)
        xs, ys = xs[perm], ys[perm]
    return xs, ys, n
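A quick toy check of why one permutation must index both arrays (illustrative only, not from the original source):
import numpy as np

xs = np.arange(5)
ys = xs * 10
perm = np.random.permutation(len(xs))
# The same permutation indexes both arrays, so each label stays with its example.
assert np.array_equal(xs[perm] * 10, ys[perm])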
def preprocess_villani(in_file, out_file, long_fixed_out_file):
"""
Preprocess the raw Villani dataset and extend the long fixed dataset
"""
df = pd.read_csv(in_file, index_col=[0, 1])
# Make age a binary target, <30 and >=30
df['age'] = df['agegroup'].map({
'under20': '<30',
'20-29': '<30',
'30-39': '>=30',
'40-49': '>=30',
'50-59': '>=30',
'over60': '>=30'}
)
# Ignore missing data
df = df.dropna()
df = remove_repeated_keys(df)
    # Combine the Villani fixed-text input with the citefa dataset's fixed text
long_fixed = load_data('long_fixed')
slf = long_fixed.groupby(level=[0, 1]).size()
villani_fixed = df[df['inputtype'] == 'fixed']
villani_fixed = villani_fixed.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
villani_fixed = villani_fixed.reset_index(level=[0, 1], drop=True)
villani_fixed = reduce_dataset(villani_fixed, min_samples=10, max_samples=10)
long_fixed = pd.concat([long_fixed, villani_fixed])
long_fixed = long_fixed[COLS]
long_fixed.to_csv(long_fixed_out_file)
# Free-text input only
villani_free = df[df['inputtype'] == 'free']
villani_free = villani_free.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
villani_free = villani_free.reset_index(level=[0, 1], drop=True)
villani_free = reduce_dataset(villani_free, min_samples=10, max_samples=10)
villani_free = villani_free[COLS]
villani_free.to_csv(out_file)
return