def get(languages, feature_set_str, header=False, random=False, minimal=False):
    """Look up typological feature vectors for a set of languages.

    Parameters
    ----------
    languages : str
        Space-separated ISO 639-3 language codes (e.g. "deu eng fra swe").
    feature_set_str : str
        The feature set(s) of interest (e.g. "syntax_knn" or "fam"),
        joined by concatenation (+) or element-wise union (|); passed
        straight through to ``get_concatenated_sets``.
    header : bool, optional
        If True, print the retained feature names as a tab-separated row
        (prefixed with 'CODE') to stdout.
    random : bool, optional
        If True, replace all feature values with random booleans
        (e.g. to build a control group).
    minimal : bool, optional
        If True, suppress columns whose value is uniformly 0, uniformly 1,
        or uniformly -1 (presumably the null/missing marker — TODO confirm)
        across all requested languages.

    Returns
    -------
    tuple
        ``(feats, feat_names)`` where ``feats`` maps each language code to
        its 1-D vector of retained feature values, and ``feat_names`` is
        the array of retained feature names.
    """
    lang_codes = languages.split()
    feature_names, feature_values = get_concatenated_sets(lang_codes, feature_set_str)
    # Normalize feature names so they are single whitespace-free tokens.
    feature_names = np.array([f.replace(" ", "_") for f in feature_names])

    if minimal:
        # A column carries no information when every language shares the
        # same constant value (0, 1, or -1).
        mask = np.all(feature_values == 0.0, axis=0)
        mask |= np.all(feature_values == 1.0, axis=0)
        mask |= np.all(feature_values == -1.0, axis=0)
        unmasked_indices = np.where(np.logical_not(mask))
    else:
        # Keep every column.
        unmasked_indices = np.where(np.ones(feature_values.shape[1]))

    if random:
        # Control condition: discard real values in favor of random booleans.
        feature_values = np.random.random(feature_values.shape) >= 0.5

    feat_names = feature_names[unmasked_indices]
    if header:
        print("\t".join(['CODE'] + list(feat_names)))

    feats = {}
    for i, lang_code in enumerate(lang_codes):
        feats[lang_code] = feature_values[i, unmasked_indices].ravel()
    return feats, feat_names
#if __name__ == '__main__':
# argparser = argparse.ArgumentParser()
# argparser.add_argument("languages", default='', help="The languages of interest, in ISO 639-3 codes, separated by spaces (e.g., \"deu eng fra swe\")")
# argparser.add_argument("feature_set", default='', help="The feature set or sets of interest (e.g., \"syntax_knn\" or \"fam\"), joined by concatenation (+) or element-wise union (|).")
# argparser.add_argument("-f", "--fields", default=False, action="store_true", help="Print feature names as the first row of data.")
# argparser.add_argument("-r", "--random", default=False, action="store_true", help="Randomize all feature values (e.g., to make a control group).")
# argparser.add_argument("-m", "--minimal", default=False, action="store_true", help="Suppress columns that are all 0, all 1, or all nulls.")
# args = argparser.parse_args()
# get(args.languages, args.feature_set, args.fields, args.random, args.minimal)