import os
from collections import OrderedDict, defaultdict


def compress_traindata(args):
    """Count vocabulary items in the training file and rewrite it with
    rare words/suffixes mapped to unknown tokens."""
    words = OrderedDict()
    print("reading embedding vocabulary")
    with open(args.vocab) as vocab:
        for word in vocab:
            words[word.strip()] = 1
    suffixes = defaultdict(int)
    suffixes["UNK"] = 1
    caps = defaultdict(int)
    target = defaultdict(int)
    traindata = open(args.path)
    len_traindata = 0
    print("reading training file")
    # each training line is a sequence of word|suffix|cap tokens
    # followed by the target label
    for line in traindata:
        len_traindata += 1
        items = line.strip().split(" ")
        target[items[-1]] += 1
        for item in items[:-1]:
            word, suffix, cap = item.split("|")
            # dict.has_key was removed in Python 3; use get instead
            words[word] = words.get(word, 0) + 1
            suffixes[suffix] += 1
            caps[cap] += 1
    def out_dict(d, outfile, freq_cut=-1):
        """Write items with frequency >= freq_cut and return an
        item-to-id mapping over the items actually written."""
        print("writing to {}".format(outfile))
        res = {}
        with open(outfile, "w") as out:
            i = 0
            for item, n in d.items():
                if freq_cut <= n:
                    out.write("{} {}\n".format(item, n))
                    # only items surviving the cutoff get an id, so ids
                    # stay aligned with the lines of the output file
                    res[item] = i
                    i += 1
        return res
    word2id = out_dict(words, os.path.join(args.out, "words.txt"))
    suffix2id = out_dict(suffixes, os.path.join(args.out, "suffixes.txt"))
    cap2id = out_dict(caps, os.path.join(args.out, "caps.txt"))
    # target labels seen fewer than 10 times are dropped entirely
    target2id = out_dict(target, os.path.join(args.out, "target.txt"), freq_cut=10)
    # second pass: rewrite the training file, replacing out-of-vocabulary
    # words/suffixes with unknown tokens and skipping rare targets
    traindata.seek(0)
    new_traindata = os.path.join(args.out, "traindata.txt")
    print("writing to {}".format(new_traindata))
    with open(new_traindata, "w") as out:
        for line in traindata:
            items = line.strip().split(" ")
            if items[-1] not in target2id:
                continue
            label = items[-1]
            new_line = ""
            for item in items[:-1]:
                word, suffix, cap = item.split("|")
                if word not in word2id:
                    word = "*UNKNOWN*"
                if suffix not in suffix2id:
                    suffix = "UNK"
                new_line += "|".join([word, suffix, cap]) + " "
            out.write(new_line + label + "\n")
    traindata.close()
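
# A minimal sketch of how this function might be driven from the command
# line; the flag names below are assumptions inferred from the attributes
# (args.vocab, args.path, args.out) that compress_traindata reads.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="compress training data using an embedding vocabulary")
    parser.add_argument("--vocab", required=True,
                        help="embedding vocabulary file, one word per line")
    parser.add_argument("--path", required=True,
                        help="training file of word|suffix|cap ... target lines")
    parser.add_argument("--out", required=True,
                        help="output directory (must already exist)")
    compress_traindata(parser.parse_args())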