def vectors_from_txtfile(fname, codec, limit=-1, mutagen=None):
    """Encode each line of a text file and return the one-hot encoded matrix.

    Args:
        fname: Path to a text file, one encodable string per line.
        codec: Project codec object exposing ``encode(line, mutagen=...)``
            (which raises NonEncodableTextException for lines it cannot
            handle) and an ``alphabet`` sequence whose length is the number
            of one-hot categories.
        limit: Stop after this many successfully encoded lines; -1 (the
            default) means no limit.
        mutagen: Optional argument forwarded verbatim to ``codec.encode``.

    Returns:
        The result of ``OneHotEncoder(...).fit_transform`` over the encoded
        vectors (a sparse matrix in typical sklearn versions).
    """
    skipped = Counter()  # maps skip reason -> number of lines skipped
    vecs = []
    # Context manager guarantees the file is closed even if encoding raises
    # (the original opened the file and never closed it).
    with open(fname) as f:
        for line in f:
            line = line.strip()
            try:
                vecs.append(codec.encode(line, mutagen=mutagen))
                # Checked only after a successful append, so skipped lines
                # don't count toward the limit. limit=-1 never matches.
                if len(vecs) == limit:
                    break
            except NonEncodableTextException as e:
                # Too long, or illegal characters
                skipped[e.reason] += 1
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Gathered %d vectors. Skipped %d (%s)",
                  len(vecs), sum(skipped.values()), dict(skipped))
    vecs = np.asarray(vecs)
    # TODO: Why default to dtype=float? Seems wasteful? Maybe it doesn't
    # really matter. Actually, docs here seem inconsistent? Constructor docs
    # say default float. transform docs say int. Should file a bug on sklearn.
    return OneHotEncoder(len(codec.alphabet)).fit_transform(vecs)
# Adapted from sklearn.utils.extmath.softmax
# Web-scrape residue (page navigation text), kept as comments so the file parses:
# 评论列表 ("comment list")
# 文章目录 ("article table of contents")