def vectors_from_txtfile(fname, codec, limit=-1, mutagen=None):
    """Encode each line of a text file and return the one-hot encoded matrix.

    Args:
        fname: Path to a text file, one encodable string per line.
        codec: Project codec object exposing ``encode(line, mutagen=...)``
            (which raises NonEncodableTextException for lines it cannot
            handle) and an ``alphabet`` sequence whose length is the number
            of one-hot categories.
        limit: Stop after this many successfully encoded lines; -1 (the
            default) means no limit.
        mutagen: Optional argument forwarded verbatim to ``codec.encode``.

    Returns:
        The result of ``OneHotEncoder(...).fit_transform`` over the encoded
        vectors (a sparse matrix in typical sklearn versions).
    """
    skipped = Counter()  # maps skip reason -> number of lines skipped
    vecs = []
    # Context manager guarantees the file is closed even if encoding raises
    # (the original opened the file and never closed it).
    with open(fname) as f:
        for line in f:
            line = line.strip()
            try:
                vecs.append(codec.encode(line, mutagen=mutagen))
                # Checked only after a successful append, so skipped lines
                # don't count toward the limit. limit=-1 never matches.
                if len(vecs) == limit:
                    break
            except NonEncodableTextException as e:
                # Too long, or illegal characters
                skipped[e.reason] += 1
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Gathered %d vectors. Skipped %d (%s)",
                  len(vecs), sum(skipped.values()), dict(skipped))
    vecs = np.asarray(vecs)
    # TODO: Why default to dtype=float? Seems wasteful? Maybe it doesn't
    # really matter. Actually, docs here seem inconsistent? Constructor docs
    # say default float. transform docs say int. Should file a bug on sklearn.
    return OneHotEncoder(len(codec.alphabet)).fit_transform(vecs)
# Adapted from sklearn.utils.extmath.softmax
# Web-scrape residue (page navigation text), kept as comments so the file parses:
# 评论列表 ("comment list")
# 文章目录 ("article table of contents")