corpus_vectors.py 文件源码-python代码片段

corpus_vectors.py 文件源码

python

阅读 20 收藏 0 点赞 0 评论 0

项目：Word2Vec 作者: hashbangCoder 项目源码文件源码

def extractVecs():
    """Load pretrained word vectors from the text file at ``options.pretrained``.

    Every non-blank line is expected to contain a token followed by its
    vector components, separated by single spaces. Each line is lower-cased
    before it is split. The file is parsed by hand because the original
    author found ``pandas.read_csv`` unreliable on this input.

    Returns:
        A ``(globalWordTokens, globalWordVectors)`` tuple of ``numpy.matrix``
        objects: an ``(N, 1)`` string matrix of tokens and an ``(N, D)``
        float matrix whose row ``i`` holds the vector of token ``i``.

    Raises:
        ValueError: if the rows do not all have the same number of
            components, or a component is not a valid float literal.
    """
    # time.clock() no longer exists on Python >= 3.8; default_timer is the
    # portable wall-clock timer on both Python 2 and Python 3.
    from timeit import default_timer

    t0 = default_timer()

    tokens = []
    components = []
    with open(options.pretrained, 'r') as f:
        # Stream line by line rather than materialising the file via readlines().
        for line in f:
            fields = line.rstrip().lower().split(' ')
            if fields == ['']:
                # Blank line (e.g. a trailing newline at EOF): it would make
                # the rows ragged and break the matrix construction below.
                continue
            tokens.append(fields[0])
            components.append(fields[1:])

    # Tokens and numbers are converted separately so the numeric columns are
    # never padded to the width of the longest token in a string matrix.
    globalWordTokens = np.asmatrix(tokens, dtype=str).T
    # Builtin float replaces the removed ``np.float`` alias (same dtype).
    globalWordVectors = np.asmatrix(components, dtype=float)

    # Single formatted string: prints the same text on Python 2 and Python 3.
    print("%s  seconds taken for loading and slicing gLoVe Word Vectors"
          % (default_timer() - t0))
    return globalWordTokens, globalWordVectors