将word2vec bin文件转换为文本

发布于 2021-01-29 18:32:00

我可以从word2vec网站下载GoogleNews-vectors-
negative300.bin.gz。.bin文件(大约3.4GB)是对我无用的二进制格式。Tomas
Mikolov向我们保证:“将二进制格式转换为文本格式应该很简单(尽管这将占用更多的磁盘空间)。可以查看distance工具中的代码,读取二进制文件相当简单。”
不幸的是,我对C的了解不足,无法理解http://word2vec.googlecode.com/svn/trunk/distance.c 中的代码。

据说gensim也能做到这一点,但我找到的教程似乎都是关于从文本格式转换的,而不是反过来。

有人能建议如何修改C代码,或者给出让gensim输出文本格式的方法吗?

关注者
0
被浏览
188
1 个回答
  • 面试哥
    面试哥 2021-01-29
    为面试而生,有面试问题,就找面试哥。

    在word2vec-toolkit邮件列表上,Thomas
    Mensink以一个小型C程序的形式给出了答案,该程序可将.bin文件转换为文本。它是对distance.c文件的修改。我用下面Thomas的代码替换了原来的distance.c,重新编译了word2vec(make
    clean; make),并将编译得到的distance重命名为readbin。然后运行./readbin vector.bin即可输出vector.bin的文本版本。

    //  Copyright 2013 Google Inc. All Rights Reserved.
    //
    //  Licensed under the Apache License, Version 2.0 (the "License");
    //  you may not use this file except in compliance with the License.
    //  You may obtain a copy of the License at
    //
    //      http://www.apache.org/licenses/LICENSE-2.0
    //
    //  Unless required by applicable law or agreed to in writing, software
    //  distributed under the License is distributed on an "AS IS" BASIS,
    //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    //  See the License for the specific language governing permissions and
    //  limitations under the License.
    
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <malloc.h>
    #include <ctype.h>
    #include <stdlib.h>
    
    // File-scope sizing constants.
    const long long max_size = 2000;         // max length of strings
    const long long N = 40;                  // number of closest words that will be shown (not referenced by main() below)
    const long long max_w = 50;              // max length of vocabulary entries; bytes reserved per word in the vocab buffer
    
    /*
     * Convert a word2vec binary vector file to text on stdout.
     *
     * Usage: <program> FILE
     *
     * Input layout, as read below: an ASCII header "<words> <size>", then for
     * each word a whitespace-terminated token followed by <size> raw floats.
     *
     * Output: a header line "<words> <size> #File: <name>", then one line per
     * word: the token followed by its <size> components, space separated, with
     * no trailing space.
     *
     * NOTE: every vector is scaled to unit Euclidean length before printing
     * (behaviour inherited from the distance tool this was derived from), so
     * the printed values are not the raw stored values. All-zero vectors are
     * left unchanged instead of being turned into NaN.
     *
     * Returns 0 on success or when only the usage text is printed, -1 on any
     * error.
     */
    int main(int argc, char **argv) {
      FILE *f;
      const char *file_name;
      float len;
      long long words = 0, size = 0, a, b;
      int c;
      float *M = NULL;
      float *vec;
      char *vocab = NULL;
      char *word;
      int status = -1;

      if (argc < 2) {
        printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
        return 0;
      }
      // Use argv[1] directly: copying it into a fixed-size buffer could overflow.
      file_name = argv[1];
      f = fopen(file_name, "rb");
      if (f == NULL) {
        printf("Input file not found\n");
        return -1;
      }
      if (fscanf(f, "%lld", &words) != 1 || fscanf(f, "%lld", &size) != 1 ||
          words <= 0 || size <= 0) {
        fprintf(stderr, "Invalid header: expected positive word count and vector size\n");
        goto done;
      }
      // Refuse dimensions whose byte counts would not fit in size_t.
      if ((unsigned long long)words > (size_t)-1 / (size_t)max_w ||
          (unsigned long long)words > (size_t)-1 / sizeof(float) / (unsigned long long)size) {
        fprintf(stderr, "Header dimensions too large: %lld  %lld\n", words, size);
        goto done;
      }
      vocab = malloc((size_t)words * (size_t)max_w);
      M = malloc((size_t)words * (size_t)size * sizeof *M);
      if (vocab == NULL || M == NULL) {
        printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
               (long long)((unsigned long long)words * (unsigned long long)size * sizeof(float) / 1048576),
               words, size);
        goto done;
      }
      for (b = 0; b < words; b++) {
        word = &vocab[b * max_w];
        vec = &M[b * size];
        // Skip whitespace before the token (e.g. the newline that may follow
        // the previous vector), as fscanf("%s") did.
        do {
          c = fgetc(f);
        } while (c != EOF && isspace(c));
        // Read the token up to the next whitespace byte, which is consumed.
        // Bytes beyond max_w - 1 are dropped so the slot cannot overflow.
        a = 0;
        while (c != EOF && !isspace(c)) {
          if (a < max_w - 1) {
            word[a++] = (char)c;
          }
          c = fgetc(f);
        }
        word[a] = '\0';
        if (a == 0 || fread(vec, sizeof *vec, (size_t)size, f) != (size_t)size) {
          fprintf(stderr, "Unexpected end of file at word %lld of %lld\n", b, words);
          goto done;
        }
        len = 0;
        for (a = 0; a < size; a++) {
          len += vec[a] * vec[a];
        }
        len = sqrt(len);
        // Guard against division by zero for an all-zero vector.
        if (len > 0) {
          for (a = 0; a < size; a++) {
            vec[a] /= len;
          }
        }
      }
      //Code added by Thomas Mensink
      //output the vectors of the binary format in text
      printf("%lld %lld #File: %s\n", words, size, file_name);
      for (a = 0; a < words; a++) {
        printf("%s", &vocab[a * max_w]);
        // Separator goes before each value, so no trailing space (and no
        // backspace bytes) end up in redirected output.
        for (b = 0; b < size; b++) {
          printf(" %f", M[a * size + b]);
        }
        printf("\n");
      }
      status = 0;

    done:
      free(M);
      free(vocab);
      fclose(f);
      return status;
    }
    

    我从printf中删除了"\b\b"。

    顺便说一句,生成的文本文件仍然包含单词文本列和一些不必要的空白,而我在做某些数值计算时并不需要它们。我使用bash命令从每一行中删除了开头的文本列和末尾的空白。

    # Drop the first space-delimited field (the word) from every line and save the rest.
    # NOTE: --complement is a GNU cut extension.
    cut --complement -d ' ' -f 1 GoogleNews-vectors-negative300.txt > GoogleNews-vectors-negative300_tuples-only.txt
    # Strip one trailing space from each line. Without -i the result goes to stdout;
    # the input file itself is left unchanged.
    sed 's/ $//' GoogleNews-vectors-negative300_tuples-only.txt
    


知识点
面圈网VIP题库

面圈网VIP题库全新上线,海量真题题库资源。 90大类考试,超10万份考试真题开放下载啦

去下载看看