def main():
    """Build and persist the artifacts for an artist-wiki text corpus.

    Pipeline (all outputs written under the ``OUT_PREFIX`` path):
      1. doc index     -> ``<prefix>_docindex.p``   (pickled [(page_id, title)])
      2. bag-of-words  -> ``<prefix>_bow.mm``       (Matrix Market)
      3. dictionary    -> ``<prefix>_wordids.txt.bz2``
      4. TF-IDF model  -> ``<prefix>.tfidf_model``
      5. TF-IDF corpus -> ``<prefix>_tfidf.mm``

    Relies on module-level names defined elsewhere in this file:
    OUT_PREFIX, DEFAULT_DICT_SIZE, get_cursor, pages_gen, and the gensim
    classes WikiCorpus / MmCorpus / Dictionary / TfidfModel.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # Lazy %-style args: let logging do the formatting only if emitted.
    logger.info("running %s", ' '.join(sys.argv))
    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE
    # The doc index: (page_id, title) ordered by id, so row i of the index
    # corresponds to document i of the corpus built from the same query order.
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # `with` guarantees the pickle file is closed (the original leaked the handle).
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)
    lemmatize = True  # originally: 'lemma' in program
    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # Only keep the most frequent words (keep_words was previously unused;
    # it equals DEFAULT_DICT_SIZE, so behavior is unchanged).
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=keep_words)
    # Save dictionary and bag-of-words (term-document frequency matrix).
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    # Initialize corpus reader and word->id mapping.
    mm = MmCorpus(outp + '_bow.mm')
    # Build tfidf, ~50min.
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')
    # Save tfidf vectors in Matrix Market format (another long task).
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    logger.info("finished running %s", program)
# Source listing: make_wikicorpus.py (python)
# Hosting-site footer (translated from Chinese): reads 20 · favorites 0 ·
# likes 0 · comments 0 · comment list · article table of contents