def formatTime(seconds):
    """
    Takes a number of elapsed seconds and returns a string in the format h:mm.
    Leftover seconds are discarded.
    """
    m, s = divmod(seconds, 60)   # s (the leftover seconds) is intentionally unused
    h, m = divmod(m, 60)
    return "%d:%02d" % (h, m)
# TODO - Add example code for loading each item back from disk (if needed).
# - Maybe a commented line below the 'save' command?
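The save command referenced in the TODO is not shown here, so the following is only a sketch of the load step, assuming a gensim Word2Vec model persisted with model.save(); the class and filename are hypothetical:

from gensim.models import Word2Vec

# Hypothetical: reload a model previously persisted with model.save('word2vec.model')
model = Word2Vec.load('word2vec.model')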
# ======== main ========
# Main entry point for the script.
# This guard is needed because of the multiprocessing module (which is used by
# WikiCorpus). Without it, the code will spawn processes endlessly and hang!
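A minimal sketch of the guard being described, assuming the script's entry function is named main():

if __name__ == '__main__':
    main()  # keeps child processes from re-importing and re-running the script body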
Example source code for the Python class WikiCorpus()
wiki.py source, from the project DataScience-And-MachineLearning-Handbook-For-Coders, author wxyyxc1992:
def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
    """
    Convert a Wikipedia dump into plain text.
    Arguments:
    wiki_data_path -- path to the Wikipedia dump file
    """
    if not wiki_data_path:
        print("Please provide the Wiki dump path; it can be downloaded from https://dumps.wikimedia.org/zhwiki/")
        exit()
    # Extract the article texts from the dump
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0
    with open(wiki_texts_path, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
    print("Done. Use OpenCC to convert the Traditional Chinese text to Simplified Chinese.")
def zhwiki2chars(in_file, out_file):
reg = re.compile(r'^[a-zA-Z]+$')
def _isalpha(string):
return reg.match(string) is not None
i = 0
    out = open(out_file, 'w', encoding='utf-8')
wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
for article in wiki.get_texts():
tokens = []
for token in article:
token = token.decode("utf-8").strip()
if _isalpha(token):
continue
            tokens.append(" ".join(token))  # split the token into space-separated characters
out.write(" ".join(tokens) + "\n")
i += 1
if i % 10000 == 0:
print("process %d articles" % i)
out.close()
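Hypothetical usage, with illustrative file names:

zhwiki2chars('zhwiki-latest-pages-articles.xml.bz2', 'zhwiki_chars.txt')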
def main():
if len(sys.argv) != 2:
print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
exit()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
texts_num = 0
with io.open("wiki_texts.txt",'w',encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(b' '.join(text).decode('utf-8') + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
def main():
if len(sys.argv) != 2:
print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
exit()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
texts_num = 0
with open("wiki_texts.txt",'w',encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(' '.join(text) + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
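The only real difference from the previous variant is the join: older gensim releases yield each article from get_texts() as a list of bytes, newer ones as a list of str. A version-agnostic replacement for the write line, as a sketch:

tokens = [t.decode('utf-8') if isinstance(t, bytes) else t for t in text]
output.write(' '.join(tokens) + '\n')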
def set_wiki_to_txt(self, wiki_data_path=None):
    if wiki_data_path is None:
        # No path was given; fall back to the command-line argument
        if len(sys.argv) != 2:
            print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
            exit()
        else:
            wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    else:
        wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    # wiki.xml convert to wiki.txt
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # get_texts() yields bytes here; join, then decode as UTF-8
            output.write(b' '.join(text).decode('utf-8') + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                logging.info("Processed %d articles so far" % text_count)
    print("Done!")
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    # forward filter_namespaces instead of silently ignoring it
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary,
                             filter_namespaces=filter_namespaces)
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    # forward filter_namespaces instead of silently ignoring it
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary,
                             filter_namespaces=filter_namespaces)
    self.traincorpusfname = None
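Both wrappers forward dictionary={} to WikiCorpus. Passing a dictionary (even an empty one) tells gensim to skip the vocabulary-building pass over the entire dump, which is the right choice when only get_texts() is needed; a sketch with an illustrative file name:

wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', dictionary={})  # no vocabulary scan
for tokens in wiki.get_texts():
    print(tokens[:10])  # first few tokens of the first streamed article
    break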
def wikiToTxt(self):
# This function takes about 25 minutes
from gensim.corpora import WikiCorpus
wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
texts_num = 0
with open('./build/wiki_texts.txt', 'w', encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(b' '.join(text).decode('utf-8') + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
def get_save_wikitext(wiki_filename, text_filename):
    output = open(text_filename, 'w', encoding='utf-8')
    wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
    i = 0  # article counter
    for text in wiki.get_texts():
        # text = delNOTNeedWords(text,"../../stopwords.txt")[1]
        output.write(" ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logging.info("Saved " + str(i) + " articles")
    output.close()
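A hypothetical call, with illustrative file names:

get_save_wikitext('enwiki-latest-pages-articles.xml.bz2', 'wiki_text.txt')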
create_LDA_model.py source, from the project twitter_LDA_topic_modeling, author kenneth-orton:
def main():
parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
subparsers = parser.add_subparsers(dest='mode')
text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc', help='Directory where tweet documents are stored')
text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc', help='Location of compressed Wikipedia dump')
wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', type=int, dest='num_topics', help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', type=int, dest='num_pass', help='Number of passes through corpus when training the LDA model')
lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location and name to save LDA model')
lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of LDA model')
argcomplete.autocomplete(parser)
args = parser.parse_args()
if args.mode == 'text':
doc_corpus = DocCorpus(args.docs_loc, args.lemma)
doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
doc_corpus.dictionary.save(args.corp_loc + '.dict')
if args.mode == 'wiki':
        wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=args.lemma, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)
wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
wiki_corpus.dictionary.save(args.corp_loc + '.dict')
if args.mode == 'lda':
build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)
if args.mode == 'ldavis':
build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
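Hypothetical end-to-end invocations of the wiki, lda, and ldavis modes (paths and parameter values are illustrative):

# python create_LDA_model.py wiki -w enwiki-latest-pages-articles.xml.bz2 -c wiki_corpus
# python create_LDA_model.py lda -c wiki_corpus.mm -d wiki_corpus.dict -n 100 -p 5 -l wiki_lda
# python create_LDA_model.py ldavis -c wiki_corpus.mm -d wiki_corpus.dict -l wiki_lda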