def formatTime(seconds):
    """
    Takes a number of elapsed seconds and returns a string in the format h:mm.
    Leftover seconds are discarded.
    """
    m, s = divmod(seconds, 60)   # s (the leftover seconds) is intentionally unused
    h, m = divmod(m, 60)
    return "%d:%02d" % (h, m)
# TODO - Add example code for loading each item back from disk (if needed).
# - Maybe a commented line below the 'save' command?
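The save command referenced in the TODO is not shown here, so the following is only a sketch of the load step, assuming a gensim Word2Vec model persisted with model.save(); the class and filename are hypothetical:

from gensim.models import Word2Vec

# Hypothetical: reload a model previously persisted with model.save('word2vec.model')
model = Word2Vec.load('word2vec.model')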
# ======== main ========
# Main entry point for the script.
# This guard is needed because of the multiprocessing module (which is used by
# WikiCorpus). Without it, the code will spawn processes endlessly and hang!
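A minimal sketch of the guard being described, assuming the script's entry function is named main():

if __name__ == '__main__':
    main()  # keeps child processes from re-importing and re-running the script body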
Example source code for the Python class WikiCorpus()
wiki.py source, from the project DataScience-And-MachineLearning-Handbook-For-Coders, author wxyyxc1992:
def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
    """
    Convert a Wikipedia dump into plain text.
    Arguments:
    wiki_data_path -- path to the Wikipedia dump file
    """
    if not wiki_data_path:
        print("Please provide the Wiki dump path; it can be downloaded from https://dumps.wikimedia.org/zhwiki/")
        exit()
    # Extract the article texts from the dump
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0
    with open(wiki_texts_path, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
    print("Done. Use OpenCC to convert the Traditional Chinese text to Simplified Chinese.")
def zhwiki2chars(in_file, out_file):
reg = re.compile(r'^[a-zA-Z]+$')
def _isalpha(string):
return reg.match(string) is not None
i = 0
    out = open(out_file, 'w', encoding='utf-8')
wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
for article in wiki.get_texts():
tokens = []
for token in article:
token = token.decode("utf-8").strip()
if _isalpha(token):
continue
            tokens.append(" ".join(token))  # split the token into space-separated characters
out.write(" ".join(tokens) + "\n")
i += 1
if i % 10000 == 0:
print("process %d articles" % i)
out.close()
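Hypothetical usage, with illustrative file names:

zhwiki2chars('zhwiki-latest-pages-articles.xml.bz2', 'zhwiki_chars.txt')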
def main():
if len(sys.argv) != 2:
print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
exit()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
texts_num = 0
with io.open("wiki_texts.txt",'w',encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(b' '.join(text).decode('utf-8') + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
def main():
if len(sys.argv) != 2:
print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
exit()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
texts_num = 0
with open("wiki_texts.txt",'w',encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(' '.join(text) + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
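The only real difference from the previous variant is the join: older gensim releases yield each article from get_texts() as a list of bytes, newer ones as a list of str. A version-agnostic replacement for the write line, as a sketch:

tokens = [t.decode('utf-8') if isinstance(t, bytes) else t for t in text]
output.write(' '.join(tokens) + '\n')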
def set_wiki_to_txt(self, wiki_data_path=None):
    if wiki_data_path is None:
        # No path was given; fall back to the command-line argument
        if len(sys.argv) != 2:
            print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
            exit()
        else:
            wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    else:
        wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    # wiki.xml convert to wiki.txt
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # get_texts() yields bytes here; join, then decode as UTF-8
            output.write(b' '.join(text).decode('utf-8') + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                logging.info("Processed %d articles so far" % text_count)
    print("Done!")
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    # forward filter_namespaces instead of silently ignoring it
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary,
                             filter_namespaces=filter_namespaces)
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    # forward filter_namespaces instead of silently ignoring it
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary,
                             filter_namespaces=filter_namespaces)
    self.traincorpusfname = None
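Both wrappers forward dictionary={} to WikiCorpus. Passing a dictionary (even an empty one) tells gensim to skip the vocabulary-building pass over the entire dump, which is the right choice when only get_texts() is needed; a sketch with an illustrative file name:

wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', dictionary={})  # no vocabulary scan
for tokens in wiki.get_texts():
    print(tokens[:10])  # first few tokens of the first streamed article
    break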
def wikiToTxt(self):
# This function takes about 25 minutes
from gensim.corpora import WikiCorpus
wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
texts_num = 0
with open('./build/wiki_texts.txt', 'w', encoding='utf-8') as output:
for text in wiki_corpus.get_texts():
output.write(b' '.join(text).decode('utf-8') + '\n')
texts_num += 1
if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
def get_save_wikitext(wiki_filename, text_filename):
    output = open(text_filename, 'w', encoding='utf-8')
    wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
    i = 0  # article counter
    for text in wiki.get_texts():
        # text = delNOTNeedWords(text,"../../stopwords.txt")[1]
        output.write(" ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logging.info("Saved " + str(i) + " articles")
    output.close()
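A hypothetical call, with illustrative file names:

get_save_wikitext('enwiki-latest-pages-articles.xml.bz2', 'wiki_text.txt')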
create_LDA_model.py source, from the project twitter_LDA_topic_modeling, author kenneth-orton:
def main():
parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
subparsers = parser.add_subparsers(dest='mode')
text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc', help='Directory where tweet documents are stored')
text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc', help='Location of compressed Wikipedia dump')
wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', type=int, dest='num_topics', help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', type=int, dest='num_pass', help='Number of passes through corpus when training the LDA model')
lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location and name to save LDA model')
lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of LDA model')
argcomplete.autocomplete(parser)
args = parser.parse_args()
if args.mode == 'text':
doc_corpus = DocCorpus(args.docs_loc, args.lemma)
doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
doc_corpus.dictionary.save(args.corp_loc + '.dict')
if args.mode == 'wiki':
        wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=args.lemma, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)
wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
wiki_corpus.dictionary.save(args.corp_loc + '.dict')
if args.mode == 'lda':
build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)
if args.mode == 'ldavis':
build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
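Hypothetical end-to-end invocations of the wiki, lda, and ldavis modes (paths and parameter values are illustrative):

# python create_LDA_model.py wiki -w enwiki-latest-pages-articles.xml.bz2 -c wiki_corpus
# python create_LDA_model.py lda -c wiki_corpus.mm -d wiki_corpus.dict -n 100 -p 5 -l wiki_lda
# python create_LDA_model.py ldavis -c wiki_corpus.mm -d wiki_corpus.dict -l wiki_lda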