def fetch(self):
    # Cut the text into semi-redundant sequences of maxlen characters.
    # text = self.text
    text = self.next_text()
    chars = self.chars
    maxlen = self.maxlen
    step = self.step
    maxlen = 20  # note: overrides self.maxlen
    step = 3     # note: overrides self.step
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    # One-hot encode each input window and the character that follows it.
    # np.bool is deprecated in recent NumPy; plain bool is equivalent here.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, self.char_indices[char]] = 1
        y[i, self.char_indices[next_chars[i]]] = 1
    return text, X, y
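As a quick sanity check on the vectorization above, a toy sketch of the window and one-hot shapes it produces; text, chars, maxlen and step here are stand-ins for the instance attributes fetch() assumes:

import numpy as np

text = 'abcabcabc'
chars = sorted(set(text))                      # ['a', 'b', 'c']
char_indices = {c: i for i, c in enumerate(chars)}
maxlen, step = 4, 1

sentences = [text[i:i + maxlen] for i in range(0, len(text) - maxlen, step)]
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
print(X.shape)   # (5, 4, 3): one row per window, one one-hot vector per character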
Example source code for Python's cut()
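All of the snippets below lean on the same segmentation call. As orientation, a minimal sketch of that API under jieba's default dictionary; the sample sentence follows jieba's own README example, so the segmentation shown is only indicative:

import jieba

text = '我来到北京清华大学'
# jieba.cut() returns a generator of tokens; cut_all=False is the default "accurate" mode.
tokens = jieba.cut(text, cut_all=False)
print(' '.join(tokens))     # e.g. 我 来到 北京 清华大学
# jieba.lcut() returns the tokens as a list directly.
print(jieba.lcut(text))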
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    # Segment every line with jieba and write the space-joined tokens out.
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
def predict(text):
    # Segment the input text and join with spaces so it matches the training format.
    words = jieba.cut(text)
    words = " ".join(words)
    # Note: this local mapping is unused; tv_classfication.index2label is used below.
    index2label = {i: l.strip() for i, l in enumerate(tv_classfication.label_list)}
    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)
    x_test = np.array(x_test)

    # Restore the trained Bi-LSTM and print the predicted label.
    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)
        print(tv_classfication.index2label.get(model.predict(sess, x_test)[0]))
def lyrics():
    with open('lyrics.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    tokens = list()
    for v in data.values():
        # Segment each lyric; keep only non-blank tokens of at least two characters.
        tokens += [seg for seg in jieba.cut(v) if seg.split() and len(seg) > 1]

    # Count how often each token appears and show the ten most common.
    counter = Counter(tokens)
    print(counter.most_common(10))

    # Draw the word cloud; a CJK font is required to render Chinese characters.
    wcloud = WordCloud(font_path='NotoSansMonoCJKtc-Regular.otf').generate(' '.join(tokens))
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
def cut_words(input_file, output_file):
    count = 0
    with io.open(output_file, mode='w', encoding='utf-8') as outfile:
        with io.open(input_file, mode='r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if len(line) < 1:  # empty line
                    continue
                if line.startswith('doc'):  # start or end of a passage
                    if line == 'doc':  # end of a passage
                        outfile.write(u'\n')
                        count = count + 1
                        if (count % 1000 == 0):
                            print('%s articles were finished.......' % count)
                    continue
                for word in jieba.cut(line):
                    outfile.write(word + ' ')
    print('%s articles were finished.......' % count)
def extract_tags(key_word, a_name):
    '''
    Build a short tag string for an item name: take the first eight tokens
    produced by jieba.cut(), keep only those that jieba.analyse.extract_tags()
    also considers keywords, put the search keyword first, and return at most
    five space-joined tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]

    # Make sure the keyword appears exactly once, at the front of the tag list.
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)

    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
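A hedged usage sketch of extract_tags above; the keyword and item title are made-up examples, and the exact tags returned depend on jieba's dictionary and TF-IDF statistics:

import jieba
import jieba.analyse

title = '小米 红米Note5A 全网通版 4GB+64GB 香槟金 移动联通电信4G手机'
print(extract_tags('手机', title))
# Returns a space-joined string of at most five tags, always starting with '手机'.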
def handleLine(self, line):
    # Strip spaces, newlines and the literal string 'em' from the line.
    line = line.replace(' ', '')
    line = line.replace('\n', '')
    line = line.replace('em', '')
    # Segment the cleaned line and count every token longer than one character.
    words = jieba.cut(line)
    for word in words:
        if len(word) <= 1:
            continue
        if word in self.data:
            self.data[word] = self.data[word] + 1
        else:
            self.data[word] = 1
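The manual dictionary counting in handleLine can also be expressed with collections.Counter; a small standalone sketch of the same cleanup and filter (handle_line and the sample line are hypothetical, not part of the original class):

from collections import Counter
import jieba

def handle_line(line, counter):
    # Same cleanup and length filter as handleLine above, accumulated into a Counter.
    line = line.replace(' ', '').replace('\n', '').replace('em', '')
    counter.update(w for w in jieba.cut(line) if len(w) > 1)

counter = Counter()
handle_line('今天 天气 真好 em', counter)
print(counter.most_common(3))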
eval_data_helpers.py (project: question-classification-cnn-rnn-attention, author: sefira)
def process_data(line):
    """
    Word-break the line and strip unwanted characters.
    Returns the cleaned, space-separated sentence, or "UNK" if almost nothing is left.
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Keep only whitespace, CJK characters and the markers nmovie / nrcelebrity.
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()
    if (len(line) < 2):
        return "UNK"
    return line
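A small worked example of the "remove word" regex above, using a hand-written, already space-separated string so the result does not depend on jieba's segmentation:

import re

line = u'周星驰 演过 哪些 nmovie 2017?'   # hypothetical pre-segmented question text
ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
print(u"".join(ss).strip())
# -> '周星驰 演过 哪些 nmovie' : Latin letters, digits and punctuation are dropped,
#    while CJK characters, whitespace and the nmovie/nrcelebrity markers survive.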
def mainTestInteractive(self, sess):
    """ Try predicting the sentences that the user will enter in the console
    Args:
        sess: The current running session
    """
    # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
    # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
    # TODO: Log the questions asked for later re-use (merge with test/samples.txt)

    print('Testing: Launch interactive mode:')
    print('')
    print('Welcome to the interactive mode, here you can ask Deep Q&A the sentence you want. Don\'t have high '
          'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')

    import jieba
    while True:
        question = input(self.SENTENCES_PREFIX[0])
        if question == '' or question == 'exit':
            break

        # Segment the question so it matches the space-separated training data.
        # The original code called .decoder("GBK") here, which is not a str method;
        # on Python 3 the joined string can be used directly.
        questionc = jieba.cut(question, cut_all=False)
        question = " ".join(questionc)
        print(question)

        questionSeq = []  # Will contain the question as seen by the encoder
        answer = self.singlePredict(question, questionSeq)
        if not answer:
            print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
            continue  # Back to the beginning, try again

        print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

        if self.args.verbose:
            print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
            print(self.textData.sequence2str(answer))

        print()
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    # jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    # stopwordset = set()
    # with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
    #     for line in sw:
    #         stopwordset.add(line.strip('\n'))

    output = open('allbook-segment.txt', 'w')
    texts_num = 0

    with open("allbook.txt", "rb") as f:
        # if(f.readline() == ""):
        print("getting data")
        bookdata = f.read(190000000).decode('UTF-8')
        print("getting data OK")
        lineu = bookdata
        p = 0
        # Segment the book in 100-character chunks and write space-joined tokens out.
        for p in range(0, len(bookdata), 100):
            line = bookdata[p:p + 100]
            # print(line)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d chunks so far" % texts_num)
    output.close()
def word_seg_cn(docs):
    # Segment each Chinese sentence into a list of tokens.
    docs = [list(jieba.cut(sent)) for sent in docs]
    return docs
def cutandsplit(s):
    for ln in filterlist(splitsentence(stripblank(s))):
        l = RE_BRACKETS.sub(brcksub, ln.strip())
        if notchinese(l):
            continue
        # Normalise quote marks before segmenting; the source characters were garbled
        # in extraction and are assumed here to be the corner brackets 「」『』.
        yield ' '.join(cut(l.replace('「', '“').replace('」', '”')
                            .replace('『', '‘').replace('』', '’')
                            .lstrip(tailpunct).rstrip(headpunct)))