# Assumes: from keras.preprocessing.text import Tokenizer, text_to_word_sequence
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        # Count word occurrences over all captions, then drop every word
        # that appears fewer than self._words_min_occur times.
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
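A minimal standalone sketch of the same 'discard' step, with a hypothetical discard_rare_words helper and made-up captions (assumes Keras is installed):

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

def discard_rare_words(captions, min_occur=2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    kept = []
    for caption in captions:
        words = text_to_word_sequence(caption)
        kept.append(' '.join(w for w in words
                             if tokenizer.word_counts.get(w, 0) >= min_occur))
    return kept

# 'a' and 'dog' occur at least twice and survive; 'runs', 'sleeps',
# 'zebra' and 'grazes' occur once and are dropped.
print(discard_rare_words(['a dog runs', 'a dog sleeps', 'a zebra grazes']))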
Python text_to_word_sequence() example source code
# Assumes: from keras.preprocessing import text
def get_sequences(raw_file, word_count):
    # Tokenize each line of raw_file and accumulate word frequencies
    # into the word_count dict passed in by the caller.
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            word_seq = text.text_to_word_sequence(line)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return raw_sequences, word_count
# indices start from 1
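One plausible way the function above gets used: build the token sequences, then assign integer indices by frequency starting from 1 so that 0 stays free for padding (the file name is a placeholder):

word_count = {}
raw_sequences, word_count = get_sequences('corpus.txt', word_count)

# Most frequent word gets index 1, the next gets 2, and so on.
word_index = {}
for i, (w, _) in enumerate(sorted(word_count.items(),
                                  key=lambda x: x[1], reverse=True), start=1):
    word_index[w] = i

encoded = [[word_index[w] for w in seq] for seq in raw_sequences]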
# Assumes: from keras.preprocessing import text
def get_sequences(raw_file, word_count):
    # Each line of raw_file is expected to look like "label<TAB>sentence".
    # The labels and tokenized sentences are collected, and word_count is
    # updated with the frequency of every token seen.
    label_list = []
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            line_parts = line.strip().split('\t')
            label = line_parts[0]
            label_list.append(label)
            sentence = line_parts[1]
            word_seq = text.text_to_word_sequence(sentence)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return label_list, raw_sequences, word_count
# indices start from 1
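A plausible call pattern for the labeled variant above; the file name is a placeholder and each line is assumed to be tab-separated, e.g. '1\tthis movie was great':

word_count = {}
labels, sequences, word_count = get_sequences('train.tsv', word_count)
print(labels[0], sequences[0])  # e.g. 1 ['this', 'movie', 'was', 'great']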
# Assumes: from keras.preprocessing.text import text_to_word_sequence
def normalize_captions(self, captions_txt):
    # Append the end-of-sequence marker, lowercase and strip punctuation
    # via text_to_word_sequence, then re-join each caption into a string.
    # Note that under Python 3, map() is lazy, so this returns an iterator.
    captions_txt = self._add_eos(captions_txt)
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
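A rough standalone equivalent, since the method above depends on a class-level _add_eos helper; the '<eos>' marker and the sample caption are assumptions, and list() is used to materialize the lazy map() result:

from keras.preprocessing.text import text_to_word_sequence

def normalize_captions(captions_txt, eos_token='<eos>'):
    captions_txt = [c + ' ' + eos_token for c in captions_txt]
    word_sequences = map(text_to_word_sequence, captions_txt)
    return list(map(' '.join, word_sequences))

# text_to_word_sequence strips punctuation (including '<' and '>'), so:
print(normalize_captions(['A Dog, running fast!']))  # ['a dog running fast eos']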
# Assumes: from keras.preprocessing import text
def get_text_sequences(raw_file, word_count):
    # Same "label<TAB>sentence" format as above. word_count is updated
    # in place; note that only the labels and sequences are returned.
    label_list = []
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            line_parts = line.strip().split('\t')
            label = line_parts[0]
            label_list.append(label)
            sentence = line_parts[1]
            word_seq = text.text_to_word_sequence(sentence)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return label_list, raw_sequences
# def insert_to_global(word_count, num_words, global_word_count):
# sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
# for (word, count) in sorted_word_count[:num_words]:
# if word in global_word_count:
# global_word_count[word] += count
# else:
# global_word_count[word] = count
# Assumes: from keras.preprocessing.text import text_to_word_sequence
#          from nltk.stem import PorterStemmer, WordNetLemmatizer
def get_encoded_vector(list_of_words, new_string):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Make sure the special tokens are part of the vocabulary.
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')
    tokens = text_to_word_sequence(new_string, lower=True, split=" ")
    # Stem and lemmatize each token; fall back to the raw token on failure.
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)
    # Map each token to its vocabulary index, using UNKNOWN_WORDS as fallback.
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    # Wrap the sequence with the start and end markers.
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
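A quick smoke test with a toy vocabulary; the words are arbitrary, and NLTK's WordNet data is assumed to be downloaded (otherwise the except clause simply lets tokens pass through unstemmed):

vocab = ['the', 'cat', 'sat', 'mat']
encoded = get_encoded_vector(vocab, 'the cat sat on the mat')
# 'on' is not in the vocabulary, so it maps to the UNKNOWN_WORDS index;
# the whole sequence is wrapped with the START_SEQ and END_SEQ indices.
print(encoded)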