# Assumes: from keras.preprocessing.text import Tokenizer, text_to_word_sequence
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        # Count word occurrences over all captions, then drop every word
        # that appears fewer than self._words_min_occur times.
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
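A minimal standalone sketch of the same 'discard' step, with a hypothetical discard_rare_words helper and made-up captions (assumes Keras is installed):

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

def discard_rare_words(captions, min_occur=2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    kept = []
    for caption in captions:
        words = text_to_word_sequence(caption)
        kept.append(' '.join(w for w in words
                             if tokenizer.word_counts.get(w, 0) >= min_occur))
    return kept

# 'a' and 'dog' occur at least twice and survive; 'runs', 'sleeps',
# 'zebra' and 'grazes' occur once and are dropped.
print(discard_rare_words(['a dog runs', 'a dog sleeps', 'a zebra grazes']))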
Python text_to_word_sequence() example source code
# Assumes: from keras.preprocessing import text
def get_sequences(raw_file, word_count):
    # Tokenize each line of raw_file and accumulate word frequencies
    # into the word_count dict passed in by the caller.
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            word_seq = text.text_to_word_sequence(line)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return raw_sequences, word_count
# indices start from 1
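One plausible way the function above gets used: build the token sequences, then assign integer indices by frequency starting from 1 so that 0 stays free for padding (the file name is a placeholder):

word_count = {}
raw_sequences, word_count = get_sequences('corpus.txt', word_count)

# Most frequent word gets index 1, the next gets 2, and so on.
word_index = {}
for i, (w, _) in enumerate(sorted(word_count.items(),
                                  key=lambda x: x[1], reverse=True), start=1):
    word_index[w] = i

encoded = [[word_index[w] for w in seq] for seq in raw_sequences]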
# Assumes: from keras.preprocessing import text
def get_sequences(raw_file, word_count):
    # Each line of raw_file is expected to look like "label<TAB>sentence".
    # The labels and tokenized sentences are collected, and word_count is
    # updated with the frequency of every token seen.
    label_list = []
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            line_parts = line.strip().split('\t')
            label = line_parts[0]
            label_list.append(label)
            sentence = line_parts[1]
            word_seq = text.text_to_word_sequence(sentence)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return label_list, raw_sequences, word_count
# indices start from 1
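A plausible call pattern for the labeled variant above; the file name is a placeholder and each line is assumed to be tab-separated, e.g. '1\tthis movie was great':

word_count = {}
labels, sequences, word_count = get_sequences('train.tsv', word_count)
print(labels[0], sequences[0])  # e.g. 1 ['this', 'movie', 'was', 'great']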
# Assumes: from keras.preprocessing.text import text_to_word_sequence
def normalize_captions(self, captions_txt):
    # Append the end-of-sequence marker, lowercase and strip punctuation
    # via text_to_word_sequence, then re-join each caption into a string.
    # Note that under Python 3, map() is lazy, so this returns an iterator.
    captions_txt = self._add_eos(captions_txt)
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
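A rough standalone equivalent, since the method above depends on a class-level _add_eos helper; the '<eos>' marker and the sample caption are assumptions, and list() is used to materialize the lazy map() result:

from keras.preprocessing.text import text_to_word_sequence

def normalize_captions(captions_txt, eos_token='<eos>'):
    captions_txt = [c + ' ' + eos_token for c in captions_txt]
    word_sequences = map(text_to_word_sequence, captions_txt)
    return list(map(' '.join, word_sequences))

# text_to_word_sequence strips punctuation (including '<' and '>'), so:
print(normalize_captions(['A Dog, running fast!']))  # ['a dog running fast eos']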
# Assumes: from keras.preprocessing import text
def get_text_sequences(raw_file, word_count):
    # Same "label<TAB>sentence" format as above. word_count is updated
    # in place; note that only the labels and sequences are returned.
    label_list = []
    raw_sequences = []
    with open(raw_file) as input_file:
        for line in input_file:
            line_parts = line.strip().split('\t')
            label = line_parts[0]
            label_list.append(label)
            sentence = line_parts[1]
            word_seq = text.text_to_word_sequence(sentence)
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return label_list, raw_sequences
# def insert_to_global(word_count, num_words, global_word_count):
# sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
# for (word, count) in sorted_word_count[:num_words]:
# if word in global_word_count:
# global_word_count[word] += count
# else:
# global_word_count[word] = count
# Assumes: from keras.preprocessing.text import text_to_word_sequence
#          from nltk.stem import PorterStemmer, WordNetLemmatizer
def get_encoded_vector(list_of_words, new_string):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Make sure the special tokens are part of the vocabulary.
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')
    tokens = text_to_word_sequence(new_string, lower=True, split=" ")
    # Stem and lemmatize each token; fall back to the raw token on failure.
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)
    # Map each token to its vocabulary index, using UNKNOWN_WORDS as fallback.
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    # Wrap the sequence with the start and end markers.
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
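A quick smoke test with a toy vocabulary; the words are arbitrary, and NLTK's WordNet data is assumed to be downloaded (otherwise the except clause simply lets tokens pass through unstemmed):

vocab = ['the', 'cat', 'sat', 'mat']
encoded = get_encoded_vector(vocab, 'the cat sat on the mat')
# 'on' is not in the vocabulary, so it maps to the UNKNOWN_WORDS index;
# the whole sequence is wrapped with the START_SEQ and END_SEQ indices.
print(encoded)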