def get_text_sequences(raw_file, word_count):
    """Read a tab-separated label/sentence file and tokenize each sentence.

    Every line is stripped and split on tab characters. The first field is
    used as the label and the second as the sentence; any further fields
    are ignored. Sentences are tokenized with
    ``text.text_to_word_sequence``.

    Args:
        raw_file: File to read; passed directly to ``open()`` (default
            text mode and default encoding).
        word_count: Mapping from word to occurrence count. It is updated
            in place with every token found in the file.

    Returns:
        A ``(label_list, raw_sequences)`` tuple holding the labels and the
        corresponding token sequences, in file order.

    Raises:
        IndexError: If a line has no second tab-separated field once it
            has been stripped (for example, a blank line).
    """
    label_list = []
    raw_sequences = []
    # The context manager guarantees the file is closed even when a
    # malformed line or the tokenizer raises part-way through the file.
    with open(raw_file) as input_file:
        for line in input_file:
            line_parts = line.strip().split('\t')
            label_list.append(line_parts[0])
            word_seq = text.text_to_word_sequence(line_parts[1])
            raw_sequences.append(word_seq)
            for w in word_seq:
                word_count[w] = word_count.get(w, 0) + 1
    return label_list, raw_sequences
# def insert_to_global(word_count, num_words, global_word_count):
#     sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
#     for (word, count) in sorted_word_count[:num_words]:
#         if word in global_word_count:
#             global_word_count[word] += count
#         else:
#             global_word_count[word] = count
# (Leftover web-page navigation text, not part of the program:
#  "Comment list", "Table of contents".)