def process_lyrics(file_name):
    """Load a Chinese lyrics corpus and turn it into integer vectors.

    Cleans the corpus, keeps only songs of at least 40 characters,
    wraps each in start/end tokens, builds a frequency-sorted word
    vocabulary via jieba segmentation, and maps every lyric to a list
    of vocabulary ids.

    Args:
        file_name: path of the raw corpus file handed to
            ``clean_cn_corpus``.

    Returns:
        A 3-tuple ``(lyrics_vector, word_int_map, words)``:
        ``lyrics_vector`` — one list of int ids per lyric,
        ``word_int_map`` — token -> id mapping,
        ``words`` — tuple of tokens ordered by descending frequency,
        with ``' '`` appended as the padding / unknown token.
    """
    content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
    lyrics = []
    for line in content:
        # Drop very short songs; wrap the rest in start/end markers.
        if len(line) < 40:
            continue
        lyrics.append(start_token + line + end_token)
    # Shortest songs first (helps batching by length downstream).
    lyrics.sort(key=len)
    print('all %d songs...' % len(lyrics))

    # Segment every lyric into words with jieba.
    all_words = []
    for lyric in lyrics:
        all_words += jieba.lcut(lyric, cut_all=False)

    # Vocabulary sorted by descending frequency; most_common() already
    # performs the sort the original did by hand with key=-count.
    counter = collections.Counter(all_words)
    words, _ = zip(*counter.most_common())
    # Append a space as the padding / unknown token.
    # (The original's `words[:len(words)]` was a no-op slice.)
    words = words + (' ',)
    word_int_map = dict(zip(words, range(len(words))))

    # Map unknown symbols to the padding token's id.  The original
    # used len(words) as the default, which is one past the last valid
    # index once ' ' is part of the vocabulary.
    unknown_id = len(words) - 1
    # NOTE(review): this iterates each lyric character by character,
    # while the vocabulary was built from jieba *word* segments — many
    # multi-char words will never match a single char.  Preserved as-is;
    # confirm against the consuming model before changing.
    lyrics_vector = [[word_int_map.get(ch, unknown_id) for ch in lyric]
                     for lyric in lyrics]
    return lyrics_vector, word_int_map, words
# (page-scrape residue, not part of the source: 评论列表 / 文章目录)