def process_lyrics(file_name):
    """Load a Chinese lyrics corpus and turn it into integer vectors.

    Cleans the corpus, keeps only songs of at least 40 characters,
    wraps each in start/end tokens, builds a frequency-sorted word
    vocabulary via jieba segmentation, and maps every lyric to a list
    of vocabulary ids.

    Args:
        file_name: path of the raw corpus file handed to
            ``clean_cn_corpus``.

    Returns:
        A 3-tuple ``(lyrics_vector, word_int_map, words)``:
        ``lyrics_vector`` — one list of int ids per lyric,
        ``word_int_map`` — token -> id mapping,
        ``words`` — tuple of tokens ordered by descending frequency,
        with ``' '`` appended as the padding / unknown token.
    """
    content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
    lyrics = []
    for line in content:
        # Drop very short songs; wrap the rest in start/end markers.
        if len(line) < 40:
            continue
        lyrics.append(start_token + line + end_token)
    # Shortest songs first (helps batching by length downstream).
    lyrics.sort(key=len)
    print('all %d songs...' % len(lyrics))

    # Segment every lyric into words with jieba.
    all_words = []
    for lyric in lyrics:
        all_words += jieba.lcut(lyric, cut_all=False)

    # Vocabulary sorted by descending frequency; most_common() already
    # performs the sort the original did by hand with key=-count.
    counter = collections.Counter(all_words)
    words, _ = zip(*counter.most_common())
    # Append a space as the padding / unknown token.
    # (The original's `words[:len(words)]` was a no-op slice.)
    words = words + (' ',)
    word_int_map = dict(zip(words, range(len(words))))

    # Map unknown symbols to the padding token's id.  The original
    # used len(words) as the default, which is one past the last valid
    # index once ' ' is part of the vocabulary.
    unknown_id = len(words) - 1
    # NOTE(review): this iterates each lyric character by character,
    # while the vocabulary was built from jieba *word* segments — many
    # multi-char words will never match a single char.  Preserved as-is;
    # confirm against the consuming model before changing.
    lyrics_vector = [[word_int_map.get(ch, unknown_id) for ch in lyric]
                     for lyric in lyrics]
    return lyrics_vector, word_int_map, words
# (page-scrape residue, not part of the source: 评论列表 / 文章目录)