preprocess_data.py 文件源码-python代码片段

preprocess_data.py 文件源码

python

阅读 22 收藏 0 点赞 0 评论 0

项目：identifiera-sarkasm 作者: risnejunior 项目源码文件源码

def build_vocabulary(words, max_size):
    """Build forward and reverse word/index lookup tables from a word list.

    The most frequent words are assigned consecutive indices starting at 2,
    in descending order of frequency. Indices 0 and 1 are reserved for
    ``cfg.padding_char`` and ``cfg.placeholder_char`` respectively.

    Args:
        words: Iterable of hashable tokens (typically strings) to count.
        max_size: Total number of vocabulary entries, including the two
            reserved ones. When ``None``, ``cfg.vocabulary_size`` is used.

    Returns:
        A 4-tuple ``(unique_total, vocab_instances, vocabulary,
        rev_vocabulary)`` where:

        * ``unique_total`` is the number of distinct tokens in ``words``
          (before truncation to the vocabulary size),
        * ``vocab_instances`` is the summed occurrence count of the tokens
          that were kept,
        * ``vocabulary`` is an ``OrderedDict`` mapping token -> index,
        * ``rev_vocabulary`` is a ``dict`` mapping index -> token.
    """
    # Honour the caller-supplied size; fall back to the configured default
    # only when no explicit size is given.
    size_limit = cfg.vocabulary_size if max_size is None else max_size
    # Two slots are reserved (padding and placeholder); never go negative.
    keep = max(size_limit - 2, 0)

    unique_counts = Counter(words)
    # most_common() already yields (token, count) pairs sorted by descending
    # count, so no additional sort is required.
    ranked = unique_counts.most_common(keep)

    vocab_instances = 0
    vocabulary = OrderedDict()
    pb = Progress_bar(len(ranked) - 1)
    # Start at 2 to leave room for the padding and placeholder entries.
    for index, (token, count) in enumerate(ranked, start=2):
        vocab_instances += count
        vocabulary[token] = index
        pb.tick()

    # NOTE(review): if either reserved token also occurs in ``words`` its
    # frequency-based index is overwritten here -- confirm this is intended.
    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1

    # Inverted mapping for index -> token lookups.
    rev_vocabulary = {index: token for token, index in vocabulary.items()}

    return (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)