loc2lang.py 文件源码-python代码片段

def city_dialect_words(model, vocab, filename='./city_ranking.txt',
                       k=200, max_cities=200):
    """Rank the most city-specific vocabulary words and dump them as JSON.

    Every city listed in ``./data/cities.json`` is passed to
    ``model.predict``. The log-probabilities of each city are normalized by
    subtracting the mean log-probability over *all* cities, and the ``k``
    words with the highest normalized score are kept for each of the first
    ``max_cities`` cities. Words found in the named-entity list are prefixed
    with ``"NE_"``. The resulting ``{city name: [words]}`` mapping is written
    to ``filename`` as indented JSON with sorted keys.

    Args:
        model: object exposing ``predict(locations)``; called with a float32
            array of ``[latitude, longitude]`` rows. Assumed to return one
            row of word probabilities per input row, with columns aligned to
            ``vocab`` -- TODO confirm against the model implementation.
        vocab: sequence of words, indexed like the columns of the
            model output.
        filename: path of the JSON file to write.
        k: number of top-ranked words kept per city (default 200).
        max_cities: only the first ``max_cities`` entries of the cities file
            are ranked (default 200); ``None`` ranks every city. The
            normalization mean always uses all cities.

    Returns:
        None. The result is only written to ``filename``.

    Note:
        Relies on a module-level ``dataset_name`` (defined outside this
        function) to locate ``./dumps/ne_<dataset_name>.json``.
    """
    # Load the named entities; the JSON file stores them under the 'nes' key.
    ne_file = './dumps/ne_' + dataset_name + '.json'
    with codecs.open(ne_file, 'r', encoding='utf-8') as fin:
        named_entities = set(json.load(fin)['nes'])

    with open('./data/cities.json', 'r') as fin:
        cities = json.load(fin)

    # Predict all cities in one batch; the per-city rows are reused below
    # instead of calling the model again for every single city.
    all_locs = np.array(
        [[city['latitude'], city['longitude']] for city in cities]
    ).astype('float32')
    all_logprobs = np.log(model.predict(all_locs))
    all_logprobs_mean = np.mean(all_logprobs, axis=0)

    # Loop-invariant: build the vocabulary array once for fancy indexing.
    vocab_array = np.array(vocab)

    city_dialectwords = {}
    # zip() stops at the shorter input, so only the selected cities are ranked.
    for city, city_logprobs in zip(cities[:max_cities], all_logprobs):
        normalized_city_logprobs = city_logprobs - all_logprobs_mean
        # argsort is ascending; reverse it to get the highest scores first.
        top_indices = np.argsort(normalized_city_logprobs)[::-1][:k]
        topwords = vocab_array[top_indices].tolist()

        # Mark top words that are named entities with an "NE_" prefix.
        city_dialectwords[city['city']] = [
            'NE_' + word if word in named_entities else word
            for word in topwords
        ]

    # Write the complete mapping once, after all cities have been processed.
    with codecs.open(filename, 'w', encoding='utf-8') as fout:
        json.dump(city_dialectwords, fout, indent=4, sort_keys=True)