def city_dialect_words(model, vocab, filename='./city_ranking.txt', k=200, num_cities=200):
    """Rank the most city-specific vocabulary words and dump them as JSON.

    Every city in ``./data/cities.json`` is passed to ``model.predict`` as a
    ``[latitude, longitude]`` row. A word's score at a city is its
    log-probability there minus its mean log-probability over all cities.
    The ``k`` highest-scoring words are kept for each of the first
    ``num_cities`` cities. Words listed under ``'nes'`` in
    ``./dumps/ne_<dataset_name>.json`` are kept but prefixed with ``"NE_"``.

    Relies on the module-level name ``dataset_name`` to locate the
    named-entity file.

    Args:
        model: object whose ``predict`` accepts an (n, 2) float32 array of
            coordinates and returns one row of per-word probabilities per
            input row (presumably column-aligned with ``vocab`` -- confirm).
        vocab: sequence of words, indexed like the prediction columns.
        filename: path of the JSON file to write.
        k: number of top words kept per city.
        num_cities: how many leading entries of the city list are reported;
            ``None`` reports every city.

    Returns:
        None. A ``{city name: [top words]}`` mapping is written to
        ``filename``; if two cities share a name the later one wins.
    """
    # Named entities are flagged in the output rather than filtered out.
    ne_file = './dumps/ne_' + dataset_name + '.json'
    with codecs.open(ne_file, 'r', encoding='utf-8') as fin:
        named_entities = set(json.load(fin)['nes'])

    # Read as UTF-8 like the other files here; city names may be non-ASCII.
    with codecs.open('./data/cities.json', 'r', encoding='utf-8') as fin:
        cities = json.load(fin)

    all_locs = np.array(
        [[city['latitude'], city['longitude']] for city in cities]
    ).astype('float32')
    # NOTE(review): a zero probability yields -inf here and inf/NaN scores
    # below; this assumes the model outputs strictly positive probabilities.
    all_logprobs = np.log(model.predict(all_locs))
    all_logprobs_mean = np.mean(all_logprobs, axis=0)

    # Built once: the vocabulary does not change between cities.
    vocab_array = np.asarray(vocab)
    city_dialectwords = {}
    # Row i of the batch prediction belongs to cities[i], so the reported
    # cities reuse it instead of querying the model once more per city.
    for city, city_logprobs in zip(cities[:num_cities], all_logprobs):
        scores = city_logprobs - all_logprobs_mean
        # argsort is ascending; reverse it to get the highest scores first.
        top_indices = np.argsort(scores)[::-1][:k]
        city_dialectwords[city['city']] = [
            'NE_' + word if word in named_entities else word
            for word in vocab_array[top_indices].tolist()
        ]

    with codecs.open(filename, 'w', encoding='utf-8') as fout:
        json.dump(city_dialectwords, fout, indent=4, sort_keys=True)
评论列表
文章目录