import nltk
import numpy as np


def take_some_analysis(file_dir):
    context_length = []
    utterance_length = []
    dist = nltk.FreqDist()
    for c, u in utterance_generator(file_dir):
        c_tokens = nltk.word_tokenize(c)
        u_tokens = nltk.word_tokenize(u)
        # Record token counts and update the global word-frequency distribution.
        context_length.append(len(c_tokens))
        utterance_length.append(len(u_tokens))
        dist.update(c_tokens + u_tokens)
    cl_array = np.array(context_length)
    ul_array = np.array(utterance_length)
    print("max length of context is %d" % cl_array.max())
    print("max length of utterance is %d" % ul_array.max())
    print("mean length of context is %f" % cl_array.mean())
    print("mean length of utterance is %f" % ul_array.mean())
    # Element-wise absolute difference between context and utterance lengths.
    sub_abs = np.abs(cl_array - ul_array)
    print("max,min,mean of abs(context_length - utterance_length) is %f,%f,%f" % (
        np.max(sub_abs), np.min(sub_abs), np.mean(sub_abs)))
    print("most common words:")
    print(dist.most_common(10))
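
The function depends on a utterance_generator helper that is defined elsewhere. For completeness, here is a minimal sketch of what it might look like, assuming each file in file_dir is plain text with one tab-separated "context<TAB>utterance" pair per line; the real data format may differ, so adapt the parsing accordingly.

import os

def utterance_generator(file_dir):
    # Minimal sketch (assumed format): yield (context, utterance)
    # string pairs from tab-separated lines in every file under file_dir.
    for name in os.listdir(file_dir):
        path = os.path.join(file_dir, name)
        with open(path, encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip("\n").split("\t")
                if len(parts) == 2:
                    yield parts[0], parts[1]

With the helper in place, the analysis runs as take_some_analysis("data/"). Note that nltk.word_tokenize requires the punkt tokenizer models, so run nltk.download("punkt") once before calling it.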