def overlap_visualize():
    """Plot and score question/answer word overlap on the "nlpcc" test split.

    The test split returned by ``load("nlpcc", filter=True)`` is shuffled
    (unseeded, so row order differs between runs), annotated with length and
    word-overlap columns, and then visualised per ``flag`` value:

    * left panel  -- violin plot of ``word_share`` grouped by ``flag``
      (first 50000 rows);
    * right panel -- ``word_share`` distributions for ``flag == 1.0`` (green)
      and ``flag == 0.0`` (red), first 10000 rows of each.

    The value returned by ``evaluation.evaluationBypandas(test, word_share)``
    is printed before the figure is shown.

    Side effects: adds the columns ``qlen``, ``alen``, ``q_n_words``,
    ``a_n_words`` and ``word_share`` to the shuffled test frame, writes to
    stdout and opens a blocking Matplotlib window.  Returns ``None``.
    """
    # Only the test split is used; the other two splits are ignored.
    _train, test, _dev = load("nlpcc", filter=True)
    test = test.reindex(np.random.permutation(test.index))
    df = test  # alias, not a copy: columns added to ``df`` land on ``test``

    # Character lengths and space-separated token counts.
    df['qlen'] = df['question'].str.len()
    df['alen'] = df['answer'].str.len()
    df['q_n_words'] = df['question'].apply(lambda text: len(text.split(' ')))
    df['a_n_words'] = df['answer'].apply(lambda text: len(text.split(' ')))

    def normalized_word_share(row):
        """Return |Q & A| / (|Q| + |A|) over lower-cased, stripped tokens.

        Tokens come from splitting on a single space, so every text yields
        at least one token and the denominator is never zero.  Because the
        intersection cannot exceed the smaller set, the result is in
        [0, 0.5].
        """
        q_tokens = {word.lower().strip() for word in row['question'].split(" ")}
        a_tokens = {word.lower().strip() for word in row['answer'].split(" ")}
        # ``1.0 *`` keeps this a true division under Python 2 as well.
        return 1.0 * len(q_tokens & a_tokens) / (len(q_tokens) + len(a_tokens))

    df['word_share'] = df.apply(normalized_word_share, axis=1)

    plt.figure(figsize=(12, 8))

    plt.subplot(1, 2, 1)
    sns.violinplot(x='flag', y='word_share', data=df[0:50000])

    plt.subplot(1, 2, 2)
    sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color='green')
    sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color='red')

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(evaluation.evaluationBypandas(test, df['word_share']))

    # The original passed the positional string 'hold', which only acted as a
    # truthy ``block`` value; recent Matplotlib accepts ``block`` by keyword
    # only, so spell it out.
    plt.show(block=True)
评论列表
文章目录