def test_word_means(X, y, word_index):
    """Run a one-sided two-sample t-test on one word's TF-IDF values.

    The documents are split into hams (``y == 0``) and spams (``y == 1``) and
    the TF-IDF values stored in column ``word_index`` are compared between
    the two groups. The alternative hypothesis is that the mean value is
    larger in spams than in hams, so the smaller the returned p-value, the
    more over-represented the word is within spams compared to hams. A word
    that is over-represented in hams yields a p-value close to 1.

    Args:
        X: the TF-IDF matrix as a SciPy sparse matrix, where each row
            represents a document and each column represents a word,
            typically obtained by running transform_text().
        y: a binary vector where the i-th value indicates whether the i-th
            document is a spam, typically obtained by running
            transform_text().
        word_index: an int representing a column number in X.

    Returns:
        A float holding the one-sided p-value of the test (small values are
        evidence that the word is over-represented in the spams). The value
        may be NaN when SciPy cannot compute the statistic, for example when
        one of the two groups is empty.
    """
    import numpy as np

    # Flatten the labels so that lists and column vectors are accepted too.
    labels = np.asarray(y).ravel()

    # Only densify the single column under test; converting the whole
    # documents x vocabulary matrix would be needlessly expensive.
    # tocsc() makes column selection work whatever the sparse format is.
    column = X.tocsc()[:, word_index].toarray().ravel()
    ham_values = column[labels == 0]
    spam_values = column[labels == 1]

    # ttest_ind returns a two-sided p-value; t < 0 means that the ham mean
    # is lower than the spam mean.
    t, p_two_sided = ttest_ind(ham_values, spam_values)

    # Turn the two-sided p-value into the one-sided one for the alternative
    # "spam mean > ham mean" (done by hand so that old SciPy releases
    # without the `alternative` keyword keep working).
    if t < 0:
        return float(p_two_sided / 2)
    return float(1 - p_two_sided / 2)


# Despite its name this is a statistical helper, not a unit test: keep
# pytest from collecting it.
test_word_means.__test__ = False
# (Web-page navigation residue, not part of the program: "Comment list" / "Table of contents".)