def fsel(data=()):  # (feature selection, using chi2)
    """Return a {feature: p-value} dict for the given (vector, label)-tuples.

    For every feature a 2 x n_labels contingency table is built (row 0:
    vectors of each label WITHOUT the feature, row 1: vectors of each label
    WITH the feature) and passed to scipy.stats.chi2_contingency.

    Args:
        data: iterable of (vector, label) tuples, where iterating a vector
            yields its hashable features (e.g. a dict, set or list).
            It is traversed exactly once, so generators and other one-shot
            iterators are accepted.

    Returns:
        A dict mapping each feature seen in data to the p-value reported by
        chi2_contingency; an empty dict when data is empty.
    """
    from collections import Counter
    from scipy.stats import chi2_contingency

    label_counts = Counter()    # {label: number of vectors}
    feature_counts = Counter()  # {feature: number of vectors containing it}
    joint_counts = Counter()    # {(feature, label): number of vectors}

    # Single pass: counting labels and features together means data is never
    # re-iterated (a second pass over a generator would see nothing).
    for vector, label in data:
        label_counts[label] += 1
        # dict.fromkeys() drops repeated features but keeps first-seen order.
        # Counting a feature once per vector keeps joint_counts <= label_counts,
        # so the "without feature" cells below can never become negative.
        for feature in dict.fromkeys(vector):
            feature_counts[feature] += 1
            joint_counts[feature, label] += 1

    labels = list(label_counts)
    p = {}
    for feature in feature_counts:
        present = [joint_counts[feature, label] for label in labels]
        absent = [label_counts[label] - joint_counts[feature, label]
                  for label in labels]
        # Empty cells are replaced by 0.1 (as in the original code), presumably
        # so that no row/column sums to zero, which chi2_contingency rejects.
        table = [[n or 0.1 for n in absent],
                 [n or 0.1 for n in present]]
        p[feature] = chi2_contingency(table)[1]  # index 1 is the p-value
    return p
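# 评论列表 (comment list) -- web-page navigation text left over from the scrape, not code
# 文章目录 (table of contents) -- web-page navigation text left over from the scrape, not code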