def feature():
    """Build a TF-IDF weighted document/term occurrence matrix.

    Loads the '|'-delimited file referenced by the module-level
    ``finaltrial``, sends each row's text to a local POS-tagging
    service (http://localhost:5095/parser), and keeps adjectives (JJ)
    and verbs (VB*) as candidate terms.  Stop words from ``stop.csv``
    are removed, occurrence counts are TF-IDF transformed with
    scikit-learn's ``TfidfTransformer``, and the result is saved to
    ``occurence.csv``.

    Side effects:
        * lower-cases the text column of the loaded matrix in place,
        * rebinds the module-level ``termcount`` dict, normalising each
          count to a percentage of the total (``tfidf`` presumably
          populates it — TODO confirm against that helper),
        * writes ``occurence.csv`` to the working directory.

    Returns:
        tuple: (occurrence array of shape (n_rows, n_terms),
                the loaded data matrix, list of retained terms)
    """
    global termcount
    dataMatrix = np.genfromtxt(finaltrial, delimiter='|', dtype=None, skip_header=True)
    terms = []
    n = dataMatrix.size
    for row in dataMatrix:
        # bytes.lower() then decode; numpy re-encodes the str on assignment,
        # so row[0] is still bytes afterwards (hence the decode below).
        row[0] = row[0].lower().decode('UTF-8')
        temp = row[0].decode('UTF-8').replace(' ', '+')
        # The parser returns space-separated word/TAG tokens.
        temp = (get.urlopen("http://localhost:5095/parser?sentence=" + temp).read()).decode('UTF-8')
        terms.extend([x.split('/')[0] for x in temp.split(' ') if
                      x.split('/')[1] == 'JJ' or x.split('/')[1].startswith('VB')])
        tfidf(temp)
    # Normalise the accumulated term counts to percentages of the total.
    s = sum(termcount.values())
    termcount = {x: (y * 100 / s) for x, y in termcount.items()}
    terms = list(set(terms))
    # Use a set for O(1) stop-word lookups and close the file deterministically.
    with open('stop.csv', 'r') as f:
        stop = set(f.read().splitlines())
    terms = [x for x in terms if x not in stop]
    l = len(terms)
    # np.int was removed in NumPy 1.24; the builtin int is the supported dtype.
    occurence = np.zeros((n, l), dtype=int)
    for d, row in enumerate(dataMatrix):
        # Set membership makes the per-term test O(1) instead of O(words).
        words = set(row[0].decode('UTF-8').split(' '))
        for i in range(l):
            if terms[i] in words:
                occurence[d][i] += 1
    transformer = TfidfTransformer()
    occurence = transformer.fit_transform(occurence).toarray()
    np.savetxt('occurence.csv', occurence, delimiter=',')
    return occurence, dataMatrix, terms
# (removed web-page extraction artifacts "评论列表" / "文章目录" — not Python code)