def vectorize_and_dot_dataset(dataset, include_extra_features=True):
    """Attach a feature vector and a binary label to each row of *dataset*, in place.

    For every row (a dict expected to carry 'headline', 'body' and 'stance'
    keys — schema assumed from usage, confirm against the caller) this:
      1. tokenizes the headline and the body,
      2. sums the word vectors of each token list and L2-normalizes the sums,
      3. stores the concatenated (headline ‖ body) vector in row['row_vector'],
      4. stores row['isRelated'] = 0 iff stance == 'unrelated', else 1,
      5. optionally appends extra headline/body relationship features:
         dot product of the two unit vectors (cosine similarity), Jaccard
         distance of the token sets, and the compound sentiment score of the
         headline and of the re-joined body.

    Parameters
    ----------
    dataset : iterable of dict
        Rows are mutated in place; the function returns None.
    include_extra_features : bool, optional
        Default True, matching the original (always-on) behavior; pass False
        to skip the appended relationship features.
    """
    for row in dataset:
        # Tokenize; the body uses the special splitter with the module-level
        # split_code (defined elsewhere in this file).
        split_headline = hf.split_words(row['headline'])
        split_body = hf.split_words_special(row['body'], split_code)
        headline_vectors = vectorize_wordlist(split_headline)
        body_vectors = vectorize_wordlist(split_body)
        # Sum the word vectors of each text, then L2-normalize each sum.
        summed_headline_vector = numpy.sum(headline_vectors, axis=0)
        summed_body_vector = numpy.sum(body_vectors, axis=0)
        normalized_headline_vector = normalize(summed_headline_vector.reshape(1, -1))
        normalized_body_vector = normalize(summed_body_vector.reshape(1, -1))
        # Concatenated (headline ‖ body) feature vector; normalize() returns
        # a (1, d) array, so take row 0 of each.
        row['row_vector'] = numpy.concatenate(
            (normalized_headline_vector[0], normalized_body_vector[0]), axis=0)
        row['isRelated'] = 0 if row['stance'] == 'unrelated' else 1
        if include_extra_features:
            extra_nodes = [
                # Dot product of two unit vectors == cosine similarity.
                numpy.vdot(normalized_headline_vector, normalized_body_vector),
                # Jaccard distance between the headline/body token sets.
                jaccard_distance(set(split_headline), set(split_body)),
                # Compound sentiment of the headline and of the re-joined body.
                sentiment_analyzer.polarity_scores(row['headline'])['compound'],
                sentiment_analyzer.polarity_scores(" ".join(split_body))['compound'],
            ]
            row['row_vector'] = numpy.append(row['row_vector'], extra_nodes)
# (removed stray scraped-page footer text: "评论列表" / "文章目录")