def vectorize_and_dot_dataset(dataset, include_extra_features=True):
    """Attach a feature vector and a binary label to each row of *dataset*, in place.

    For every row (a dict expected to carry 'headline', 'body' and 'stance'
    keys — schema assumed from usage, confirm against the caller) this:
      1. tokenizes the headline and the body,
      2. sums the word vectors of each token list and L2-normalizes the sums,
      3. stores the concatenated (headline ‖ body) vector in row['row_vector'],
      4. stores row['isRelated'] = 0 iff stance == 'unrelated', else 1,
      5. optionally appends extra headline/body relationship features:
         dot product of the two unit vectors (cosine similarity), Jaccard
         distance of the token sets, and the compound sentiment score of the
         headline and of the re-joined body.

    Parameters
    ----------
    dataset : iterable of dict
        Rows are mutated in place; the function returns None.
    include_extra_features : bool, optional
        Default True, matching the original (always-on) behavior; pass False
        to skip the appended relationship features.
    """
    for row in dataset:
        # Tokenize; the body uses the special splitter with the module-level
        # split_code (defined elsewhere in this file).
        split_headline = hf.split_words(row['headline'])
        split_body = hf.split_words_special(row['body'], split_code)
        headline_vectors = vectorize_wordlist(split_headline)
        body_vectors = vectorize_wordlist(split_body)
        # Sum the word vectors of each text, then L2-normalize each sum.
        summed_headline_vector = numpy.sum(headline_vectors, axis=0)
        summed_body_vector = numpy.sum(body_vectors, axis=0)
        normalized_headline_vector = normalize(summed_headline_vector.reshape(1, -1))
        normalized_body_vector = normalize(summed_body_vector.reshape(1, -1))
        # Concatenated (headline ‖ body) feature vector; normalize() returns
        # a (1, d) array, so take row 0 of each.
        row['row_vector'] = numpy.concatenate(
            (normalized_headline_vector[0], normalized_body_vector[0]), axis=0)
        row['isRelated'] = 0 if row['stance'] == 'unrelated' else 1
        if include_extra_features:
            extra_nodes = [
                # Dot product of two unit vectors == cosine similarity.
                numpy.vdot(normalized_headline_vector, normalized_body_vector),
                # Jaccard distance between the headline/body token sets.
                jaccard_distance(set(split_headline), set(split_body)),
                # Compound sentiment of the headline and of the re-joined body.
                sentiment_analyzer.polarity_scores(row['headline'])['compound'],
                sentiment_analyzer.polarity_scores(" ".join(split_body))['compound'],
            ]
            row['row_vector'] = numpy.append(row['row_vector'], extra_nodes)
# (removed stray scraped-page footer text: "评论列表" / "文章目录")