item2vec.py file source code

python

Project: board-yet    Author: cmcneil
def build_training_graph(self, batch, labels):
        """Takes in the graph nodes representing a training batch and
        associated labels, and builds the forward training graph, 
        including the embedding itself. Returns nodes representing
        the logits for the positive examples, as well as the logits
        for associated negatives for negative sampling."""
        # We do this because word2vec initializes the weights this way
        init_width = 0.5 / self.embed_dim
        # The actual embedding:
        # The shape of the tensor is weird because we are going to use the
        # "embedding_lookup" function instead of just multiplying with a 1-hot.
        emb = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_dim],
                                            -init_width, init_width),
                          name="embedding")
        self.emb = emb

        # For training, we actually need to train a softmax classifier.
        # This tensor can be thought of as a complete softmax unit for
        # every possible game. (Each row is a set of weights)
        softmax_w = tf.Variable(tf.zeros([self.vocab_size, self.embed_dim]),
                                name="softmax_weights")
        softmax_b = tf.Variable(tf.zeros([self.vocab_size]),
                                name="softmax_bias")

        # Negative sampling for SGNS. We make the assumption of sparsity.
        # On average, randomly sampled games will be negatives.
        # The sampler expects int64 labels shaped [batch_size, 1].
        labels_reformat = tf.reshape(tf.cast(labels, dtype=tf.int64),
                                     [-1, 1])
        # The sampler also returns expected counts, which we don't need here.
        sampled_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(
                true_classes=labels_reformat,
                num_true=1,
                num_sampled=self.num_negatives,
                unique=True,
                range_max=self.vocab_size,
                distortion=0.75,
                unigrams=self.total_item_counts)

        batch_embeds = tf.nn.embedding_lookup(emb, batch)
        # Lookup the softmax classifiers for the training batch.
        # I don't particularly like the use of "embedding_lookup",
        # because softmax_w etc. aren't technically embedding
        # matrices. This is apparently the canonical way to do it in TF though.
        batch_sm_w = tf.nn.embedding_lookup(softmax_w, labels)
        batch_sm_b = tf.nn.embedding_lookup(softmax_b, labels)

        # Lookup the softmax classifiers for the negative samples.
        neg_sm_w = tf.nn.embedding_lookup(softmax_w, sampled_ids)
        neg_sm_b = tf.nn.embedding_lookup(softmax_b, sampled_ids)

        # Produces a tensor that represents the logits (the arg of the
        # exponential numerator in softmax) for each of the examples
        # in the training batch.
        batch_logits = (tf.reduce_sum(tf.multiply(batch_embeds, batch_sm_w), 1)
                        + batch_sm_b)
        # Negative logits: one row per batch example, one column per sample.
        neg_logits = (tf.matmul(batch_embeds, neg_sm_w, transpose_b=True) +
                      tf.reshape(neg_sm_b, [self.num_negatives]))

        return batch_logits, neg_logits
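
The two logit tensors returned above are the building blocks of the skip-gram negative-sampling (SGNS) loss. Below is a minimal sketch of how they might be combined, following the NCE-style sigmoid cross-entropy used in TensorFlow's word2vec example; the method name build_loss_graph is hypothetical and not part of the original file, and it assumes the file's existing `import tensorflow as tf`.

    # Hypothetical companion method (a sketch, not from the original file):
    # positives should score 1, sampled negatives should score 0.
    def build_loss_graph(self, batch_logits, neg_logits):
        true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(batch_logits), logits=batch_logits)
        neg_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.zeros_like(neg_logits), logits=neg_logits)
        # Sum the per-example losses and average over the batch.
        batch_size = tf.cast(tf.shape(batch_logits)[0], tf.float32)
        return (tf.reduce_sum(true_xent) + tf.reduce_sum(neg_xent)) / batch_size

The resulting scalar can then be handed to any optimizer (e.g. tf.train.GradientDescentOptimizer) to train both the embedding and the softmax weights.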