import tensorflow as tf


def get_evaluation_input(inputs, params):
    with tf.device("/cpu:0"):
        # Create one dataset per input: inputs[0] holds the source
        # sentences, inputs[1:] hold the reference translations
        datasets = []

        for data in inputs:
            dataset = tf.data.Dataset.from_tensor_slices(data)
            # Split each line into a 1-D tensor of tokens
            dataset = dataset.map(lambda x: tf.string_split([x]).values,
                                  num_parallel_calls=params.num_threads)
            # Append <eos>
            dataset = dataset.map(
                lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
                num_parallel_calls=params.num_threads
            )
            datasets.append(dataset)

        dataset = tf.data.Dataset.zip(tuple(datasets))

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda *x: {
                "source": x[0],
                "source_length": tf.shape(x[0])[0],
                "references": x[1:]
            },
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.eval_batch_size,
            {
                "source": [tf.Dimension(None)],
                "source_length": [],
                "references": (tf.Dimension(None),) * (len(inputs) - 1)
            },
            {
                "source": params.pad,
                "source_length": 0,
                "references": (params.pad,) * (len(inputs) - 1)
            }
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        # Map tokens to vocabulary ids; out-of-vocabulary tokens fall
        # back to the <unk> id
        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        tgt_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["target"]),
            default_value=params.mapping["target"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])
        features["references"] = tuple(
            tgt_table.lookup(item) for item in features["references"]
        )

        return features
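
The lookup tables created with tf.contrib.lookup.index_table_from_tensor are not initialized automatically, so a consumer must run tf.tables_initializer() before the first fetch. Below is a minimal driver sketch; the EvalParams class and its toy vocabularies are hypothetical, invented only to exercise the attributes this function actually reads:

import tensorflow as tf


class EvalParams(object):
    # Hypothetical params object; only the fields read by
    # get_evaluation_input are defined.
    num_threads = 4
    eval_batch_size = 2
    eos = "<eos>"
    pad = "<pad>"
    unk = "<unk>"
    vocabulary = {"source": ["<pad>", "<eos>", "<unk>", "a", "b"],
                  "target": ["<pad>", "<eos>", "<unk>", "x", "y"]}
    mapping = {"source": {"<unk>": 2}, "target": {"<unk>": 2}}


sources = ["a b", "b a a"]        # source sentences
references = ["x y", "y x x"]     # one set of reference translations

features = get_evaluation_input([sources, references], EvalParams())

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # lookup tables need explicit init
    print(sess.run(features["source"]))  # padded batch of token ids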
def __init__(self, data_path, filenames_file, params, dataset, mode):
    self.data_path = data_path
    self.params = params
    self.dataset = dataset
    self.mode = mode
    self.left_image_batch = None
    self.right_image_batch = None

    input_queue = tf.train.string_input_producer([filenames_file], shuffle=False)
    line_reader = tf.TextLineReader()
    _, line = line_reader.read(input_queue)
    split_line = tf.string_split([line]).values

    # we load only one image for test, except if we trained a stereo model
    if mode == 'test' and not self.params.do_stereo:
        left_image_path = tf.string_join([self.data_path, split_line[0]])
        left_image_o = self.read_image(left_image_path)
    else:
        left_image_path = tf.string_join([self.data_path, split_line[0]])
        right_image_path = tf.string_join([self.data_path, split_line[1]])
        left_image_o = self.read_image(left_image_path)
        right_image_o = self.read_image(right_image_path)

    if mode == 'train':
        # randomly flip images
        do_flip = tf.random_uniform([], 0, 1)
        left_image = tf.cond(do_flip > 0.5,
                             lambda: tf.image.flip_left_right(right_image_o),
                             lambda: left_image_o)
        right_image = tf.cond(do_flip > 0.5,
                              lambda: tf.image.flip_left_right(left_image_o),
                              lambda: right_image_o)

        # randomly augment images
        do_augment = tf.random_uniform([], 0, 1)
        left_image, right_image = tf.cond(
            do_augment > 0.5,
            lambda: self.augment_image_pair(left_image, right_image),
            lambda: (left_image, right_image))

        left_image.set_shape([None, None, 3])
        right_image.set_shape([None, None, 3])

        # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size
        min_after_dequeue = 2048
        capacity = min_after_dequeue + 4 * params.batch_size
        self.left_image_batch, self.right_image_batch = tf.train.shuffle_batch(
            [left_image, right_image], params.batch_size, capacity,
            min_after_dequeue, params.num_threads)

    elif mode == 'test':
        self.left_image_batch = tf.stack(
            [left_image_o, tf.image.flip_left_right(left_image_o)], 0)
        self.left_image_batch.set_shape([2, None, None, 3])

        if self.params.do_stereo:
            self.right_image_batch = tf.stack(
                [right_image_o, tf.image.flip_left_right(right_image_o)], 0)
            self.right_image_batch.set_shape([2, None, None, 3])
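
Because this loader is built on TF 1.x input queues (tf.train.string_input_producer, tf.train.shuffle_batch), batches only flow once queue runners are started. A hypothetical consumption sketch follows; MonodepthDataloader is an assumed name for the class this __init__ belongs to, and LoaderParams is a stand-in for its params object:

import tensorflow as tf


class LoaderParams(object):
    # Hypothetical params; only fields read by the loader are defined.
    do_stereo = False
    batch_size = 8
    num_threads = 4


# Each line of filenames.txt is assumed to hold "left.jpg right.jpg",
# which split_line indexes after tf.string_split.
loader = MonodepthDataloader('/data/kitti/', 'filenames.txt',
                             LoaderParams(), 'kitti', 'train')

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    # string_input_producer and shuffle_batch enqueue from background
    # threads, which must be started before the first run() call.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    left, right = sess.run([loader.left_image_batch,
                            loader.right_image_batch])
    coord.request_stop()
    coord.join(threads)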
def make_preprocessing_fn(frequency_threshold):
    """Creates a preprocessing function for reddit.

    Args:
      frequency_threshold: The frequency threshold used when generating
        vocabularies for categorical and text features.

    Returns:
      A preprocessing function.
    """
    def preprocessing_fn(inputs):
        """User defined preprocessing function for reddit columns.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.

        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

        result['subreddit_id'] = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)

        # TODO(b/35318962): Obviate the need for this workaround on Dense
        # features. FeatureColumns expect shape (batch_size, 1), not just
        # (batch_size). All features added to result up to this point are
        # dense and require this workaround. All following features will be
        # sparse.
        result = {
            k: tft.map(lambda x: tf.expand_dims(x, -1), v)
            for k, v in result.items()
        }

        for name in ('author', 'comment_body', 'comment_parent_body'):
            words = tft.map(tf.string_split, inputs[name])
            # TODO(b/33467613) Translate these to bag-of-words style sparse
            # features.
            result[name + '_bow'] = tft.string_to_int(
                words, frequency_threshold=frequency_threshold)

        return result

    return preprocessing_fn
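
All three excerpts rely on the same core behavior: tf.string_split returns a SparseTensor, not a dense string tensor, which is why the first example takes .values after splitting a single-element list. A minimal standalone sketch (TF 1.x):

import tensorflow as tf

lines = tf.constant(["a b c", "d e"])
sparse = tf.string_split(lines)  # default delimiter is a single space

with tf.Session() as sess:
    indices, values, shape = sess.run(
        [sparse.indices, sparse.values, sparse.dense_shape])
    print(indices)  # [[0 0] [0 1] [0 2] [1 0] [1 1]]
    print(values)   # [b'a' b'b' b'c' b'd' b'e']
    print(shape)    # [2 3]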