def load_data(self, data_dir=None):
    """Prepare the WMT data and load it into bucketed dev/train sets.

    Calls ``data_utils.prepare_wmt_data`` with the data directory and the
    ``self.en_vocab_size`` / ``self.fr_vocab_size`` settings, then reads the
    resulting development and training files through ``self.read_data``.

    Sets the following attributes on ``self``:
      * ``data_dir`` -- the directory that was used.
      * ``dev_set`` -- result of ``read_data`` on the development files.
      * ``train_set`` -- result of ``read_data`` on the training files,
        called with ``self.max_train_data_size`` as its third argument.
      * ``train_buckets_scale`` -- cumulative fraction of training examples
        per bucket (one float per bucket, non-decreasing, last one is 1.0).

    Args:
      data_dir: optional directory holding the WMT data. When omitted (or
        None) the historical default "/data/WMT15/" is used, so existing
        callers keep their behavior.

    Raises:
      ValueError: if no training example ended up in any bucket, since the
        bucket scale cannot be computed for an empty training set.
    """
    if data_dir is None:
        data_dir = "/data/WMT15/"
    self.data_dir = data_dir
    print("Preparing WMT data in %s" % self.data_dir)
    en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
        self.data_dir, self.en_vocab_size, self.fr_vocab_size)

    # Read data into buckets and compute their sizes.
    print("Reading development and training data (limit: %d)."
          % self.max_train_data_size)
    self.dev_set = self.read_data(en_dev, fr_dev)
    self.train_set = self.read_data(en_train, fr_train,
                                    self.max_train_data_size)
    # train_set is indexed by bucket position, one entry per self._buckets.
    train_bucket_sizes = [len(self.train_set[b])
                          for b in range(len(self._buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    if not train_total_size:
        # Fail with an explicit message instead of a bare ZeroDivisionError
        # from the scale computation below.
        raise ValueError(
            "No training data was read into any bucket from %s; cannot "
            "compute the bucket scale." % self.data_dir)

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
    # use to select a bucket. Length of [scale[i], scale[i+1]] is proportional
    # to the size of the i-th training bucket, as used later.
    # A running total yields the same values as re-summing each prefix.
    cumulative_size = 0
    buckets_scale = []
    for bucket_size in train_bucket_sizes:
        cumulative_size += bucket_size
        buckets_scale.append(cumulative_size / train_total_size)
    self.train_buckets_scale = buckets_scale