def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])
    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out the decoded sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
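The part of decode() this page collects is the final greedy step: take the argmax of each time step's logits and cut the sequence at the first EOS_ID. A self-contained sketch of just that step, using NumPy only and a made-up EOS id of 2 in place of data_utils.EOS_ID:

import numpy as np

EOS_ID = 2  # stand-in for data_utils.EOS_ID; the real id is defined in data_utils

# Fake logits: one (batch=1, vocab=5) array per decoder time step.
output_logits = [np.array([[0.1, 0.2, 0.1, 3.0, 0.0]]),   # argmax -> 3
                 np.array([[0.0, 2.5, 0.1, 0.2, 0.0]]),   # argmax -> 1
                 np.array([[0.1, 0.2, 4.0, 0.3, 0.0]]),   # argmax -> 2 (EOS)
                 np.array([[0.9, 0.2, 0.1, 0.3, 0.0]])]   # ignored after EOS

# Greedy decoding: take the argmax at every step, then truncate at the first EOS.
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
if EOS_ID in outputs:
  outputs = outputs[:outputs.index(EOS_ID)]
print(outputs)  # [3, 1]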
Python examples of EOS_ID usage (source code)
Source file: execute.py
Project: Technical-Analysis-And-Practice-in-TensorFlow
Author: greatgeekgrace
def read_data(source_path, target_path, max_size=None):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the file with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: the n-th line contains the
      desired output for the n-th line from the source_path.
    max_size: maximum number of lines to read; all others will be ignored.
      If 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in _buckets]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(data_utils.EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(_buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set
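read_data() relies on a module-level _buckets list of (source_size, target_size) pairs: each (source, target) pair lands in the first bucket strictly larger than both sequences, and a pair that fits no bucket is silently dropped. A self-contained sketch of that rule (the bucket sizes below are illustrative, not taken from the project):

_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  # illustrative sizes only

def bucket_for(source_ids, target_ids):
  for bucket_id, (source_size, target_size) in enumerate(_buckets):
    if len(source_ids) < source_size and len(target_ids) < target_size:
      return bucket_id
  return None  # too long for every bucket: read_data() would skip this pair

print(bucket_for([4, 8, 15], [16, 23, 42, 2]))        # -> 0
print(bucket_for(list(range(12)), list(range(12))))   # -> 2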
def read_data(self, source_path, target_path, max_size=None):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the file with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: the n-th line contains the
      desired output for the n-th line from the source_path.
    max_size: maximum number of lines to read; all others will be ignored.
      If 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(self._buckets); data_set[n] contains a list
      of (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < self._buckets[n][0]
      and len(target) < self._buckets[n][1]; source and target are lists of
      token-ids.
  """
  data_set = [[] for _ in self._buckets]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(data_utils.EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(self._buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set
def read_data(source_path, target_path, max_size=10000):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the file with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: the n-th line contains the
      desired output for the n-th line from the source_path.
    max_size: maximum number of lines to read (default 10000); all others
      will be ignored. If 0 or None, data files will be read completely
      (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in _buckets]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(data_utils.EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(_buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set
def read_data(source_path, target_path, max_size=None):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the file with token-ids for the source.
    target_path: path to the file with token-ids for the target;
      it must be aligned with the source file: the n-th line contains the
      desired output for the n-th line from the source_path.
    max_size: maximum number of lines to read; all others will be ignored.
      If 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in _buckets]
  with gfile.GFile(source_path, mode="r") as source_file:
    with gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target and (not max_size or counter < max_size):
        counter += 1
        if counter % 100000 == 0:
          print("  reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [int(x) for x in target.split()]
        target_ids.append(data_utils.EOS_ID)
        for bucket_id, (source_size, target_size) in enumerate(_buckets):
          if len(source_ids) < source_size and len(target_ids) < target_size:
            data_set[bucket_id].append([source_ids, target_ids])
            break
        source, target = source_file.readline(), target_file.readline()
  return data_set
def decode():
  """Decode a file sentence-by-sentence."""
  with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS)) as sess:
    # Create model and load parameters.
    with tf.variable_scope("model"):
      model, steps_done = create_model(sess, True, attention=FLAGS.attention,
                                       model_path=FLAGS.model_path)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    sents_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.sents" % FLAGS.input_vocab_size)
    parse_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.parse" % FLAGS.output_vocab_size)
    sents_vocab, _ = data_utils.initialize_vocabulary(sents_vocab_path)
    _, rev_parse_vocab = data_utils.initialize_vocabulary(parse_vocab_path)

    start_time = time.time()
    # Decode the input file line by line and write the results to the output file.
    with open(FLAGS.decode_input_path, 'r') as fin, open(FLAGS.decode_output_path, 'w') as fout:
      for line in fin:
        sentence = line.strip()
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), sents_vocab)
        try:
          bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
        except ValueError:  # min() of an empty list: sentence is longer than the largest bucket
          print("Input sentence does not fit in any buckets. Skipping... ")
          print("\t", line)
          continue
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # Greedy decoder: take the argmax at each step and cut at the first EOS.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        decoded_sentence = " ".join([tf.compat.as_str(rev_parse_vocab[output]) for output in outputs]) + '\n'
        fout.write(decoded_sentence)

    time_elapsed = time.time() - start_time
    print("Decoding time: ", time_elapsed)
def plot_attn_mat(model_dev, all_examples):
  # Note: bucket_id, sess, rev_parse_vocab and rev_sent_vocab are not defined in
  # this function; they must come from the enclosing or global scope.
  model_dev.batch_size = len(all_examples)
  token_ids = [x[0] for x in all_examples]
  gold_ids = [x[1] for x in all_examples]
  dec_ids = [[]] * len(token_ids)
  encoder_inputs, decoder_inputs, target_weights = model_dev.get_decode_batch(
      {bucket_id: zip(token_ids, dec_ids)}, bucket_id)
  _, _, output_logits, attns = model_dev.step(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id, True)
  # _, _, output_logits, attns = model_dev.step_with_attn(sess, encoder_inputs, decoder_inputs,
  #                                                       target_weights, bucket_id, True)
  outputs = [np.argmax(logit, axis=1) for logit in output_logits]
  to_decode = np.array(outputs).T
  sent_id = 0
  parse = list(to_decode[sent_id, :])
  parse_all = parse[:]
  if data_utils.EOS_ID in parse:
    parse = parse[:parse.index(data_utils.EOS_ID)]
  decoded_parse = []
  decoded_parse_all = []
  for output in parse:
    if output < len(rev_parse_vocab):
      decoded_parse.append(tf.compat.as_str(rev_parse_vocab[output]))
    else:
      decoded_parse.append("_UNK")
  for output in parse_all:
    if output < len(rev_parse_vocab):
      decoded_parse_all.append(tf.compat.as_str(rev_parse_vocab[output]))
    else:
      decoded_parse_all.append("_UNK")
  gold_parse = [tf.compat.as_str(rev_parse_vocab[output]) for output in gold_ids[sent_id]]
  sent_text = [tf.compat.as_str(rev_sent_vocab[output]) for output in token_ids[sent_id]]
  mat = attns[:, 0, :].T  # attention weights for the first sentence in the batch
  return encoder_inputs, sent_text, gold_parse, decoded_parse, mat
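plot_attn_mat() returns the attention matrix but leaves the plotting to the caller. A minimal matplotlib sketch, assuming mat is laid out as attns[:, 0, :].T (rows roughly following encoder positions, columns decoder steps); the function name and styling below are my own, not from the project:

import matplotlib.pyplot as plt

def show_attention(mat, sent_text, decoded_parse):
  # Heat map of attention weights: rows ~ encoder inputs, columns ~ decoder outputs.
  fig, ax = plt.subplots()
  ax.imshow(mat, aspect='auto', cmap='viridis')
  ax.set_xticks(range(len(decoded_parse)))
  ax.set_xticklabels(decoded_parse, rotation=90)
  ax.set_yticks(range(len(sent_text)))
  ax.set_yticklabels(sent_text)
  ax.set_xlabel('decoder output')
  ax.set_ylabel('encoder input')
  fig.tight_layout()
  plt.show()

# Possible usage:
# _, sent_text, _, decoded_parse, mat = plot_attn_mat(model_dev, all_examples)
# show_attention(mat, sent_text, decoded_parse)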
Source file: tf_flask_api.py
Project: Seq2Seq-Tensorflow-1.0-Chatbot
Author: igorvishnevskiy
def post(self):
  data_received = request.json
  if not data_received:
    # Fall back to a form-encoded payload (note: eval() on request data is unsafe).
    data_received = eval(request.form["payload"])
  sentence = data_received["text"]
  print(sentence)
  token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
  # Which bucket does it belong to?
  bucket_id = len(_buckets) - 1
  for i, bucket in enumerate(_buckets):
    if bucket[0] >= len(token_ids):
      bucket_id = i
      break
  else:
    logging.warning("Sentence truncated: %s", sentence)
  # Get a 1-element batch to feed the sentence to the model.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      {bucket_id: [(token_ids, [])]}, bucket_id)
  # Get output logits for the sentence.
  _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, True)
  # This is a greedy decoder - outputs are just argmaxes of output_logits.
  outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  # If there is an EOS symbol in outputs, cut them at that point.
  if data_utils.EOS_ID in outputs:
    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  # Join the output tokens into the chatbot's reply.
  response = " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])
  print(response)
  return jsonify({"text": response})
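The handler reads a JSON (or form-encoded) payload with a "text" field and answers with a JSON "text" field. A hypothetical client call; the host, port, and route are assumptions, since the Flask routing is not part of this excerpt:

import requests

# Route and port are placeholders; the real ones are defined elsewhere in tf_flask_api.py.
resp = requests.post("http://localhost:5000/chat", json={"text": "hello there"})
print(resp.json()["text"])  # the chatbot's decoded reply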