def __init__(self, source,
             source_dicts,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Iterator over a single source corpus with one dictionary per factor.

    Sentences are buffered in maxibatches of batch_size * maxibatch_size
    lines so they can optionally be sorted by length before batching.
    """
    # Open the corpus, going through a shuffled temporary copy if requested.
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main([self.source_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')

    self.source_dicts = [load_dict(path) for path in source_dicts]

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source

    # Restrict every factor vocabulary to indices below n_words_source.
    if self.n_words_source > 0:
        for vocab in self.source_dicts:
            for word in [w for w, i in vocab.items() if i >= self.n_words_source]:
                del vocab[word]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
# Example source code of load_dict() usage (translated listing header).
def __init__(self, source,
             source_dicts,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Iterator over a single source corpus with one dictionary per factor.

    Sentences are buffered k = batch_size * maxibatch_size at a time so
    they can optionally be sorted by length before batching.
    """
    if shuffle_each_epoch:
        self.source_orig = source
        # shuffle.main is given one input file, so unwrap the single
        # returned handle. NOTE(review): sibling copies of this iterator
        # assign the result directly without indexing -- confirm which
        # convention the local shuffle module actually follows.
        # (Removed leftover debug print of type(self.source).)
        self.source = shuffle.main([self.source_orig], temporary=True)[0]
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source
    # Restrict every factor vocabulary to indices below n_words_source;
    # snapshot items() so deletion during iteration is safe on Python 3 too.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, source_dict,
             batch_size=128, maxlen=None,
             n_words_source=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=False,
             maxibatch_size=20,
             ):
    """Iterator over a single source corpus with a single vocabulary.

    maxlen=None disables length filtering; k = batch_size * maxibatch_size
    sentences are buffered per maxibatch.
    """
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main([self.source_orig], temporary=True)
    else:
        self.source = data_utils.fopen(source, 'r')
    self.source_dict = load_dict(source_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source
    # Restrict the vocabulary to indices below n_words_source; snapshot
    # items() so deletion during iteration is safe on Python 3 as well.
    if self.n_words_source > 0:
        for key, idx in list(self.source_dict.items()):
            if idx >= self.n_words_source:
                del self.source_dict[key]
    # Fixed: these two flags were previously assigned twice in a row.
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def load_inverse_dict(dict_path):
    """Load the vocabulary at dict_path and return its inverse mapping.

    Returns a dict mapping index -> word, the inverse of the word -> index
    mapping produced by load_dict(). If several words share an index, the
    last one seen during iteration wins (same as the original loop).
    """
    orig_dict = load_dict(dict_path)
    # items() instead of iteritems() so this also runs under Python 3.
    return {idx: words for words, idx in orig_dict.items()}
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Rescore an n-best list with an ensemble of translation models.

    For each model, loads its saved options (JSON if present, else pickle),
    back-fills dropout options missing from older models, builds the
    inverse vocabularies, and delegates to rescore_model().
    """
    # load model model_options
    options = []
    for model in models:  # BUG FIX: previously iterated the global args.models
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except (IOError, ValueError):
            # No JSON (or unparsable) -> fall back to the pickled options.
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        # hacks for using old models with missing options
        for opt in ('dropout_embedding', 'dropout_hidden',
                    'dropout_source', 'dropout_target'):
            options[-1].setdefault(opt, 0)

    dictionary, dictionary_target = options[0]['dictionaries']

    # Load source dictionary and invert (items() keeps this Python 3 safe).
    word_dict = load_dict(dictionary)
    word_idict = {vv: kk for kk, vv in word_dict.items()}
    word_idict[0] = '<eos>'  # reserved: end-of-sentence
    word_idict[1] = 'UNK'    # reserved: unknown word

    # Load target dictionary and invert.
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def __init__(self, source, target,
             source_dict, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True):
    """Iterator over a parallel corpus with one dictionary per side."""
    if shuffle_each_epoch:
        # Shuffle both files in lockstep, then read the '.shuf' copies.
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * 20  # fixed maxibatch of 20 batches
    self.end_of_data = False
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Rescore an n-best list with an ensemble of translation models.

    For each model, loads its saved options (JSON if present, else pickle),
    back-fills dropout options missing from older models, builds the
    inverse vocabularies, and delegates to rescore_model().
    """
    # load model model_options
    options = []
    for model in models:  # BUG FIX: previously iterated the global args.models
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except (IOError, ValueError):
            # No JSON (or unparsable) -> fall back to the pickled options.
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        # hacks for using old models with missing options
        for opt in ('dropout_embedding', 'dropout_hidden',
                    'dropout_source', 'dropout_target'):
            options[-1].setdefault(opt, 0)

    dictionary, dictionary_target = options[0]['dictionaries']

    # Load source dictionary and invert (items() keeps this Python 3 safe).
    word_dict = load_dict(dictionary)
    word_idict = {vv: kk for kk, vv in word_dict.items()}
    word_idict[0] = '<eos>'  # reserved: end-of-sentence
    word_idict[1] = 'UNK'    # reserved: unknown word

    # Load target dictionary and invert.
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def _build_dictionaries(self):
    """
    Builds and inverts source and target dictionaries, taken
    from the first model since all of them must have the same
    vocabulary.
    """
    # Convention: the last dictionary path is the target vocabulary,
    # everything before it is one source dictionary per input factor.
    dictionaries = self._options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]
    # load and invert source dictionaries
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        # Truncate the vocabulary to n_words_src entries, if configured.
        # NOTE(review): items()/iteritems() below are Python 2 idioms;
        # deleting while iterating items() is only safe on Python 2,
        # where items() returns a list.
        if self._options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= self._options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        # Reserved indices: 0 = end-of-sentence, 1 = unknown word.
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)
    self._word_dicts = word_dicts
    self._word_idicts = word_idicts
    # load and invert target dictionary
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'
    self._word_idict_trg = word_idict_trg
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             indomain_source='', indomain_target='',
             interpolation_rate=0.1,
             maxibatch_size=20):
    """Parallel-corpus iterator that mixes an in-domain corpus into each
    maxibatch at a fixed interpolation rate."""
    if shuffle_each_epoch:
        # Shuffle each corpus pair in lockstep, then read the .shuf copies.
        shuffle.main([source, target])
        shuffle.main([indomain_source, indomain_target])
        self.source = fopen(source+'.shuf', 'r')
        self.target = fopen(target+'.shuf', 'r')
        self.indomain_source = fopen(indomain_source+'.shuf', 'r')
        self.indomain_target = fopen(indomain_target+'.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
        self.indomain_source = fopen(indomain_source, 'r')
        self.indomain_target = fopen(indomain_target, 'r')
    # One dictionary per source factor.
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
    self.interpolation_rate = interpolation_rate
    # Split each maxibatch between in-domain and out-of-domain sentences.
    self.indomain_k = int(math.ceil(self.interpolation_rate * self.k))
    self.outdomain_k = self.k - self.indomain_k
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator (multi-factor source) that shuffles into
    temporary files rather than persistent '.shuf' copies."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        # shuffle.main returns one handle per input file here.
        self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    # One dictionary per source factor.
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, target,
             source_dict, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator with a single dictionary per side, using
    data_utils.fopen and temporary shuffled files."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        # shuffle.main returns one handle per input file here.
        self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = data_utils.fopen(source, 'r')
        self.target = data_utils.fopen(target, 'r')
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for key, idx in self.source_dict.items():
            if idx >= self.n_words_source:
                del self.source_dict[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Rescore an n-best list with an ensemble of multi-factor models.

    For each model, loads its saved options (JSON if present, else pickle),
    back-fills dropout options missing from older models, builds one
    inverse vocabulary per source factor plus the target vocabulary, and
    delegates to rescore_model().
    """
    # load model model_options
    options = []
    for model in models:  # BUG FIX: previously iterated the global args.models
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except (IOError, ValueError):
            # No JSON (or unparsable) -> fall back to the pickled options.
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        # hacks for using old models with missing options
        for opt in ('dropout_embedding', 'dropout_hidden',
                    'dropout_source', 'dropout_target'):
            options[-1].setdefault(opt, 0)

    # Last dictionary path is the target vocabulary; the rest are one
    # source dictionary per input factor.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # Load and invert each source dictionary; snapshot items() so deletion
    # during iteration is safe on Python 3 as well.
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in list(word_dict.items()):
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = {vv: kk for kk, vv in word_dict.items()}
        word_idict[0] = '<eos>'  # reserved: end-of-sentence
        word_idict[1] = 'UNK'    # reserved: unknown word
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # Load target dictionary and invert.
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator.

    `source` and `target` are file paths; the source side may use several
    dictionaries (one per factor).
    """
    if shuffle_each_epoch:
        # Shuffle both files in lockstep, then read the '.shuf' copies.
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

    self.source_dicts = [load_dict(path) for path in source_dicts]
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Drop vocabulary entries whose index falls outside the size limits.
    if self.n_words_source > 0:
        for vocab in self.source_dicts:
            for word in [w for w, i in vocab.items() if i >= self.n_words_source]:
                del vocab[word]
    if self.n_words_target > 0:
        for word in [w for w, i in self.target_dict.items() if i >= self.n_words_target]:
            del self.target_dict[word]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator with one dictionary per source factor."""
    if shuffle_each_epoch:
        # Shuffle both files in lockstep, then read the '.shuf' copies.
        shuffle.main([source, target])
        self.source = fopen(source+'.shuf', 'r')
        self.target = fopen(target+'.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Rescore an n-best list with an ensemble of multi-factor models.

    For each model, loads its saved options (JSON if present, else pickle),
    back-fills dropout options missing from older models, builds one
    inverse vocabulary per source factor plus the target vocabulary, and
    delegates to rescore_model().
    """
    # load model model_options
    options = []
    for model in models:  # BUG FIX: previously iterated the global args.models
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except (IOError, ValueError):
            # No JSON (or unparsable) -> fall back to the pickled options.
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        # hacks for using old models with missing options
        for opt in ('dropout_embedding', 'dropout_hidden',
                    'dropout_source', 'dropout_target'):
            options[-1].setdefault(opt, 0)

    # Last dictionary path is the target vocabulary; the rest are one
    # source dictionary per input factor.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # Load and invert each source dictionary; snapshot items() so deletion
    # during iteration is safe on Python 3 as well.
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in list(word_dict.items()):
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = {vv: kk for kk, vv in word_dict.items()}
        word_idict[0] = '<eos>'  # reserved: end-of-sentence
        word_idict[1] = 'UNK'    # reserved: unknown word
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # Load target dictionary and invert.
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator."""
    # source, target: file path+name
    # allow source have many dicts
    if shuffle_each_epoch:
        # Shuffle both files in lockstep, then read the '.shuf' copies.
        shuffle.main([source, target])
        self.source = fopen(source+'.shuf', 'r')
        self.target = fopen(target+'.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Parallel-corpus iterator with one dictionary per source factor."""
    if shuffle_each_epoch:
        # Shuffle both files in lockstep, then read the '.shuf' copies.
        shuffle.main([source, target])
        self.source = fopen(source+'.shuf', 'r')
        self.target = fopen(target+'.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Rescore an n-best list with an ensemble of multi-factor models.

    For each model, loads its saved options (JSON if present, else pickle),
    back-fills dropout options missing from older models, builds one
    inverse vocabulary per source factor plus the target vocabulary, and
    delegates to rescore_model().
    """
    # load model model_options
    options = []
    for model in models:  # BUG FIX: previously iterated the global args.models
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except (IOError, ValueError):
            # No JSON (or unparsable) -> fall back to the pickled options.
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        # hacks for using old models with missing options
        for opt in ('dropout_embedding', 'dropout_hidden',
                    'dropout_source', 'dropout_target'):
            options[-1].setdefault(opt, 0)

    # Last dictionary path is the target vocabulary; the rest are one
    # source dictionary per input factor.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # Load and invert each source dictionary; snapshot items() so deletion
    # during iteration is safe on Python 3 as well.
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in list(word_dict.items()):
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = {vv: kk for kk, vv in word_dict.items()}
        word_idict[0] = '<eos>'  # reserved: end-of-sentence
        word_idict[1] = 'UNK'    # reserved: unknown word
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # Load target dictionary and invert.
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             use_factor=False,
             maxibatch_size=20):
    """Parallel-corpus iterator with optional factored source input."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        # shuffle.main returns one handle per input file here.
        self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    # One dictionary per source factor.
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    # When True, each source token carries multiple '|'-separated factors
    # -- presumably handled in the reading code; confirm against caller.
    self.use_factor = use_factor
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    # Truncate vocabularies to the requested sizes. NOTE(review):
    # deleting while iterating items() is only safe on Python 2.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.target_buffer = []
    # k = sentences buffered per maxibatch (enables length sorting).
    self.k = batch_size * maxibatch_size
    self.end_of_data = False