def __init__(self, batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)

    self.n_ans_vocabulary = len(self.adict)
    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    # Test splits ship without annotations, so only load answers for train/val.
    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    print('parsed', len(qdic), 'questions for', data_split)
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file. Returns the question dictionary and the
    answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    print('parsed', len(qdic), 'questions for genome')
    return qdic, adic
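Both loaders key their dictionaries as split + QID_KEY_SEPARATOR + question_id. A minimal lookup sketch, assuming QID_KEY_SEPARATOR = '/' and a populated config.DATA_PATHS (both come from the surrounding module and are not shown here); the question id is hypothetical:

qdic, adic = load_vqa_json('train')
qid = 'train' + QID_KEY_SEPARATOR + '458752000'  # hypothetical question id
question = qdic[qid]['qstr']   # the question string
image_id = qdic[qid]['iid']    # id of the image the question refers to
answers = adic[qid]            # list of annotated answers for that question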
def __init__(self, vdict_path, adict_path,
             batchsize=128, max_length=15, n_ans_vocabulary=1000, mode='train',
             data_shape=(2048)):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.n_ans_vocabulary = n_ans_vocabulary
    self.mode = mode
    self.data_shape = data_shape

    assert self.mode == 'test'

    # load vocabulary
    with open(vdict_path, 'r') as f:
        vdict = json.load(f)
    with open(adict_path, 'r') as f:
        adict = json.load(f)
    self.n_vocabulary, self.vdict = len(vdict), vdict
    self.n_ans_vocabulary, self.adict = len(adict), adict

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
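A short usage sketch for the cache above, assuming LOADED_SPACY_MODELS is a module-level dict and SpacyModelType aliases spacy's Language class (both are referenced but not defined in the snippet):

from typing import Dict, Tuple
import spacy
from spacy.language import Language

SpacyModelType = Language
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}

nlp_a = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b  # the second call hits the cache instead of reloading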
def load(self, filename):
    """Load pre-existing dictionary in 'token[<TAB>count]' format.
    Initialize counts from other dictionary, or 0 if they aren't included.
    """
    print('Dictionary: loading dictionary from {}'.format(filename))
    with open(filename) as read:
        for line in read:
            split = line.strip().split('\t')
            token = unescape(split[0])
            cnt = int(split[1]) if len(split) > 1 else 0
            self.freq[token] = cnt
            if token not in self.tok2ind:
                index = len(self.tok2ind)
                self.tok2ind[token] = index
                self.ind2tok[index] = token
    print('[ num words = %d ]' % len(self))
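The expected input is one token per line with an optional tab-separated count. A minimal sketch with a hypothetical file name, assuming a Dictionary class whose constructor initializes the freq, tok2ind, and ind2tok attributes used above:

with open('dict.tsv', 'w') as f:  # hypothetical dictionary file
    f.write('the\t10243\n')
    f.write('cat\t57\n')
    f.write('purrs\n')            # no count -> stored as 0

d = Dictionary()
d.load('dict.tsv')                # tokens are indexed in file order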
def load_spacy_model(disable=False):
    """
    Returns loaded spacy pipeline
    Args:
        disable: a list of pipeline components to disable from the loaded
            spacy model. Can significantly increase speed.
    Returns:
        spacy pipeline
    """
    # If disable is truthy, load the spacy model with a modified pipeline;
    # otherwise, load the default pipeline.
    if disable:
        try:
            nlp = spacy.load('en_core_web_sm', disable=disable)
        except Exception:
            print('[ERROR] You likely passed an invalid disable argument to '
                  'get_spacy_doc!')
            raise  # re-raise so we never fall through to an unbound `nlp`
    else:
        nlp = spacy.load('en_core_web_sm')
    return nlp
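For example, disabling the parser and NER components speeds up processing when only tokenization and POS tags are needed; a sketch assuming en_core_web_sm is installed:

nlp = load_spacy_model(disable=['parser', 'ner'])
doc = nlp('The quick brown fox jumps over the lazy dog.')
print([(tok.text, tok.tag_) for tok in doc])  # the tagger still runs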
def train(filePath):
    try:
        if not filePath.lower().endswith('json'):
            return {'success': False, 'message': 'Training file should be in json format'}
        with open(filePath) as file:
            ent_data = json.load(file)
        dataset = [jsonToCrf(q, nlp) for q in ent_data['entity_examples']]
        X_train = [sent2features(s) for s in dataset]
        y_train = [sent2labels(s) for s in dataset]
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)
        if not os.path.exists("crfModel"):
            os.mkdir("crfModel")
        if os.path.isfile("crfModel/classifier.pkl"):
            os.remove("crfModel/classifier.pkl")
        joblib.dump(crf, "crfModel/classifier.pkl")
        return {'success': True, 'message': 'Model Trained Successfully'}
    except Exception as ex:
        return {'success': False, 'message': 'Error while Training the model - ' + str(ex)}
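A hypothetical call, assuming a file whose top-level 'entity_examples' list matches whatever jsonToCrf expects (the example schema is not shown in the snippet):

result = train('training_data.json')  # hypothetical path
if result['success']:
    print(result['message'])          # 'Model Trained Successfully'
else:
    print('Training failed:', result['message'])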
def predict(utterance):
    try:
        tagged = []
        finallist = []
        parsed = nlp(utterance)
        for i in range(len(parsed)):
            tagged.append((str(parsed[i]), parsed[i].tag_))
        finallist.append(tagged)
        test = [sent2features(s) for s in finallist]
        if os.path.isfile("crfModel/classifier.pkl"):
            crf = joblib.load("crfModel/classifier.pkl")
        else:
            return {'success': False, 'message': 'Please Train the model first'}
        predicted = crf.predict(test)
        entityList = extractEntities(predicted[0], tagged)
        return {'success': True, 'entitiesPredicted': entityList}
    except Exception as ex:
        return {'success': False, 'message': 'Error while prediction - ' + str(ex)}
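A matching usage sketch; the utterance is made up, and the entity output depends on extractEntities, which is defined elsewhere in the module:

response = predict('Book a flight from Boston to Seattle tomorrow')
if response['success']:
    for entity in response['entitiesPredicted']:
        print(entity)
else:
    print(response['message'])  # e.g. 'Please Train the model first'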
def __init__(self, language='en'):
    """
    Create a Parser object that uses Spacy for parsing and offers all the
    languages that Spacy offers. Check out: https://spacy.io/usage/models.
    Note that the language model needs to be downloaded first
    (e.g. python -m spacy download en)

    :param language: Language to parse (en/de/es/pt/fr/it/nl)
    :type language: str
    """
    # We only load spacy if a Parser is created
    # (to allow ReadTheDocs to build the documentation easily)
    import spacy

    acceptedLanguages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl']
    assert language in acceptedLanguages, \
        "Language for parser (%s) not in accepted languages: %s" % (language, str(acceptedLanguages))

    self.language = language
    if language not in Parser.languageModels:
        Parser.languageModels[language] = spacy.load(language, disable=['ner'])
    self.nlp = Parser.languageModels[language]
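Because Parser.languageModels is a class-level cache, constructing a second Parser for the same language reuses the already-loaded model; a short sketch:

p1 = Parser(language='en')
p2 = Parser(language='en')
assert p1.nlp is p2.nlp  # same cached spacy model, loaded only once
doc = p1.nlp('Time flies like an arrow.')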
def build_word_frequency_distribution():
    path = os.path.join(data_dir, 'word_freq.pickle')

    # Return the cached distribution if it has already been built.
    try:
        with open(path, 'rb') as freq_dist_f:
            freq_dist = pickle.load(freq_dist_f)
            print('frequency distribution loaded')
            return freq_dist
    except IOError:
        pass

    print('building frequency distribution')
    freq = defaultdict(int)
    for i, review in enumerate(read_reviews()):
        doc = en.tokenizer(review['text'])
        for token in doc:
            freq[token.orth_] += 1
        # Checkpoint the partial counts every 10000 reviews.
        if i % 10000 == 0:
            with open(path, 'wb') as freq_dist_f:
                pickle.dump(freq, freq_dist_f)
                print('dump at {}'.format(i))
    return freq
def build_vocabulary(lower=3, n=50000):
    # Return the cached vocabulary if it has already been built.
    try:
        with open(vocab_fn, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)
            print('vocabulary loaded')
            return vocab
    except IOError:
        print('building vocabulary')

    freq = build_word_frequency_distribution()
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]

    # Assign indices starting at `lower`, leaving the low indices free.
    vocab = {}
    i = lower
    for w, count in top_words:
        vocab[w] = i
        i += 1

    with open(vocab_fn, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)
    return vocab
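A sketch of consuming the mapping; that the ids below lower are reserved for special tokens (padding, unknown, etc.) is an assumption inferred from the offset, not stated in the snippet:

vocab = build_vocabulary(lower=3, n=50000)
UNKNOWN = 1  # hypothetical reserved id below `lower`
ids = [vocab.get(tok, UNKNOWN) for tok in ['the', 'cat', 'purrs']]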
def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./%s/vdict.json' % folder, 'r') as f:
        self.vdict = json.load(f)
    with open('./%s/adict.json' % folder, 'r') as f:
        self.adict = json.load(f)

    self.n_ans_vocabulary = len(self.adict)
    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def _set_tokenizer(self, tokenizer):
    """
    Set tokenizer
    :param tokenizer: tokenization method
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
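A brief sketch of both branches, on a hypothetical instance of the surrounding class (nltk and the spacy 'en' model must be installed):

obj._set_tokenizer('spacy')
print(obj.tokenizer("Don't tokenize me, bro."))  # -> list of token strings
# Any other name raises ValueError('Invalid tokenizing method ...')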
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    # self.max_length, self.qdic, self.adic = VQADataProvider.load_data(mode)
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)

    self.n_ans_vocabulary = len(self.adict)
    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    write_log('parsed ' + str(len(qdic)) + ' questions for ' + data_split, 'log.txt')
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file. Returns the question dictionary and the
    answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    write_log('parsed ' + str(len(qdic)) + ' questions for genome', 'log.txt')
    return qdic, adic
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION,
             max_w_length=config.LENGTH_OF_LONGEST_WORD, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.max_w_length = max_w_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/cdict.json', 'r') as f:
        self.cdict = json.load(f)
    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)

    self.n_ans_vocabulary = len(self.adict)
    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector