def _op(self, df, verbose):
    inter_df = df
    colnames = list(self._bin_map.keys())
    if verbose:
        colnames = tqdm.tqdm(colnames)
    for colname in colnames:
        if verbose:
            colnames.set_description(colname)
        source_col = df[colname]
        loc = df.columns.get_loc(colname) + 1
        new_name = colname + "_bin"
        if self._drop:
            inter_df = inter_df.drop(colname, axis=1)
            new_name = colname
            loc -= 1
        inter_df = out_of_place_col_insert(
            df=inter_df,
            series=source_col.apply(
                self._get_col_binner(self._bin_map[colname])),
            loc=loc,
            column_name=new_name)
    return inter_df
Python tqdm() example source code
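All of the snippets collected here share the same basic pattern: wrap an iterable in tqdm (or tqdm.tqdm) to get a progress bar, then optionally call set_description() on the returned object to label it from inside the loop. As a point of reference, here is a minimal, self-contained sketch of that pattern; the item list and the sleep are placeholders, not taken from any of the quoted projects.

# Minimal sketch of the pattern used throughout these snippets.
# The data and the sleep are stand-ins for real work.
import time
import tqdm

items = ['alpha', 'beta', 'gamma']
progress = tqdm.tqdm(items)          # the returned object is still iterable
for name in progress:
    progress.set_description(name)   # label the bar with the current item
    time.sleep(0.1)                  # placeholder for real work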
def _op(self, df, verbose):
    columns_to_encode = self._columns
    if self._columns is None:
        columns_to_encode = list(set(df.select_dtypes(
            include=['object', 'category']).columns).difference(
                self._exclude_columns))
    if verbose:
        columns_to_encode = tqdm.tqdm(columns_to_encode)
    inter_df = df
    for colname in columns_to_encode:
        lbl_enc = sklearn.preprocessing.LabelEncoder()
        source_col = df[colname]
        loc = df.columns.get_loc(colname) + 1
        new_name = colname + "_enc"
        if self._drop:
            inter_df = inter_df.drop(colname, axis=1)
            new_name = colname
            loc -= 1
        inter_df = out_of_place_col_insert(
            df=inter_df,
            series=lbl_enc.fit_transform(source_col),
            loc=loc,
            column_name=new_name)
        self.encoders[colname] = lbl_enc
    return inter_df
def segment(self, *args):
    """Segment one or more datasets with this subword field.

    Arguments:
        Positional arguments: Dataset objects or other indexable
            mutable sequences to segment. If a Dataset object is provided,
            all columns corresponding to this field are used; individual
            columns can also be provided directly.
    """
    sources = []
    for arg in args:
        if isinstance(arg, Dataset):
            sources += [getattr(arg, name) for name, field in
                        arg.fields.items() if field is self]
        else:
            sources.append(arg)
    for data in sources:
        for x in tqdm(data, 'segmenting'):
            x[:] = self.vocab.segment(x)
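For context, a hypothetical call to the segment() method above. The names subword_field, train, and raw_column are illustrative assumptions, not part of the quoted source, and subword_field is assumed to already have a built vocab.

# Hypothetical usage of segment(); names are illustrative only.
raw_column = [['the', 'quick', 'brown', 'fox'],
              ['jumped', 'over', 'the', 'lazy', 'dog']]
subword_field.segment(train)         # a Dataset: segments every column bound to this field
subword_field.segment(raw_column)    # or pass an indexable mutable sequence directly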
def cbow_train(self):
    print("CBOW Training......")
    self.cbow_model.save_embedding(self.data.id2word, 'cbow_begin_embedding.txt')
    pos_all_pairs = self.data.get_cbow_batch_all_pairs(self.batch_size, self.context_size)
    pair_count = len(pos_all_pairs)
    process_bar = tqdm(range(int(pair_count / self.batch_size)))
    for _ in process_bar:
        pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size, self.window_size)
        if self.using_hs:
            pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(pos_pairs)
        else:
            pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(pos_pairs, self.context_size)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [int(pair[1]) for pair in pos_pairs]
        neg_u = [pair[0] for pair in neg_pairs]
        neg_v = [int(pair[1]) for pair in neg_pairs]
        self.optimizer.zero_grad()
        loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
        loss.backward()
        self.optimizer.step()
    print("CBOW Trained and Saving File......")
    self.cbow_model.save_embedding(self.data.id2word, self.output_file_name)
    print("CBOW Trained and Saved File.")
def load_word2emb(self, show_progress=True, batch_size=1000):
    fin_name = self.ensure_file(path.join('fasttext', '{}.zip'.format(self.lang)), url=self.url.format(self.lang))
    seen = set()
    with zipfile.ZipFile(fin_name) as fin:
        content = fin.read('wiki.{}.vec'.format(self.lang))
        lines = content.splitlines()
        if show_progress:
            lines = tqdm(lines)
        batch = []
        for line in lines:
            elems = line.decode().rstrip().split()
            vec = [float(n) for n in elems[-self.d_emb:]]
            word = ' '.join(elems[:-self.d_emb])
            if word in seen:
                continue
            seen.add(word)
            batch.append((word, vec))
            if len(batch) == batch_size:
                self.insert_batch(batch)
                batch.clear()
        if batch:
            self.insert_batch(batch)
def load_word2emb(self, show_progress=True, batch_size=1000):
    fin_name = self.ensure_file(path.join('glove', '{}.zip'.format(self.name)), url=self.setting.url)
    seen = set()
    with zipfile.ZipFile(fin_name) as fin:
        fname_zipped = [fzipped.filename for fzipped in fin.filelist if str(self.d_emb) in fzipped.filename][0]
        content = fin.read(fname_zipped)
        lines = content.splitlines()
        if show_progress:
            lines = tqdm(lines, total=self.setting.size)
        batch = []
        for line in lines:
            elems = line.decode().rstrip().split()
            vec = [float(n) for n in elems[-self.d_emb:]]
            word = ' '.join(elems[:-self.d_emb])
            if word in seen:
                continue
            seen.add(word)
            batch.append((word, vec))
            if len(batch) == batch_size:
                self.insert_batch(batch)
                batch.clear()
        if batch:
            self.insert_batch(batch)
def load_word2emb(self, show_progress=True, batch_size=1000):
    fin_name = self.ensure_file('kazuma.tar.gz', url=self.url)
    seen = set()
    with tarfile.open(fin_name, 'r:gz') as fzip:
        ftxt = fzip.extractfile('charNgram.txt')
        content = ftxt.read()
        ftxt.close()
        lines = content.splitlines()
        if show_progress:
            lines = tqdm(lines)
        batch = []
        for line in lines:
            elems = line.decode().rstrip().split()
            vec = [float(n) for n in elems[-self.d_emb:]]
            word = ' '.join(elems[:-self.d_emb])
            if word in seen:
                continue
            seen.add(word)
            batch.append((word, vec))
            if len(batch) == batch_size:
                self.insert_batch(batch)
                batch.clear()
        if batch:
            self.insert_batch(batch)
def render_posts(self):
    """Render posts using jinja2 templates."""
    for post in tqdm(self.posts, unit=' pages', miniters=1, desc="Posts"):
        template_name = "%s.html" % post.meta.template
        template = self.jinja2.get_template(template_name)
        html = post.html.decode("utf-8", 'ignore')
        rv = template.render(content=html, meta=post.meta, posts=self.posts,
                             plugin_data=self.plugin_data, config=self.config,
                             categories=self.posts_by_category.get_as_dict(),
                             tags=self.posts_by_tag.get_as_dict(),
                             templates=self.posts_by_template.get_as_dict(),
                             microdata=self.posts_by_microdata.get_as_dict())
        # Linting
        linter_results = self.linter.lint(post, rv, self)
        # Are we stopping on linting errors?
        if linter_results.has_errors and self.config.linter.stop_on_error:
            print(post.filename)
            for error in linter_results.info:
                print("\t-%s:%s" % (error[0], error[1]))
            sys.exit(-1)
        path = "%s%s/" % (self.get_output_dir(), post.meta.permanent_url)
        path = path.replace('//', '/')
        files.write_file(path, 'index.html', rv)
### Templates functions ###
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    with open(file_path, 'r') as snli_file:
        logger.info("Reading SNLI instances from jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(snli_file):
            example = json.loads(line)
            label = example["gold_label"]
            if label == '-':
                # These were cases where the annotators disagreed; we'll just skip them. It's
                # like 800 out of 500k examples in the training data.
                continue
            premise = example["sentence1"]
            hypothesis = example["sentence2"]
            instances.append(self.text_to_instance(premise, hypothesis, label))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path):
    instances = []
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line_num, line in enumerate(tqdm.tqdm(data_file)):
            line = line.strip("\n")
            if not line:
                continue
            line_parts = line.split('\t')
            if len(line_parts) != 2:
                raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
            source_sequence, target_sequence = line_parts
            instances.append(self.text_to_instance(source_sequence, target_sequence))
    if not instances:
        raise ConfigurationError("No instances read!")
    return Dataset(instances)
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()
    generator = iterator(dataset, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)
    return model.get_metrics()
def train():
    rnn.train()
    total_loss = 0
    hidden = rnn.init_hidden(args.batch_size)
    for data, label in tqdm(training_data, mininterval=1,
                            desc='Train Processing', leave=False):
        optimizer.zero_grad()
        hidden = repackage_hidden(hidden)
        target, hidden = rnn(data, hidden)
        loss = criterion(target, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm(rnn.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.data
    return total_loss[0] / training_data.sents_size
# ##############################################################################
# Save Model
# ##############################################################################
def train():
    rnn.train()
    total_loss = 0
    hidden = rnn.init_hidden()
    for data, label in tqdm(training_data, mininterval=1,
                            desc='Train Processing', leave=False):
        optimizer.zero_grad()
        hidden = repackage_hidden(hidden)
        target, hidden = rnn(data, hidden)
        loss = criterion(target, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    return total_loss[0] / training_data.sents_size
# ##############################################################################
# Save Model
# ##############################################################################
def fit(self,
        data_x_train,
        data_x_dev=None,
        data_x_test=None,
        n_epochs=10,
        batch_size=10):
    assert n_epochs > 0
    assert batch_size < data_x_train.shape[0]
    size_x_train = data_x_train.shape[0]
    n_batches = size_x_train // batch_size
    for e in range(n_epochs):
        epoch_costs = np.zeros(n_batches)
        bar = tqdm(range(n_batches), desc='Epoch: {:d}'.format(e))
        for i in bar:
            batch_x = data_x_train[i * batch_size:(i + 1) * batch_size]
            err = self.partial_fit(batch_x)
            epoch_costs[i] = err
        mean_cost = epoch_costs.mean()
        print('Train error: {:.4f}'.format(mean_cost))
        if data_x_dev is not None:
            random_indices = np.random.randint(0, data_x_dev.shape[0], batch_size)
            batch_x = data_x_dev[random_indices]
            err = self.get_cost(batch_x)
            print('Validation data error: {:.4f}'.format(err))
        if data_x_test is not None:
            err = self.get_cost(data_x_test)
            print('Test data error: {:.4f}'.format(err))
def get_tqdm_progressbar(iterator):
    sys.stderr.flush()
    return tqdm.tqdm(iterator, bar_format='{desc}{percentage:3.0f}%|{bar}|[{elapsed}<{remaining}, {rate_fmt}]', ncols=72)
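A possible way to use the helper above; the loop body is a placeholder, not part of the quoted source.

# Hypothetical usage of get_tqdm_progressbar().
for record in get_tqdm_progressbar(range(1000)):
    pass  # process `record` here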
def validate(args):
    # Setup Dataloader
    data_loader = get_loader(args.dataset)
    data_path = get_data_path(args.dataset)
    loader = data_loader(data_path, split=args.split, is_transform=True, img_size=(args.img_rows, args.img_cols))
    n_classes = loader.n_classes
    valloader = data.DataLoader(loader, batch_size=args.batch_size, num_workers=4)
    running_metrics = runningScore(n_classes)

    # Setup Model
    model = get_model(args.model_path[:args.model_path.find('_')], n_classes)
    state = convert_state_dict(torch.load(args.model_path)['model_state'])
    model.load_state_dict(state)
    model.eval()
    model.cuda()

    for i, (images, labels) in tqdm(enumerate(valloader)):
        images = Variable(images.cuda(), volatile=True)
        labels = Variable(labels.cuda(), volatile=True)
        outputs = model(images)
        pred = outputs.data.max(1)[1].cpu().numpy()
        gt = labels.data.cpu().numpy()
        running_metrics.update(gt, pred)

    score, class_iou = running_metrics.get_scores()
    for k, v in score.items():
        print(k, v)
    for i in range(n_classes):
        print(i, class_iou[i])
def setup(self, pre_encode=False):
    sbd_path = get_data_path('sbd')
    voc_path = get_data_path('pascal')
    target_path = self.root + '/SegmentationClass/pre_encoded/'
    if not os.path.exists(target_path):
        os.makedirs(target_path)
    sbd_train_list = tuple(open(sbd_path + 'dataset/train.txt', 'r'))
    sbd_train_list = [id_.rstrip() for id_ in sbd_train_list]
    self.files['train_aug'] = self.files['train'] + sbd_train_list
    if pre_encode:
        print("Pre-encoding segmentation masks...")
        for i in tqdm(sbd_train_list):
            lbl_path = sbd_path + 'dataset/cls/' + i + '.mat'
            lbl = io.loadmat(lbl_path)['GTcls'][0]['Segmentation'][0].astype(np.int32)
            lbl = m.toimage(lbl, high=lbl.max(), low=lbl.min())
            m.imsave(target_path + i + '.png', lbl)
        for i in tqdm(self.files['trainval']):
            lbl_path = self.root + '/SegmentationClass/' + i + '.png'
            lbl = self.encode_segmap(m.imread(lbl_path))
            lbl = m.toimage(lbl, high=lbl.max(), low=lbl.min())
            m.imsave(target_path + i + '.png', lbl)
def build_feature_files(base_directory,
                        new_directory,
                        data_loader,
                        n=None,
                        negative_example_keep_prob=1.0):
    os.makedirs(new_directory, exist_ok=False)
    episode_paths = frame.episode_paths(base_directory)
    label_counts = [0, 0]
    if n is not None:
        np.random.shuffle(episode_paths)
        episode_paths = episode_paths[:n]
    for episode_path in tqdm.tqdm(episode_paths):
        try:
            features, labels = data_loader.load_features_and_labels([episode_path])
        except:
            traceback.print_exc()
        else:
            keep = np.logical_or(labels, (np.less(
                np.random.rand(len(labels)), negative_example_keep_prob)))
            labels = labels[keep]
            for i in range(len(label_counts)):
                label_counts[i] += np.count_nonzero(labels == i)
            features = {k: v[keep] for k, v in features.items()}
            new_path = path_relative_to_new_directory(base_directory, new_directory, episode_path,
                                                      ".features")
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            with open(new_path, 'wb') as f:
                pickle.dump((features, labels), f)
    return label_counts
def copy_episodes(indir, outdir, n):
    episode_paths = frame.episode_paths(indir)
    np.random.shuffle(episode_paths)
    episode_paths = episode_paths[:n]
    start = len(indir)
    for p in tqdm.tqdm(episode_paths):
        assert p.startswith(indir), p
        outfile = outdir + p[start:]
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        shutil.copyfile(p, outfile)
def label_episodes(directory, classifier):
    episode_paths = frame.episode_paths(directory)
    data_loader = DataLoader(hparams=classifier.hparams)
    for episode_path in tqdm.tqdm(episode_paths):
        try:
            data_loader.predict_episodes(classifier, [episode_path], prefix="frame/classifier_")
        except EOFError:
            traceback.print_exc()
            print("Error reading {}".format(episode_path))
            os.remove(episode_path)