# Source: dataset.py
# Imports assumed at module level; `fields`, `cache_embeddings`, and FTYPES
# are project-specific names defined elsewhere in this repo.
import logging

import numpy as np
import scipy.sparse
import xgboost as xgb
from sklearn.preprocessing import label_binarize

# Method of the dataset class, shown as extracted from dataset.py.
def _as_dmatrix(self):
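    """Build an xgb.DMatrix from this dataset's records.

    Dense numeric feature columns are optionally augmented with per-pid
    embeddings and sparse one-hot encodings of categorical variables.
    """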
kwargs = dict(label=self.records['label'])
kwargs['feature_names'] = self.feature_names
featdat = self.records[self.basic_feat_cols]
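    # Collapse the structured record array into a plain 2-D numeric matrix:
    # one row per record, one column per basic feature column.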
featdat = featdat.view(fields.dtype).reshape(len(featdat), -1)
if self.hps.embedding_tag:
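        # load_embeddings returns a (num_pids, embedding_dim) array of
        # embeddings precomputed by an RNN model and cached under this tag.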
embs = cache_embeddings.load_embeddings(self.hps.embedding_tag)
npids, embsize = embs.shape
assert embsize == self.hps.embedding_dimension
logging.info('Loaded {}-d embeddings from rnn model {}'.format(
embsize, self.hps.embedding_tag))
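        # Look up each record's embedding by pid and append the embedding
        # columns to the dense feature matrix.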
pids = self.records['pid']
# NB: pids are 1-indexed
pidxs = (pids-1).astype(np.int32)
lookuped = embs[pidxs]
orig_shape = featdat.shape
featdat = np.hstack((featdat, lookuped))
logging.info('Shape went from {} to {} after adding pid embeddings'.format(
orig_shape, featdat.shape))
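    # One-hot encode each categorical variable as a sparse matrix so the
    # concatenated feature data stays sparse rather than densifying.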
onehot_matrices = []
for onehot_var in self.onehot_vars:
onehot = label_binarize(self.records[onehot_var],
classes=range(1, self.FIELD_TO_NVALUES[onehot_var]+1),
sparse_output=True).astype(fields.dtype)
onehot_matrices.append(onehot)
if onehot_matrices:
# TODO: There are some perf issues with this. Look into this workaround:
# https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices/33259578#33259578
featdat = scipy.sparse.hstack([featdat,]+onehot_matrices, format='csr')
logging.info('Made dmatrix with feature data having shape {}'.format(featdat.shape))
    # Work around xgboost's handling of non-C-contiguous arrays;
    # see https://github.com/dmlc/xgboost/issues/2554
if not kwargs['label'].flags.c_contiguous:
logging.info('Contiguizing labels')
kwargs['label'] = np.ascontiguousarray(kwargs['label'])
logging.info('Contiguized')
if isinstance(featdat, np.ndarray) and not featdat.flags.c_contiguous:
logging.info('Contiguizing feature data')
featdat = np.ascontiguousarray(featdat)
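    # FTYPES is assumed to be a module-level flag indicating whether this
    # xgboost version supports the feature_types argument.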
if FTYPES:
kwargs['feature_types'] = self.feature_types
return xgb.DMatrix(featdat, **kwargs)
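
# Usage sketch (hypothetical: `_as_dmatrix` is a private method, so the
# `dataset` instance and the training parameters below are illustrative
# assumptions, not part of the original code):
#
#     dtrain = dataset._as_dmatrix()
#     booster = xgb.train({'objective': 'binary:logistic'}, dtrain,
#                         num_boost_round=100)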