# Source: dataset.py
# Imports assumed at module level; `fields`, `cache_embeddings`, and FTYPES
# are project-specific names defined elsewhere in this repo.
import logging

import numpy as np
import scipy.sparse
import xgboost as xgb
from sklearn.preprocessing import label_binarize

# Method of the dataset class, shown as extracted from dataset.py.
def _as_dmatrix(self):
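    """Build an xgb.DMatrix from this dataset's records.

    Dense numeric feature columns are optionally augmented with per-pid
    embeddings and sparse one-hot encodings of categorical variables.
    """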
kwargs = dict(label=self.records['label'])
kwargs['feature_names'] = self.feature_names
featdat = self.records[self.basic_feat_cols]
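    # Collapse the structured record array into a plain 2-D numeric matrix:
    # one row per record, one column per basic feature column.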
featdat = featdat.view(fields.dtype).reshape(len(featdat), -1)
if self.hps.embedding_tag:
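        # load_embeddings returns a (num_pids, embedding_dim) array of
        # embeddings precomputed by an RNN model and cached under this tag.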
embs = cache_embeddings.load_embeddings(self.hps.embedding_tag)
npids, embsize = embs.shape
assert embsize == self.hps.embedding_dimension
logging.info('Loaded {}-d embeddings from rnn model {}'.format(
embsize, self.hps.embedding_tag))
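        # Look up each record's embedding by pid and append the embedding
        # columns to the dense feature matrix.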
pids = self.records['pid']
# NB: pids are 1-indexed
pidxs = (pids-1).astype(np.int32)
lookuped = embs[pidxs]
orig_shape = featdat.shape
featdat = np.hstack((featdat, lookuped))
logging.info('Shape went from {} to {} after adding pid embeddings'.format(
orig_shape, featdat.shape))
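    # One-hot encode each categorical variable as a sparse matrix so the
    # concatenated feature data stays sparse rather than densifying.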
onehot_matrices = []
for onehot_var in self.onehot_vars:
onehot = label_binarize(self.records[onehot_var],
classes=range(1, self.FIELD_TO_NVALUES[onehot_var]+1),
sparse_output=True).astype(fields.dtype)
onehot_matrices.append(onehot)
if onehot_matrices:
# TODO: There are some perf issues with this. Look into this workaround:
# https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices/33259578#33259578
featdat = scipy.sparse.hstack([featdat,]+onehot_matrices, format='csr')
logging.info('Made dmatrix with feature data having shape {}'.format(featdat.shape))
    # Work around xgboost's handling of non-C-contiguous arrays;
    # see https://github.com/dmlc/xgboost/issues/2554
if not kwargs['label'].flags.c_contiguous:
logging.info('Contiguizing labels')
kwargs['label'] = np.ascontiguousarray(kwargs['label'])
logging.info('Contiguized')
if isinstance(featdat, np.ndarray) and not featdat.flags.c_contiguous:
logging.info('Contiguizing feature data')
featdat = np.ascontiguousarray(featdat)
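    # FTYPES is assumed to be a module-level flag indicating whether this
    # xgboost version supports the feature_types argument.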
if FTYPES:
kwargs['feature_types'] = self.feature_types
return xgb.DMatrix(featdat, **kwargs)
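
# Usage sketch (hypothetical: `_as_dmatrix` is a private method, so the
# `dataset` instance and the training parameters below are illustrative
# assumptions, not part of the original code):
#
#     dtrain = dataset._as_dmatrix()
#     booster = xgb.train({'objective': 'binary:logistic'}, dtrain,
#                         num_boost_round=100)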