Python unique(): example source code
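pd.unique() returns the distinct values of a one-dimensional array-like in order of first appearance, without sorting (unlike numpy.unique). A quick demonstration:

>>> import pandas as pd
>>> pd.unique(pd.Series([2, 1, 3, 3, 1]))
array([2, 1, 3])

The snippets below show how various open-source projects use it.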

build_features.py (project: KaggleExeter, author: detomo)
import pandas as pd


def app_activity_features():
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    train.drop(['gender', 'age', 'group'], axis=1, inplace=True)
    data = pd.concat([train, test])  # DataFrame.append is deprecated; concat is equivalent here

    """Merge with the brand/model table"""
    device_table = pd.read_csv("phone_brand_device_model.csv")
    data = pd.merge(data, device_table, how='left', on='device_id')
    data = data.drop_duplicates()  # note: there is still one device associated with 2 brands/models
    del device_table
    print("data build")
    """
    Create a dataframe indicating, for each device id, which apps are present and how active they are:
        - merge events and app_events on event_id
        - group by device_id and app_id, and take the mean of activity
    """
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(list(data['device_id']))]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id', 'app_id', 'is_active']], events[['event_id', 'device_id']], on='event_id')
    apps = apps.groupby(['device_id', 'app_id'], as_index=False)['is_active'].mean()
    del events
    print("events build")
    """Reshape the dataframe so that each app is a new feature"""
    reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])), index=list(pd.unique(apps['device_id'])))
    reshaped[list(pd.unique(apps['app_id']))] = 0

    for app in list(pd.unique(apps['app_id'])):
        sliced = apps[apps['app_id'] == app]
        # assign through a single .loc call to avoid chained-indexing pitfalls
        reshaped.loc[list(sliced['device_id']), app] = sliced['is_active'].values
    del apps
    return reshaped
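The reshape loop above fills one app column at a time. Since the groupby already makes each (device_id, app_id) pair unique, the same device x app activity matrix can be built in one call; a minimal alternative sketch, assuming the same long-format apps frame as above:

reshaped = apps.pivot_table(index='device_id', columns='app_id',
                            values='is_active', fill_value=0)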



test_submission.py (project: KaggleExeter, author: detomo)
(The app_activity_features function in this file is a line-for-line duplicate of the build_features.py snippet above.)
build_mtt.py (project: sample-cnn, author: tae-jun)
from queue import Queue
from threading import Thread

import pandas as pd


def _process_dataset(anno, sample_rate, n_samples, n_threads):
  """Processes and saves the MagnaTagATune dataset using multiple threads.

  Args:
    anno: Annotation DataFrame containing tags, mp3_path, split, and shard.
    sample_rate: Sampling rate of the audios. If this differs from an audio
      file's original sampling rate, the audio is resampled.
    n_samples: Number of samples one segment contains.
    n_threads: Number of threads used to process the dataset.
  """
  args_queue = Queue()
  split_and_shard_sets = pd.unique([tuple(x) for x in anno[['split', 'shard']].values])

  for split, shard in split_and_shard_sets:
    assigned_anno = anno[(anno['split'] == split) & (anno['shard'] == shard)]
    n_shards = anno[anno['split'] == split]['shard'].nunique()

    args = (assigned_anno, sample_rate, n_samples, split, shard, n_shards)
    args_queue.put(args)

  if n_threads > 1:  # use the documented parameter rather than the global FLAGS
    threads = []
    for _ in range(n_threads):
      thread = Thread(target=_process_audio_files, args=[args_queue])
      thread.start()
      threads.append(thread)

    for thread in threads:
      thread.join()
  else:
    _process_audio_files(args_queue)
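The worker target _process_audio_files is defined elsewhere in the project; a minimal sketch of the queue-draining pattern such a worker would follow (the actual processing step is omitted):

from queue import Empty

def _process_audio_files(args_queue):
  # Pull work units until the queue is exhausted; each item is one
  # (assigned_anno, sample_rate, n_samples, split, shard, n_shards) tuple.
  while True:
    try:
      args = args_queue.get_nowait()
    except Empty:
      break
    # ... load, resample, segment, and save this shard's audio here ...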
postprocessor.py (project: calvin, author: ucd-cws)
import pandas as pd


def aggregate_regions(fp):

  # aggregate regions and supply portfolios
  # easier to do this with pandas by just reading the CSVs again
  sc = pd.read_csv(fp + '/shortage_cost.csv', index_col=0, parse_dates=True)
  sv = pd.read_csv(fp + '/shortage_volume.csv', index_col=0, parse_dates=True)
  flow = pd.read_csv(fp + '/flow.csv', index_col=0, parse_dates=True)
  demand_nodes = pd.read_csv('calvin/data/demand_nodes.csv', index_col=0)
  portfolio = pd.read_csv('calvin/data/portfolio.csv', index_col=0)

  for R in demand_nodes.region.unique():
    for t in demand_nodes.type.unique():
      ix = demand_nodes.index[(demand_nodes.region == R) &
                              (demand_nodes.type == t)]
      sc['%s_%s' % (R, t)] = sc[ix].sum(axis=1)
      sv['%s_%s' % (R, t)] = sv[ix].sum(axis=1)

  for P in portfolio.region.unique():
    for k in portfolio.supplytype.unique():
      for t in portfolio.type.unique():
        ix = portfolio.index[(portfolio.region == P) &
                             (portfolio.type == t) &
                             (portfolio.supplytype == k)]
        flow['%s_%s_%s' % (P, k, t)] = flow[ix].sum(axis=1)

  sc.to_csv(fp + '/shortage_cost.csv')
  sv.to_csv(fp + '/shortage_volume.csv')
  flow.to_csv(fp + '/flow.csv')
calvin.py (project: calvin, author: ucd-cws)
def remove_debug_links(self):
    # Drop rows whose index marks them as debug links, then rebuild the
    # node and link lists from the remaining i/j/k columns.
    df = self.df
    ix = df.index[df.index.str.contains('DBUG')]
    df.drop(ix, inplace=True, axis=0)
    self.nodes = pd.unique(df[['i', 'j']].values.ravel()).tolist()
    self.links = list(zip(df.i, df.j, df.k))
    return df
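The pd.unique(df[['i', 'j']].values.ravel()) idiom flattens both endpoint columns into one array before deduplicating, so a node that appears only as a source or only as a destination is still captured. A standalone illustration with a made-up edge list:

import pandas as pd

df = pd.DataFrame({'i': ['A', 'B', 'A'], 'j': ['B', 'C', 'D']})
print(pd.unique(df[['i', 'j']].values.ravel()).tolist())
# ['A', 'B', 'C', 'D'] -- row-major order of first appearance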
data_prep.py (project: finch, author: chrisranderson)
import numpy as np
import pandas as pd


def nominal_to_numeric(array):
  # Map each distinct value to an integer code, assigned in order of first appearance.
  mapper = {name: i for i, name in enumerate(pd.unique(array))}
  return np.array([mapper[name] for name in array])
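Because pd.unique preserves order of first appearance, codes are assigned in encounter order:

>>> nominal_to_numeric(['red', 'blue', 'red', 'green'])
array([0, 1, 0, 2])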
esc_original.py (project: echonet, author: karoldvl)
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.bands = 60
        self.segment_length = 101

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']].copy()  # copy so the 'target' assignment below modifies a real frame, not a view
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()
        self._populate(self.validation_data)
        self._populate(self.test_data)
data_node_frame.py (project: skp_edu_docker, author: TensorMSA)
def dataconf_eval_time_check(self, _wf_data_conf_node, _node_name):
        """
        Check whether the given node is an evaluation-data node
        :param _wf_data_conf_node: data conf node (e.g. nn00001_1_dataconf_node)
        :param _node_name: name of the node to check
        :return: True if the node name contains 'evaldata'
        """
        _value = False
        if ('evaldata' in _node_name):
            _value = True
        return _value
data_node_frame.py (project: skp_edu_docker, author: TensorMSA)
def set_dataconf_for_labels(self, df, label):
        """
        Extract the distinct label values from the CSV-derived dataframe
        :param df: source dataframe
        :param label: name of the label column
        :return: list of distinct label values, as strings
        """
        # TODO: consolidate with set_default_dataconf_from_csv
        label_values = pd.unique(df[label].values.ravel().astype('str')).tolist()
        return label_values
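Because of the astype('str') cast, mixed-type label columns come back as strings. A quick doctest-style illustration with a made-up frame:

>>> import pandas as pd
>>> df = pd.DataFrame({'label': ['cat', 'dog', 'cat', 3]})
>>> pd.unique(df['label'].values.ravel().astype('str')).tolist()
['cat', 'dog', '3']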
test_u_download.py (project: stream2segment, author: rizac)
def test_get_events(self, mock_query):
        urlread_sideeffect = ["""1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""]


        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert only the first two events were successfully saved
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        # AND data to save has length 2:
        assert len(data) == 2

        # now download again, with a URL error:
        urlread_sideeffect = [413, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""", URLError('blabla23___')]

        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert we got the same result as above:
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        assert len(data) == 2
        # and since the first response is 413, the request was split in two and the
        # second response is our URLError (we could test this better, anyway):
        assert "blabla23___" in self.log_msg()
list_builder.py (project: seniority_list, author: rubydatasystems)
def sort_eg_attributes(df, attributes=['doh', 'ldate'],
                       reverse_list=[0, 0],
                       add_columns=False):
    '''Sort master list attribute columns by employee group in preparation
    for list construction.  The overall master list structure and order are
    unaffected; only the selected attribute columns are sorted (normally
    date-related columns such as doh or ldate).

    inputs
        df
            The master data dataframe (does not need to be sorted)
        attributes
            columns to sort by eg (inplace)
        reverse_list
            If an attribute is to be sorted in reverse order (descending),
            use a '1' in the list position corresponding to the position of
            the attribute within the attributes input
        add_columns
            If True, an additional column for each sorted attribute will be
            added to the resultant dataframe, with the suffix '_sort' added
            to it.
    '''
    date_cols = []
    for col in df:
        if (df[col]).dtype == 'datetime64[ns]':
            date_cols.append(col)
    try:
        df.sort_values(['eg', 'eg_number'], inplace=True)
    except LookupError:
        df.sort_values(['eg', 'eg_order'], inplace=True)

    egs = df.eg.values
    i = 0
    for measure in attributes:
        data = df[measure].values
        measure_col = np.empty_like(data)
        for eg in pd.unique(df.eg):
            measure_slice = data[egs == eg]
            measure_slice_index = np.where(egs == eg)[0]
            measure_slice_sorted = np.sort(measure_slice, axis=0)

            if reverse_list[i]:
                measure_slice_invert = measure_slice_sorted[::-1]
                measure_slice_sorted = measure_slice_invert
            np.put(measure_col, measure_slice_index, measure_slice_sorted)

        if add_columns:
            col_name = measure + '_sort'
        else:
            col_name = measure

        df[col_name] = measure_col

        if measure in date_cols:
            df[col_name] = pd.to_datetime(df[col_name].dt.date)
        i += 1

    return df
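A small illustration of the per-group sort on a toy master list, assuming sort_eg_attributes is in scope (all column values here are made up):

import pandas as pd

df = pd.DataFrame({'eg': [1, 1, 2, 2],
                   'eg_number': [1, 2, 1, 2],
                   'ldate': pd.to_datetime(['2001-05-01', '2000-01-01',
                                            '1999-03-01', '1998-07-01'])})
sorted_df = sort_eg_attributes(df, attributes=['ldate'], reverse_list=[0])
# Within each eg, ldate is now ascending ('2000-01-01' before '2001-05-01'),
# while the overall (eg, eg_number) row order is untouched.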
utils.py (project: plydata, author: has2k1)
def unique(lst):
    """
    Return unique elements

    :func:`pandas.unique` and :func:`numpy.unique` cast
    mixed-type lists to a common type. They are faster, but
    sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items

    Returns
    -------
    out : list
        Unique items in the order that they appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype=object)
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string, and numpy does not
    even maintain the order!
    """
    seen = set()

    def make_seen(x):
        seen.add(x)
        return x

    return [make_seen(x) for x in lst if x not in seen]
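On Python 3.7 and later, where plain dicts preserve insertion order, the same order- and type-preserving deduplication can be written without the helper closure. A minimal equivalent sketch:

def unique(lst):
    # dict keys are unique and keep insertion order (guaranteed since Python 3.7)
    return list(dict.fromkeys(lst))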
test_chamber_of_deputies_dataset.py (project: serenata-toolbox, author: datasciencebr)
def test_fetch_translate_clean_integration(self):
        self.subject.fetch()
        files = ["Ano-{}.csv".format(n) for n in [2017]]
        files.append('datasets-format.html')

        for name in files:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.translate()
        for name in ["reimbursements-{}.xz".format(n) for n in self.years]:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.clean()
        file_path = os.path.join(self.path, 'reimbursements.xz')
        assert(os.path.exists(file_path))

        # test for subquota translation
        dataset = pd.read_csv(file_path, compression='xz')
        all_subquotas = ['Maintenance of office supporting parliamentary activity',
                         'Locomotion, meal and lodging',
                         'Fuels and lubricants',
                         'Consultancy, research and technical work',
                         'Publicity of parliamentary activity',
                         'Purchase of office supplies',
                         'Software purchase or renting; Postal services; Subscriptions',
                         'Security service provided by specialized company',
                         'Flight tickets',
                         'Telecommunication',
                         'Postal services',
                         'Publication subscriptions',
                         'Congressperson meal',
                         'Lodging, except for congressperson from Distrito Federal',
                         'Automotive vehicle renting or watercraft charter',
                         'Aircraft renting or charter of aircraft',
                         'Automotive vehicle renting or charter',
                         'Watercraft renting or charter',
                         'Taxi, toll and parking',
                         'Terrestrial, maritime and fluvial tickets',
                         'Participation in course, talk or similar event',
                         'Flight ticket issue']

        present_subquotas = pd.unique(dataset['subquota_description'])
        for subquota in present_subquotas:
            assert(subquota in all_subquotas)
esc.py (project: echonet, author: karoldvl)
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False,
                 downsample=True):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.DOWNSAMPLE = downsample
        self.SEGMENT_LENGTH = 300
        self.BANDS = 180
        self.WITH_DELTA = False
        self.FMAX = 16000
        self.FFT = 2205
        self.HOP = 441

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']].copy()  # copy so the 'target' assignment below modifies a real frame, not a view
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()

        if self.DOWNSAMPLE:
            self.SEGMENT_LENGTH //= 2
            self.BANDS //= 3

        self._populate(self.validation_data)
        self._populate(self.test_data)
data_node_frame.py (project: skp_edu_docker, author: TensorMSA)
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
        """
        Inspect the column types of the CSV-derived dataframe and build the
        data_conf (extending any data_conf that already exists). For
        categorical columns, collect the unique cell values into
        cell_feature_unique (used by the Keras side).

        :param df: source dataframe
        :param node_id: workflow node id
        :param data_dfconf_list: existing data conf entries, if any
        """
        try:
            # TODO: consolidate with set_default_dataconf_from_csv
            data_conf = dict()
            data_conf_unique_v = dict()
            data_conf_col_unique_v = dict()
            data_conf_col_type = dict()
            numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
            # If a data_dfconf already exists (WDNN), reuse the unique values it has
            # stored so that categories seen on earlier runs are preserved.
            _cell_feature_unique = list()
            if len(data_dfconf_list) > 0:
                _wf_data_conf = wf_data_conf(data_dfconf_list)
                _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf, 'cell_feature_unique') else list()
            for i, v in df.dtypes.iteritems():
                column_dtypes = dict()
                column_unique_value = dict()
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()  # treat nulls as ''
                column_dtypes['column_type'] = col_type
                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (i in _cell_feature_unique) else list()
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list  # merge with previously stored unique values
                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value
            data_conf['cell_feature'] = data_conf_col_type
            data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
            data_conf_json_str = json.dumps(data_conf)  # round-trip to plain JSON
            data_conf_json = json.loads(data_conf_json_str)
            data_conf_unique_json_str = json.dumps(data_conf_unique_v)
            data_conf_unique_json = json.loads(data_conf_unique_json_str)
            return data_conf_json, data_conf_unique_json
        except Exception as e:
            logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))

