Example source code using Python's DataFrame() class

get_actions.py (project: JDcontest, author: zsyandjyhouse)
def get_actions(start_time, end_time):
    """
    :param start_time: start of the time window (inclusive)
    :param end_time: end of the time window (inclusive)
    :return: actions: pd.DataFrame
    """
    FilePath = "../JData/"
    ActionAllFile = "JData_Action_All.csv"
    #ActionAllFile = "JData_Action_before_327.csv"
    action_all = pd.read_csv(FilePath + ActionAllFile, nrows=100000)
    action_all.time = pd.to_datetime(action_all['time'], format='%Y-%m-%d %H:%M:%S')
    actions = action_all[(action_all.time >= start_time) & (action_all.time <= end_time)]
    return actions
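
A minimal usage sketch (assumes pandas is imported as pd in the module and that ../JData/JData_Action_All.csv exists with a 'time' column):

actions = get_actions('2016-03-01 00:00:00', '2016-03-05 23:59:59')
print(actions.shape)   # rows inside the window, all original columns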
fragility_curve.py (project: WNTR, author: USEPA)
def cdf_probability(self, x):
        """
        Return the CDF probability for each state, based on the value of x

        Parameters
        -----------
        x : pd.Series
            Control variable for each element

        Returns
        --------
        Pr : pd.DataFrame
            Probability of exceeding a damage state

        """
        state_names = [name for name, state in self.states()]

        Pr = pd.DataFrame(index=x.index, columns=state_names)

        for element in Pr.index:
            for state_name, state in self.states():
                try:
                    dist = state.distribution[element]
                except KeyError:
                    # fall back to the default distribution for this state
                    dist = state.distribution['Default']
                Pr.loc[element, state_name] = dist.cdf(x[element])

        return Pr
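
The same pattern in isolation: a hedged sketch with hypothetical states and elements (not WNTR's actual FragilityCurve API) that fills a DataFrame with each state's scipy CDF evaluated at every element's control value.

import pandas as pd
import scipy.stats as stats

x = pd.Series({'pipe1': 0.3, 'pipe2': 0.7})    # control variable per element
states = {'minor': stats.norm(0.2, 0.1), 'major': stats.norm(0.8, 0.1)}

Pr = pd.DataFrame(index=x.index, columns=list(states))
for element in Pr.index:
    for state_name, dist in states.items():
        Pr.loc[element, state_name] = dist.cdf(x[element])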
core.py (project: SimpleSQLite, author: thombashi)
def select_as_dataframe(
            self, table_name, column_list=None, where=None, extra=None):
        """
        Get data in the database and return fetched data as a
        :py:class:`pandas.DataFrame` instance.

        :param str table_name: |arg_select_table_name|
        :param list column_list: |arg_select_as_xx_column_list|
        :param str where: |arg_select_where|
        :param str extra: |arg_select_extra|
        :return: Table data as a :py:class:`pandas.DataFrame` instance.
        :rtype: pandas.DataFrame
        :raises simplesqlite.NullDatabaseConnectionError:
            |raises_check_connection|
        :raises simplesqlite.TableNotFoundError:
            |raises_verify_table_existence|
        :raises simplesqlite.OperationalError: |raises_operational_error|

        :Example:
            :ref:`example-select-as-dataframe`

        .. note::
            The ``pandas`` package is required to execute this method.
        """

        import pandas

        if column_list is None:
            column_list = self.get_attr_name_list(table_name)

        result = self.select(
            select=",".join(SqlQuery.to_attr_str_list(column_list)),
            table_name=table_name, where=where, extra=extra)

        if result is None:
            return pandas.DataFrame()

        return pandas.DataFrame(result.fetchall(), columns=column_list)
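
The fetchall-to-DataFrame pattern on its own, using the standard-library sqlite3 module rather than SimpleSQLite (the table and columns are made up for illustration):

import sqlite3
import pandas

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE t (a INTEGER, b TEXT)')
con.executemany('INSERT INTO t VALUES (?, ?)', [(1, 'x'), (2, 'y')])
result = con.execute('SELECT a, b FROM t')
df = pandas.DataFrame(result.fetchall(), columns=['a', 'b'])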
histogrammar_filler.py (project: Eskapade, author: KaveIO)
def construct_empty_hist(self, columns):
        """Create an (empty) histogram of right type

        Create a multi-dim histogram by iterating through the columns in
        reverse order and passing a single-dim hist as input to the next
        column.

        :param list columns: histogram columns
        :returns: created histogram
        :rtype: histogrammar.Count
        """

        hist = hg.Count()

        # create a multi-dim histogram by iterating through the columns in reverse order
        # and passing a single-dim hist as input to the next column
        for col in reversed(columns):
            # histogram type depends on the data type
            dt = np.dtype(self.var_dtype[col])

            # processing function, e.g. only accept booleans during filling
            f = self.quantity[col] if col in self.quantity else hf.QUANTITY[dt.type]
            if len(columns) == 1:
                # df[col] is a pd.Series
                quant = lambda x, fnc=f: fnc(x)
            else:
                # df[columns] is a pd.DataFrame
                # fix column to col
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])

            is_number = isinstance(dt.type(), np.number)
            is_timestamp = isinstance(dt.type(), np.datetime64)

            if is_number or is_timestamp:
                # numbers and timestamps are put in a sparse binned histogram
                bs = self.bin_specs.get(col, self._unit_bin_specs if is_number else self._unit_timestamp_specs)
                hist = hg.SparselyBin(binWidth=bs['bin_width'], origin=bs['bin_offset'], quantity=quant, value=hist)
            else:
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=quant, value=hist)

        # FIXME: stick data types and number of dimensions onto the histogram
        dta = [self.var_dtype[col] for col in columns]
        hist.datatype = dta[0] if len(columns) == 1 else dta
        hist.n_dim = len(columns)

        def n_bins(self):
            if hasattr(self, 'num'):
                return self.num
            elif hasattr(self, 'size'):
                return self.size
            else:
                raise RuntimeError('Cannot retrieve number of bins from hgr hist')
        # a @property assigned to an instance attribute is never evaluated;
        # bind n_bins as a method on the histogram instance instead, so it is
        # still looked up lazily, after the histogram has been filled
        import types
        hist.n_bins = types.MethodType(n_bins, hist)

        return hist
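
What the reversed-column loop builds, written out by hand for the two-column case ['category', 'value']; a sketch that reuses only the histogrammar calls already shown above:

import histogrammar as hg

hist = hg.Count()                                       # innermost: plain counter
hist = hg.SparselyBin(binWidth=1.0, origin=0.0,         # numeric column wraps it
                      quantity=lambda x: x['value'], value=hist)
hist = hg.Categorize(quantity=lambda x: x['category'],  # string column outermost
                     value=hist)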
forecaster.py (project: prophet, author: facebook)
def make_all_seasonality_features(self, df):
        """Dataframe with seasonality features.

        Includes seasonality features, holiday features, and added regressors.

        Parameters
        ----------
        df: pd.DataFrame with dates for computing seasonality features and any
            added regressors.

        Returns
        -------
        pd.DataFrame with regression features.
        list of prior scales for each column of the features dataframe.
        """
        seasonal_features = []
        prior_scales = []

        # Seasonality features
        for name, props in self.seasonalities.items():
            features = self.make_seasonality_features(
                df['ds'],
                props['period'],
                props['fourier_order'],
                name,
            )
            seasonal_features.append(features)
            prior_scales.extend(
                [props['prior_scale']] * features.shape[1])

        # Holiday features
        if self.holidays is not None:
            features, holiday_priors = self.make_holiday_features(df['ds'])
            seasonal_features.append(features)
            prior_scales.extend(holiday_priors)

        # Additional regressors
        for name, props in self.extra_regressors.items():
            seasonal_features.append(pd.DataFrame(df[name]))
            prior_scales.append(props['prior_scale'])

        if len(seasonal_features) == 0:
            seasonal_features.append(
                pd.DataFrame({'zeros': np.zeros(df.shape[0])}))
            prior_scales.append(1.)
        return pd.concat(seasonal_features, axis=1), prior_scales
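
The block-assembly idea in isolation: each feature group contributes one DataFrame plus one prior scale per column, and everything is concatenated column-wise at the end (toy numbers, not Prophet's internals):

import numpy as np
import pandas as pd

blocks, priors = [], []
blocks.append(pd.DataFrame({'weekly_1': np.sin(np.arange(5.0))}))
priors.extend([10.0] * blocks[-1].shape[1])   # one prior scale per column
blocks.append(pd.DataFrame({'regressor_x': np.ones(5)}))
priors.append(0.5)
features = pd.concat(blocks, axis=1)          # 5 rows, one column per feature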
forecaster.py (project: prophet, author: facebook)
def predict_seasonal_components(self, df):
        """Predict seasonality components, holidays, and added regressors.

        Parameters
        ----------
        df: Prediction dataframe.

        Returns
        -------
        DataFrame with seasonal components.
        """
        seasonal_features, _ = self.make_all_seasonality_features(df)
        lower_p = 100 * (1.0 - self.interval_width) / 2
        upper_p = 100 * (1.0 + self.interval_width) / 2

        components = pd.DataFrame({
            'col': np.arange(seasonal_features.shape[1]),
            'component': [x.split('_delim_')[0] for x in seasonal_features.columns],
        })
        # Add total for all regression components
        components = pd.concat([components, pd.DataFrame({
            'col': np.arange(seasonal_features.shape[1]),
            'component': 'seasonal',
        })])
        # Add totals for seasonality, holiday, and extra regressors
        components = self.add_group_component(
            components, 'seasonalities', self.seasonalities.keys())
        if self.holidays is not None:
            components = self.add_group_component(
                components, 'holidays', self.holidays['holiday'].unique())
        components = self.add_group_component(
            components, 'extra_regressors', self.extra_regressors.keys())
        # Remove the placeholder
        components = components[components['component'] != 'zeros']

        X = seasonal_features.values
        data = {}
        for component, features in components.groupby('component'):
            cols = features.col.tolist()
            comp_beta = self.params['beta'][:, cols]
            comp_features = X[:, cols]
            comp = (
                np.matmul(comp_features, comp_beta.transpose())
                * self.y_scale  # noqa W503
            )
            data[component] = np.nanmean(comp, axis=1)
            data[component + '_lower'] = np.nanpercentile(comp, lower_p, axis=1)
            data[component + '_upper'] = np.nanpercentile(comp, upper_p, axis=1)
        return pd.DataFrame(data)
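
The per-component posterior arithmetic in isolation: X[:, cols] (n x k) times beta[:, cols].T (k x n_samples) yields one posterior draw per column, from which the mean and the interval endpoints are taken (illustrative shapes only):

import numpy as np

X = np.random.rand(10, 3)       # n=10 rows, k=3 feature columns
beta = np.random.rand(200, 3)   # 200 posterior samples of the coefficients
comp = np.matmul(X, beta.T)     # shape (10, 200): one draw per column
mean = np.nanmean(comp, axis=1)
lower = np.nanpercentile(comp, 10, axis=1)
upper = np.nanpercentile(comp, 90, axis=1)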
ensemble_method_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def _df_initial_fixer(df, word, sample=60000):
    '''
    function:
    - randomly select "sample" rows (images) from the dataframe df
      and drop features that are not used in ensemble-method modeling

    input:
        df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]
        sample = number of samples to extract from df [int]

    output:
    new dataframe

    '''
    print "total number of images for df_{}: {}".format(word, len(df))
    random_index = np.random.choice(list(df.index), sample, replace=False)
    df = df.loc[list(random_index)]
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
                        'X_per_stroke','Y_per_stroke','time_per_stroke',\
                        'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
                        'direction'], axis=1)
    return df_test
ensemble_method_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def _df_initial_fixer_cc(df, word):
    '''
    prepares the feature dataframe for the xgboost country-code classifier

    function:
    - delete features that are not used in ensemble-method modeling

    input:
        df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]

    output:
    new dataframe

    '''
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
                        'X_per_stroke','Y_per_stroke','time_per_stroke',\
                        'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
                        'direction'], axis=1)
    return df_test
ensemble_method_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def _country_initial_fixer(df,country,limit):
    '''
    Function:
    extracts data for one country and randomly selects at most "limit" rows
    from that subset

    Input:
    df = dataframe (should contain a 'countrycode' feature) [DataFrame]
    country = 2-letter capitalized country code [string]
    limit = max number of rows to take into the new dataframe

    Output:
    dataframe containing data from the selected country (# of rows <= limit)

    note: uses random.seed(32113)
    '''
    if df[df['countrycode']==country].count()[0] > limit:
        df_c = df[df['countrycode']==country]
        random_c = np.random.choice(list(df_c.index), limit, replace=False)
        df_c = df_c.loc[list(random_c)]
    else:
        df_c = df[df['countrycode']==country]
    return df_c
fsData.py (project: foamBazar, author: BV-DR)
def postProcessingDatFile(fname, objName=None, root='./'):
    if objName is not None:
        dataFolder = postProcessingFolder(objName, root=root)
        timeNames = timeFolder(root=dataFolder)
    else:
        dataFolder = addslash(root)
        timeNames = []
    if len(timeNames)==0: timeNames=['']    # at least check the current folder
    keyName = os.path.basename(rmslash(fname))
    keyName = os.path.splitext(keyName)[0]
    datFiles = []
    for subdir in timeNames:
        found = filesOnly(sorted(glob.glob(dataFolder + subdir + "/" + keyName + "*.dat")))
        for f in found: datFiles.append(f)
    return datFiles

# Concatenate DataFrames and optionally merge the x-axis.
# When indices overlap, keep either 'last', 'first', or False (drop all duplicates).
# list_of_data must contain pandas.DataFrame objects. (A sketch of such a helper follows.)
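
The helper those comments describe is not shown here; a minimal sketch of the behavior they outline (the name and signature are assumptions):

import pandas as pd

def concatDataFrames(list_of_data, keep='last'):
    # Concatenate on the index; where index values overlap, keep the
    # 'last' or 'first' occurrence, or drop all duplicates with keep=False.
    # mergesort is stable, so input order is preserved within equal indices.
    data = pd.concat(list_of_data, axis=0).sort_index(kind='mergesort')
    return data[~data.index.duplicated(keep=keep)]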
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def feature_eng_pt3(df_cf):
    '''
    function:
    - feature engineering pt3
      need to run this after feature_eng_pt2, since pt4 and pt5
      use features created in this function.

    - Create following features:
      direction = direction of stroke (from first XY points to last XY points)
                    in radian (0 to 6.28...) [float]

    input:
      df_cf = output dataframe from feature_eng_pt2

    output:
      dataframe with above features and filter

    Approach: find the first and last x,y locations of each stroke and compute
    delta x (dx) and delta y (dy); from there, the direction of the stroke in
    radians is computed with the user-defined helper "_radian_direction"
    (a plausible sketch of that helper follows this function).
    '''
    # vectorize the helper once, outside the loop
    vecrad_direction = np.vectorize(_radian_direction)
    direction = {}
    for index in df_cf.index:
        dx = [float(df_cf.drawing[index][stroke][1][-1] - df_cf.drawing[index][stroke][1][0]) \
          for stroke in xrange(df_cf.stroke_number[index])]
        dy = [float(df_cf.drawing[index][stroke][0][-1] - df_cf.drawing[index][stroke][0][0]) \
          for stroke in xrange(df_cf.stroke_number[index])]
        dx = np.array(dx)
        dy = np.array(dy)
        dx[dx == 0] = 0.000001  # avoid a zero dx when computing the angle
        direction[index] = vecrad_direction(dy, dx)
    df_cf['direction'] = pd.Series(direction)
    return df_cf
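
_radian_direction itself is referenced but not shown; a plausible implementation (an assumption, not the project's code) maps (dy, dx) to an angle in [0, 2*pi):

import numpy as np

def _radian_direction(dy, dx):
    # arctan2 returns angles in (-pi, pi]; shift negatives into [0, 2*pi)
    return np.arctan2(dy, dx) % (2 * np.pi)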
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def load_json(filename):
    '''
    Function:
        - opens json file and store information in a pandas dataframe
        - also prints an aggregated df with counts of pictures by countrycode
    Input:
        1. filename/path ex: ./data/filename.json
    Output:
        1. new dataframe containing json info
    '''
    df = pd.read_json(filename, lines=True)
    test = df.groupby(df['countrycode']).count()
    print test.sort_values(by='drawing', ascending=False).head(15)
    return df
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def pic_viewer(df_cf, _id):
    '''
    Function:
        - If X and Y columns exist in your dataframe, you can use this
          function to view the drawing with a specific id.
        - run this after running CNN_feat_eng_pt1 or feature_eng_pt2
    Input:
        1. dataframe df_cf
        2. object id _id
    Output:
        1. scatter plot of x and y
    '''
    plt.scatter(df_cf.X[_id],df_cf.Y[_id])
    plt.gca().invert_yaxis()
feature_utils.py (project: 2020plus, author: KarchinLab)
def random_sort(df, prng=None):
    """Randomly shuffle a DataFrame.

    NOTE: if the training data is not randomly shuffled, then
    supervised learning may find artifacts related to the order
    of the data.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with feature information
    prng : np.random.RandomState, optional
        random state used for shuffling; a fresh one is created if omitted

    Returns
    -------
    df : pd.DataFrame
        Randomly shuffled data frame
    """
    # get new random state if not specified
    if prng is None:
        prng = np.random.RandomState()

    # get random order
    random_indices = prng.choice(df.index.values,  # sample from 'genes'
                                 len(df),  # number of samples
                                 replace=False)  # sample without replacement

    # change order of df
    random_df = df.loc[random_indices].copy()

    return random_df
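
Usage sketch with a seeded RandomState so the shuffle is reproducible:

import numpy as np
import pandas as pd

df = pd.DataFrame({'feat': range(5)}, index=list('abcde'))
shuffled = random_sort(df, prng=np.random.RandomState(42))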
feature_utils.py (project: 2020plus, author: KarchinLab)
def process_mutational_features(mydf):
    """Performs feature processing pipeline.

    Parameters
    ----------
    mydf : pd.DataFrame
        data frame containing the desired raw data for computation of
        features for classifier

    Returns
    -------
    proc_feat_df: pd.DataFrame
        dataframe consisting of features for classification
    """
    # rename columns to ensure compatibility with previously
    # written code
    mydf = mydf.rename(columns={'Protein_Change': 'AminoAcid',
                                'DNA_Change': 'Nucleotide'})

    # process features
    feat_list = fmat.generate_feature_matrix(mydf, 2)
    headers = feat_list.pop(0)  # remove header row
    feat_df = pd.DataFrame(feat_list, columns=headers)  # convert to data frame
    proc_feat_df = normalize_mutational_features(feat_df, 0)
    miss_ent_df = pentropy.missense_position_entropy(mydf[['Gene', 'AminoAcid']])
    # mut_ent_df = pentropy.mutation_position_entropy(mydf[['Gene', 'AminoAcid']])

    # incorporate entropy features
    #proc_feat_df['mutation position entropy'] = mut_ent_df['mutation position entropy']
    #proc_feat_df['pct of uniform mutation entropy'] = mut_ent_df['pct of uniform mutation entropy']
    proc_feat_df['missense position entropy'] = miss_ent_df['missense position entropy']
    proc_feat_df['pct of uniform missense entropy'] = miss_ent_df['pct of uniform missense entropy']
    return proc_feat_df
dimension_fitting.py (project: adel, author: openalea-incubator)
def pandadf2adeldict(df):
    ''' Convert a pandas DataFrame into a dict of numpy arrays '''
    d = df.to_dict()
    return dict((k,np.array([v for v in dv.itervalues()])) for k, dv in d.iteritems())
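
Usage sketch (the iteritems/itervalues calls above tie this to Python 2, where plain-dict value order is arbitrary):

import pandas as pd

df = pd.DataFrame({'plant': [1, 2], 'axe': [3, 4]})
d = pandadf2adeldict(df)   # {'plant': array(...), 'axe': array(...)}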
tfa.py (project: inferelator_ng, author: simonsfoundation)
def compute_transcription_factor_activity(self, allow_self_interactions_for_duplicate_prior_columns = True):
        # Find TFs that have non-zero columns in the priors matrix
        non_zero_tfs = self.prior.columns[(self.prior != 0).any(axis=0)].tolist()

        # Delete tfs that have neither prior information nor expression
        delete_tfs = set(self.prior.columns).difference(self.prior.index).difference(non_zero_tfs)
        # Raise warnings
        if len(delete_tfs) > 0:
            message = " ".join([str(len(delete_tfs)).capitalize(),
             "transcription factors are removed because no expression or prior information exists."])
            warnings.warn(message)
            self.prior = self.prior.drop(delete_tfs, axis = 1)

        # Create activity dataframe with values set by default to the transcription factor's expression
        activity = pd.DataFrame(self.expression_matrix.loc[self.prior.columns,:].values,
                index = self.prior.columns,
                columns = self.expression_matrix.columns)

        # Find all non-zero TFs that are duplicates of any other non-zero tfs
        is_duplicated = self.prior[non_zero_tfs].transpose().duplicated(keep=False)
        duplicates = is_duplicated[is_duplicated].index.tolist()

        # Find non-zero TFs that are also present in target gene list 
        self_interacting_tfs = set(non_zero_tfs).intersection(self.prior.index)

        # If this flag is set to true, don't count duplicates as self-interacting when setting the diag to zero
        if allow_self_interactions_for_duplicate_prior_columns:
            self_interacting_tfs = self_interacting_tfs.difference(duplicates)

        # Set the diagonal of the matrix subset of self-interacting tfs to zero
        subset = self.prior.loc[self_interacting_tfs, self_interacting_tfs].values
        np.fill_diagonal(subset, 0)
        self.prior.at[self_interacting_tfs, self_interacting_tfs] = subset

        # Set the activity of non-zero tfs to the pseudoinverse of the prior matrix times the expression
        if non_zero_tfs:
            activity.loc[non_zero_tfs,:] = np.matrix(linalg.pinv2(self.prior[non_zero_tfs])) * np.matrix(self.expression_matrix_halftau)

        return activity
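
The activity estimate in isolation: TFA = pinv(P) . X, where P is the (genes x TFs) prior and X the (genes x samples) expression; a toy-shaped sketch using scipy.linalg.pinv in place of the older pinv2:

import numpy as np
from scipy import linalg

P = np.random.rand(20, 5)          # prior: 20 genes x 5 TFs
X = np.random.rand(20, 8)          # expression: 20 genes x 8 samples
activity = linalg.pinv(P).dot(X)   # estimated activity: 5 TFs x 8 samples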
fsData.py (project: foamBazar, author: BV-DR)
def usage():
    print '''
# template for loading openfoam data into pandas.dataframe

import sys
sys.path.append("/home/soseng/OpenFOAM/bv/foamBazar/pythonScripts/")
import fsData as fs
from matplotlib import pyplot as plt
if __name__ == "__main__":
    log = fs.loadLogData("-p res -w init,Ux,Uy,Uz", logfiles=['log.run','fsLog'])
    mot = fs.loadMotionInfo("motionInfo", root='./')
    vbm = fs.loadInternalLoads("vbm", root='./', fnames=['my','fz','acc'])    
    '''
fsData.py (project: foamBazar, author: BV-DR)
def setmetadata(data, label=None, info=None, module=None, args=None):
    data.fsData = deepcopy(FSDATA)
    data.fsData['label'] = label
    data.fsData['info'] = info if info is not None else 'last update: ' + datetime.date.today().strftime("%I:%M%p %B %d, %Y")
    data.fsData['module'] = module
    data.fsData['args'] = args if args is not None else {
        'lastUpdate': datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    }

# Load log data given a list of logfiles/folders.
# Data will be merged and returned as a pandas.DataFrame.
# cmd: is the fsPlot.py command-line argument string,
# e.g.: loadLogData('-p res', logfiles=['log.run0','log.run1',''])
# When indices overlap, keep either 'last', 'first', or False.
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def feature_engineering_ensemble(df,category,sample=60000,purpose='word',\
                                            countries = ['US','BR','RU','KR']):
    '''
    function:
    - aggregates multiple user-defined functions to create the dataframe for ensemble-method modeling
    - it also prints out how long it takes to run
    - processes the raw Google Quick, Draw! dataframe
    - after this processing, the dataframe contains 404 features
    - the output of this function will be used for ensemble-method modeling

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word')  
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction
    - countries = list of country code used in country prediction

    output:
    - pickled dataframe that will be used for ensemble method (404 features)
    filename: "./data/MY_feature_{}.pkl".format(category)
    '''
    start_time = time.time()
    #runs feature_eng_pt1 through pt5.
    df_test1 = feature_eng_pt1(df)
    df_test2 = feature_eng_pt2(df_test1)
    df_test3 = feature_eng_pt3(df_test2)
    df_subset = feature_eng_pt4(df_test3)
    df_subset2 = feature_eng_pt5(df_test3)
    df_final = pd.concat([df_test3,df_subset,df_subset2], axis=1)

    # prepare the final dataframe
    # if purpose == 'word', randomly select 'sample' datapoints from df_final
    if purpose == 'word':
        df_final.index = xrange(len(df_final))
        random_ind = np.random.choice(list(df_final.index), sample, replace=False)
        df_final = df_final.loc[list(random_ind)]
    # if purpose == 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_final = df_final[df_final['countrycode'].isin(countries)]
    df_final.index = df_final['key_id']
    df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def feature_engineering_CNN(df,category,sample=60000,purpose='word',countries = ['US','BR','RU','KR']):
    '''
    function:
    - aggregates 2 user-defined functions that prepare the dataframe for CNN modeling
    - it also prints out how long it takes to run

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word') 
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction
    - countries = list of country codes used in country prediction

    output:
    - pickled dataframe that will be used for CNN modeling (1176 features)
    - each row represents 42 by 28 pixel image
    file name: "./data/{}.pkl".format(category)
    '''

    start_time = time.time()
    #runs CNN feature engineering functions
    df_1 = CNN_feat_eng_pt1(df)
    df_2 = CNN_feat_eng_pt2(df_1)
    # if purpose == 'word', randomly select 'sample' datapoints from df_2
    if purpose == 'word':
        df_2.index = xrange(len(df_2))
        random_ind = np.random.choice(list(df_2.index), sample, replace=False)
        df_2 = df_2.loc[list(random_ind)]
    # if purpose == 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_2 = df_2[df_2['countrycode'].isin(countries)]
    df_2.index = df_2['key_id']
    df_2.to_pickle("./data/{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2



##############################################################################
#           functions for feature engineering for ensemble methods           #
##############################################################################
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def feature_eng_pt1(df_cf):
    '''
    function:
    - feature engineering pt1
      need to run this first since pt2 to pt5 uses features created
      in this function.

    - create following features:
      stroke_number = total stroke number of an image [int]
      final time = time of the last datapoints for an image (how long it took user to draw) [int]
      recognized = changed True/False response to boolean
                              (1 is true, 0 is false)[int]

    - Filtering applied:
      1: filtered out data where recognized == 0.
          Having unrecognized images in the dataset may reduce prediction accuracy.
      2: filtered out data where stroke_number is greater than 15.
          After analysis, most pictures were drawn in under 15 strokes; images
          with 20-30+ strokes were likely drawn with a graphics tablet, so
          images above 15 strokes are excluded to keep all images drawn in a
          similar environment.
      3: filtered out data where final_time is greater than 20000.
          Quick, Draw! asks users to draw within 20 seconds, yet some images
          report time values above 20000 ms; those rows are dropped.

    input:
    df_cf = dataframe created from a Google Quick, Draw! raw-data json file

    output:
    dataframe with additional features mentioned above
    '''
    # create feature "stroke_number"
    df_cf['stroke_number']=df_cf['drawing'].str.len()

    #create feature "final_time"
    df_cf['final_time'] = [df_cf.loc[index,'drawing']\
                [df_cf.stroke_number[index]-1][2][-1] for index in df_cf.index]

    #setting boolean and changing recognized features to 1 and 0.
    b_loon = {True: 1, False:0}
    df_cf['recognized'] = df_cf['recognized'].map(b_loon)

    #filtered data by stroke number, recognized and final time features
    df_cf = df_cf[(df_cf['recognized']==1) & (df_cf['stroke_number'] <= 15)]
    df_cf = df_cf[(df_cf['final_time']<=20000)]
    return df_cf
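
A toy input sketch in the raw Quick, Draw! layout this function expects ('drawing' is a list of strokes, each stroke being [x_list, y_list, t_list]); the values are invented for illustration:

import pandas as pd

toy = pd.DataFrame({
    'drawing': [[[[0, 1], [0, 1], [0, 120]]],                        # 1 stroke
                [[[0, 1], [0, 1], [0, 80]], [[1, 2], [1, 2], [90, 150]]]],
    'recognized': [True, False],
})
out = feature_eng_pt1(toy)   # keeps only recognized rows with <= 15 strokes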
feature_engineering_func.py (project: quickdraw_prediction_model, author: keisukeirie)
def feature_eng_pt4(df_cf):
    '''
    function:
    - feature engineering pt4
      creates a new dataframe that needs to be combined with the output
      dataframe of feature_eng_pt3
    - it creates 5 features per stroke
    - these 5 features are created for the first 15 strokes of an image

    - Create following features:
      datapoint_percentage_stroke'i' = # of data points in stroke i divided by
                            the total number of data points of an image [float]
            * not to be confused with the dp_percent_per_stroke column made earlier:
              dp_percent_per_stroke is a list; datapoint_percentage_stroke'i' is a float!

      direction_stroke'i' = direction of stroke 'i' [float]

      time_stroke'i' = total time spent on stroke'i' [int]

      datapoints_stroke'i' = number of data points in stroke i [int]

      switch_stroke'i' = boolean indicating whether stroke'i' exists in an image;
                            0: stroke exists, 1: stroke does not exist [int]

    input:
      df_cf = output dataframe from feature_eng_pt3

    output:
      new dataframe with 75 features (5 * 15 features)
    '''

    ar = np.zeros((len(df_cf),75))
    c = 0
    for index_ in df_cf.index:
        stroke = (df_cf.stroke_number[index_])
        ar[c][:stroke] = np.array(df_cf['dp_percent_per_stroke'][index_])
        ar[c][15:15+stroke] = np.array(df_cf['direction'][index_])
        ar[c][30:30+stroke] = np.array(df_cf['total_time_of_stroke'][index_])
        ar[c][45:45+stroke] = np.array(df_cf['dp_per_stroke'][index_])
        ar[c][60:75] = np.array([0]*stroke+[1]*(15-stroke))
        c += 1
    subset = pd.DataFrame(ar)
    subset.index = df_cf.index
    for num in xrange(15):
        subset = subset.rename(columns={num:"datapoint_percentage_stroke{}".format(num)})
    for num in xrange(15,30):
        subset = subset.rename(columns={num:"direction_stroke{}".format(num-15)})
    for num in xrange(30,45):
        subset = subset.rename(columns={num:"time_stroke{}".format(num-30)})
    for num in xrange(45,60):
        subset = subset.rename(columns={num:"datapoint_stroke{}".format(num-45)})
    for num in xrange(60,75):
        subset = subset.rename(columns={num:"switch_stroke{}".format(num-60)})
    return subset
chx_generic_functions.py (project: chxanalys, author: yugangzhang)
def extract_data_from_file(  filename, filepath, good_line_pattern, good_cols=None, labels=None,):
    '''YG, developed Oct 17, 2018
        Extract data from a file
    Input:
        filename: str, filename of the data
        filepath: str, path of the data
        good_line_pattern: str, data will be extracted below this good_line_pattern
        good_cols: list of integers, indices of the columns to keep
        labels: the labels of the good_cols
        #save: False, if True will save the data into a csv file with filename appending csv ??
    Return:
        a pandas.DataFrame
    Example:
    filepath =  '/XF11ID/analysis/2017_3/lwiegart/Link_files/Exports/'
    filename = 'ANPES2 15-10-17 16-31-11-84Exported.txt'    
    good_cols = [ 1,2,4,6,8,10 ]
    labels = [  'time', 'temperature', 'force', 'distance', 'stress', 'strain'  ]
    good_line_pattern = "Index\tX\tY\tX\tY\tX\tY" 
    df =  extract_data_from_file(  filename, filepath, good_line_pattern, good_cols, labels)
    '''
    import pandas as pds
    with open(filepath + filename, 'r') as fin:
        p = fin.readlines()
        di = 1e20   # sentinel line index; set once the pattern is found
        for i, line in enumerate(p):
            if good_line_pattern in line:
                di = i
            if i == di + 1:
                els = line.split()
                if good_cols is None:
                    data = np.array(els, dtype=float)
                else:
                    data = np.array([els[j] for j in good_cols], dtype=float)
            elif i > di:
                try:
                    els = line.split()
                    if good_cols is None:
                        temp = np.array(els, dtype=float)
                    else:
                        temp = np.array([els[j] for j in good_cols], dtype=float)
                    data = np.vstack((data, temp))
                except (ValueError, IndexError):
                    # skip lines that do not parse as numeric data
                    pass
        if labels is None:
            labels = np.arange(data.shape[1])
        df = pds.DataFrame(data, index=np.arange(data.shape[0]), columns=labels)
    return df

