Example source code for Python's notnull()
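Every snippet below is taken from an open-source project and shows pandas' notnull() in context. As a quick reference, here is a minimal, illustrative demonstration of how pd.notnull behaves on scalars and on a Series (the toy data is ours, not from any of the projects below):

import numpy as np
import pandas as pd

# On scalars, notnull() is simply the negation of isnull()
print(pd.notnull(3.14))    # True
print(pd.notnull(np.nan))  # False
print(pd.notnull(None))    # False
print(pd.notnull(pd.NaT))  # False

# On a Series it works elementwise and returns a boolean mask,
# the filtering idiom that recurs throughout the snippets below
s = pd.Series([1.0, np.nan, 3.0])
print(s[pd.notnull(s)])    # keeps only the non-null rows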

quality.py (project: ssbio, author: SBRG)
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    # NOTE: str.strip removes any of the listed characters from either end,
    # not a literal suffix, so strip('.pdb') trims '.', 'p', 'd' and 'b' characters
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
cli.py (project: eemeter, author: openeemeter)
def trace_serializer(trace):
    data = OrderedDict([
        ("type", "ARBITRARY_START"),
        ("interpretation", trace.interpretation),
        ("unit", trace.unit),
        ("trace_id", trace.trace_id),
        ("interval", trace.interval),
        ("records", [
            OrderedDict([
                ("start", start.isoformat()),
                ("value", record.value if pd.notnull(record.value) else None),
                ("estimated", bool(record.estimated)),
            ])
            for start, record in trace.data.iterrows()
        ]),
    ])
    return data
dataframe_utils.py (project: fileflow, author: industrydive)
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaN values and saves it to a file via pandas.to_csv.

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. If None, the string of data
        will be returned.
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string.
    # Note: `dtype` and `skipinitialspace` are read_csv options, not to_csv
    # options, so they are dropped here; NaNs are written as empty strings.
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', index=False,
                         na_rep='', quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result
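The `data.where(pd.notnull(data), None)` idiom above keeps each value where the mask is True and substitutes None elsewhere. A minimal sketch of the effect (toy data; on numeric dtypes the result can vary across pandas versions, since None may be coerced back to NaN):

import numpy as np
import pandas as pd

# With object dtype, the substituted None survives as a real Python None
df = pd.DataFrame({'a': [1, np.nan], 'b': ['x', np.nan]}, dtype=object)
cleaned = df.where(pd.notnull(df), None)
print(cleaned.values)  # [[1 'x'] [None None]]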
bdsim.py (project: georges, author: chernals)
def element_to_bdsim(e):
    """Convert a pandas.Series representation onto a BDSim sequence element."""
    bdsim = ""
    if e.KEYWORD in ['MARKER', 'INSTRUMENT']:
        bdsim = "{}: {};".format(e.name.replace('$', ''), "marker")
    if e.KEYWORD in ['DRIFT', 'QUADRUPOLE', 'RBEND', 'SBEND']:
        bdsim = "{}: {}, l={}*m".format(e.name.replace('$', ''), e.KEYWORD.lower(), e.L)
        if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
            bdsim += f",angle=-{e['BENDING_ANGLE']}"
        elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
            bdsim += f",angle=-{e.get('ANGLE', 0)}"
        else:
            # Angle property not supported by the element or absent
            bdsim += ""
        #if pd.notnull(e['APERTYPE']):
        #    bdsim += ", aperture={}*m".format(str(e['APERTURE']).strip('[]'))
        if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')):
            bdsim += ", {}={{{{ {} or '0.0' }}}}".format(e['PLUG'].lower(), e['CIRCUIT'])
        bdsim += ';'
    return bdsim
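A quick illustrative call (toy element, not from the georges test suite): a plain quadrupole exercises only the length branch, since none of the optional angle or plug attributes are present.

import pandas as pd

e = pd.Series({'KEYWORD': 'QUADRUPOLE', 'L': 0.5}, name='Q1')
print(element_to_bdsim(e))  # Q1: quadrupole, l=0.5*m;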
madx.py (project: georges, author: chernals)
def element_to_mad(e):
    """Convert a pandas.Series representation onto a MAD-X sequence element."""
    if e.CLASS not in SUPPORTED_CLASSES:
        return ""
    mad = "{}: {}, ".format(e.name, e.CLASS)
    if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
        mad += f"ANGLE={e['BENDING_ANGLE']},"
    elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
        mad += f"ANGLE={e.get('ANGLE', 0)},"
    else:
        # Angle property not supported by the element or absent
        mad += ""
    mad += ', '.join(["{}={}".format(p, e[p]) for p in SUPPORTED_PROPERTIES if pd.notnull(e.get(p, None))])
    if pd.notnull(e['LENGTH']) and e['LENGTH'] != 0.0:
        mad += ", L={}".format(e['LENGTH'])
    if pd.notnull(e.get('APERTYPE', None)):
        mad += ", APERTURE={}".format(str(e['APERTURE']).strip('[]'))
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')) and pd.isnull(e.get('VALUE')):
        mad += ", {}:={}".format(e['PLUG'], e['CIRCUIT'])
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('VALUE')):
        mad += ", {}={}".format(e['PLUG'], e['VALUE'])
    mad += ", AT={}".format(e['AT_CENTER'])
    mad += ";"
    return mad
Sensitivity.py (project: gullikson-scripts, author: kgullikson88)
def get_sec_spt(row):
    """
    Get the secondary spectral type from the information we have. Meant to be
    called as the `apply` method of a pandas DataFrame.
    """
    if pd.notnull(row['Sp2']):
        return row['Sp2']
    elif pd.notnull(row['Sp1']) and pd.notnull(row['mag1']) and pd.notnull(row['mag2']):
        # TODO: Do better than assuming V band!
        band = 'V'
        absmag_prim = MS.GetAbsoluteMagnitude(row['Sp1'], color=band)
        dm = float(row['mag1']) - absmag_prim
        absmag_sec = float(row['mag2']) - dm
        return MS.GetSpectralType_FromAbsMag(absmag_sec, color=band)[0]
    elif pd.notnull(row['Sp1']) and pd.notnull(row['K1']) and pd.notnull(row['K2']):
        mass = MS.Interpolate('mass', row['Sp1'])
        q = float(row['K1']) / float(row['K2'])
        sec_mass = q * mass
        return MS.GetSpectralType('mass', sec_mass)[0]
    else:
        print(row)
        raise ValueError('Must give enough information to figure out the spectral type!')
pandatools.py (project: zeex, author: zbarge)
def series_is_datetime(series: pd.Series, check_num: int=5, dropna: bool=True):
    """
    Samples random rows in a Series and counts how many coerce to datetime versus how many fail.
    :param series:
    :param check_num:
    :param dropna:
    :return:
    """
    if dropna:
        series = series.dropna(axis=0)
    got, lost = 0, 0
    size = (check_num if series.index.size > check_num else series.index.size)

    if size > 0:
        checks = np.random.randint(0, high=series.index.size, size=size)
        for x in series[checks].tolist():
            try:
                x = pd.Timestamp(x)
                if pd.notnull(x):
                    got += 1
            except (ValueError, OverflowError):
                lost += 1

    return got > lost
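An illustrative call with toy data: because the rows are sampled at random with replacement, the verdict on mixed data can vary from run to run.

import pandas as pd

s = pd.Series(['2020-01-01', '2020-02-15', 'not a date'])
print(series_is_datetime(s))  # usually True: most sampled values coerce to Timestamp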
regression_nn.py (project: NTHU-Machine-Learning, author: YuChunLOL)
def to_csv(self, filepath='hypothesis/SGD_hypothesis_header.csv'):
        df = pd.DataFrame()
        df = pd.concat([df, pd.DataFrame([['depth', self.depth]])], ignore_index=True)
        df = pd.concat([df, pd.DataFrame([['sizes'] + [self.input_size+1] \
                                                    + [hidden_size+1 for hidden_size in self.hidden_sizes] \
                                                    + [self.output_size]])], ignore_index=True)
        for i, weight in enumerate(self.best_weights):
            df = pd.concat([df, pd.DataFrame([['W_{}'.format(i)] + weight.T.flatten().tolist()])], ignore_index=True)

        # Fill NaN with None
        df = df.where((pd.notnull(df)), None)

        # Since a column that contains `None` is upcast to float (ints would be
        # written out as e.g. 1.0), we write the file manually instead of using to_csv.
        with open(filepath, 'w') as f:
            for row in range(df.shape[0]):
                for col in range(df.shape[1]):
                    if (row == 0 and col != 0) or (row == 1 and col != 0):
                        val = int(df[col][row]) if df[col][row] is not None else ''
                    else:
                        val = df[col][row] if df[col][row] is not None else ''
                    f.writelines('{},'.format(val))
                if row != df.shape[0]-1: f.writelines('\n')
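A minimal illustration of the quirk this method works around: once a column contains None/NaN, pandas upcasts its integers to float, and to_csv would then write 1.0 where 1 was stored.

import pandas as pd

df = pd.DataFrame({'n': [1, None]})
print(df['n'].dtype)           # float64: the 1 has silently become 1.0
print(df.to_csv(index=False))  # the integer is written back as "1.0"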
movie_data.py (project: parade, author: bailaohe)
def execute_internal(self, context, **kwargs):
        """
        the internal execution process to be implemented
        :param context:
        :param kwargs:
        :return:
        """
        df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

        # Process projection on the dataset to get our interested attributes
        df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

        # Filter out records with *NAN* title_year and budget
        df = df[pd.notnull(df['title_year'])]
        df = df[df['budget'] > 0]

        # Extract the genres ROOT
        df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

        return df
Test_Formula.py (project: MarketMakingProfitability, author: MiesJansen)
def Join_Inputs(df, df_betas, df_ff_params, df_liq_prox):
    # add beta values & set index to datetime from df_diff
    df = pd.merge(df, df_betas, left_on='cusip_id', 
                  right_on='cusip_id', left_index=True)

    df['trd_exctn_dt_idx'] = pd.to_datetime(df['trd_exctn_dt'],\
                                        format='%Y%m%d')
    df.set_index('trd_exctn_dt_idx', inplace=True)

    # Join with fama-french factors on date index
    df_join_ff = df.join(df_ff_params, lsuffix="_m", rsuffix='_b')
    # Drop any rows where dates in df_diff do not appear in fama-french
    df_join_ff = df_join_ff[pd.notnull(df_join_ff['MKT_b'])]

    # Combine liquidity factor L_t
    df_liq_prox_values = df_liq_prox['residual_term']
    df_join_liq = df_join_ff.join(df_liq_prox_values)
    df_join_liq = df_join_liq[pd.notnull(df_join_liq['residual_term'])]

    return df_join_liq
test_pandas.py (project: craft-ai-client-python, author: craft-ai)
def test_decide_from_contexts_df_null_decisions():
  tree = CLIENT.get_decision_tree(AGENT_ID,
                                  COMPLEX_AGENT_DATA.last_valid_index().value // 10 ** 9)

  test_df = pd.DataFrame(
    [
      ["Jean-Pierre", "+02:00"],
      ["Paul"]
    ],
    columns=["b", "tz"],
    index=pd.date_range("20130201", periods=2, freq="D"))

  df = CLIENT.decide_from_contexts_df(tree, test_df)
  assert_equal(len(df), 2)
  assert pd.isnull(df["a_predicted_value"][0])
  assert pd.notnull(df["error"][0])

  assert pd.notnull(df["a_predicted_value"][1])
  assert pd.isnull(df["error"][1])
client.py (project: craft-ai-client-python, author: craft-ai)
def add_operations(self, agent_id, operations):
    if isinstance(operations, pd.DataFrame):
      if not isinstance(operations.index, pd.DatetimeIndex):
        raise CraftAiBadRequestError("Invalid dataframe given, it is not time indexed")

      chunk_size = self.config["operationsChunksSize"]

      for chunk in chunker(operations, chunk_size):
        chunk_operations = [
          {
            "timestamp": row.name.value // 10 ** 9, # Timestamp.value returns nanoseconds
            "context": {
              col: row[col] for col in operations.columns if pd.notnull(row[col])
            }
          } for _, row in chunk.iterrows()
        ]
        super(Client, self).add_operations(agent_id, chunk_operations)

      return {
        "message": "Successfully added %i operation(s) to the agent \"%s/%s/%s\" context."
                   % (len(operations), self.config["owner"], self.config["project"], agent_id)
      }
    else:
      return super(Client, self).add_operations(agent_id, operations)
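The per-row dict comprehension above silently drops null columns from each context, so sparse rows produce smaller context dicts. A minimal illustration with toy data:

import numpy as np
import pandas as pd

ops = pd.DataFrame({'a': [1.0, np.nan], 'b': ['x', 'y']},
                   index=pd.date_range('2023-01-01', periods=2, freq='D'))
contexts = [{col: row[col] for col in ops.columns if pd.notnull(row[col])}
            for _, row in ops.iterrows()]
print(contexts)  # [{'a': 1.0, 'b': 'x'}, {'b': 'y'}]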
interpreter.py (project: craft-ai-client-python, author: craft-ai)
def decide_from_row(tree, columns, row):
  time = Time(
    t=row.name.value // 10 ** 9, # Timestamp.value returns nanoseconds
    timezone=row.name.tz
  )
  context = {
    col: row[col] for col in columns if pd.notnull(row[col])
  }
  try:
    decision = VanillaInterpreter.decide(tree, [context, time])

    keys, values = zip(*[
      (output + "_" + key, value)
      for output, output_decision in decision["output"].items()
      for key, value in output_decision.items()
    ])

    return pd.Series(data=values, index=keys)
  except CraftAiNullDecisionError as e:
    return pd.Series(data=[e.message], index=["error"])
missingno.py (project: missingno, author: ResidentMario)
def _calculate_geographic_nullity(geo_group, x_col, y_col):
    """
    Helper method which calculates the nullity of a DataFrame. Factored out of and used within `geoplot`.
    """
    # Aggregate by point and fetch a list of non-null coordinate pairs, which is returned.
    point_groups = geo_group.groupby([x_col, y_col])
    points = [point for point in point_groups.groups.keys() if pd.notnull(point[0]) and pd.notnull(point[1])]
    # Calculate nullities by location, then take their average within the overall feature.
    counts = np.sum(point_groups.count().values, axis=1)
    entries = point_groups.size()
    width = len(geo_group.columns)
    # Remove empty (NaN, NaN) points.
    if len(entries) > 0:  # explicit check to avoid a Runtime Warning
        geographic_nullity = np.average(1 - counts / width / entries)
        return points, geographic_nullity
    else:
        return points, np.nan
jstreecontrol.py (project: tmtk, author: thehyve)
def _get_hd_args(path, high_dim_node, annotation):
    """
    Create dict with meta tags that belong to a certain high dimensional node.
    """
    map_file = high_dim_node.sample_mapping

    s = map_file.slice_path(path).iloc[:, 5].unique()
    t = map_file.slice_path(path).iloc[:, 6].unique()

    hd_args = {'hd_sample': ', '.join(s.astype(str)) if pd.notnull(s[0]) else '',
               'hd_tissue': ', '.join(t.astype(str)) if pd.notnull(t[0]) else '',
               'hd_type': Mappings.annotation_data_types.get(high_dim_node.params.datatype),
               }

    if annotation:
        hd_args.update({'pl_marker_type': annotation.marker_type,
                        'pl_genome_build': annotation.params.get('GENOME_RELEASE', ''),
                        'pl_title': annotation.params.get('TITLE', ''),
                        'pl_id': annotation.platform})
    return hd_args
p3.py (project: Uber-DS-Challenge, author: bjherger)
def extract_days(input_delta):
    """
    Helper function to extract the number of days from a time delta. Returns:
     - the number of days, for a valid time delta
     - np.NaN, if the time delta is null or invalid
    :param input_delta:
    :return: number of days in time delta
    :rtype: float
    """

    # Attempt to coerce into Pandas time delta
    delta = pd.Timedelta(input_delta)

    # Attempt to extract number of days
    days = np.NaN
    if pd.notnull(delta):
        days = delta.days

    # Return result
    return days
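An illustrative usage with toy values: pd.Timedelta(None) yields NaT, which pd.notnull() rejects, so the NaN default is returned for the missing entry.

import pandas as pd

deltas = pd.Series([pd.Timedelta(days=3), None])
print(deltas.apply(extract_days))  # 3.0 for the valid delta, NaN for the null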
asset.py (project: py-investment, author: kprestel)
def kama(self, efficiency_ratio_periods=10, ema_fast=2, ema_slow=30,
             period=20, column='adj_close'):
        er = self._efficiency_ratio_computation(
                period=efficiency_ratio_periods, column=column)
        fast_alpha = 2 / (ema_fast + 1)
        slow_alpha = 2 / (ema_slow + 1)
        smoothing_constant = pd.Series(
                (er * (fast_alpha - slow_alpha) + slow_alpha) ** 2,
                name='smoothing_constant')
        sma = pd.Series(self.ohlcv[column].rolling(period).mean(), name='SMA')
        kama = []
        for smooth, ma, price in zip(iter(smoothing_constant.items()),
                                     iter(sma.shift(-1).items()),
                                     iter(self.ohlcv[column].items())):
            try:
                kama.append(kama[-1] + smooth[1] * (price[1] - kama[-1]))
            except (IndexError, TypeError):
                # no prior KAMA yet: the list is empty, or the previous value is None
                if pd.notnull(ma[1]):
                    kama.append(ma[1] + smooth[1] * (price[1] - ma[1]))
                else:
                    kama.append(None)
        sma['KAMA'] = pd.Series(kama, index=sma.index,
                                name='{} days KAMA Ticker {}'.format(period,
                                                                     self.ticker))
        yield sma['KAMA']
finta.py (project: finta, author: peerchemist)
def KAMA(cls, ohlc, er=10, ema_fast=2, ema_slow=30, period=20):
        """Developed by Perry Kaufman, Kaufman's Adaptive Moving Average (KAMA) is a moving average designed to account for market noise or volatility.
        Its main advantage is that it takes into consideration not just the direction, but the market volatility as well."""

        er = cls.ER(ohlc, er)
        fast_alpha = 2 / (ema_fast + 1)
        slow_alpha = 2 / (ema_slow + 1)
        sc = pd.Series((er * (fast_alpha - slow_alpha) + slow_alpha)**2, name="smoothing_constant") ## smoothing constant

        sma = pd.Series(ohlc["close"].rolling(period).mean(), name="SMA") ## first KAMA is SMA
        kama = []
        # Current KAMA = Prior KAMA + smoothing_constant * (Price - Prior KAMA)
        for s, ma, price in zip(sc.iteritems(), sma.shift().iteritems(), ohlc["close"].iteritems()):
            try:
                kama.append(kama[-1] + s[1] * (price[1] - kama[-1]))
            except (IndexError, TypeError):
                # no prior KAMA yet: the list is empty, or the previous value is None
                if pd.notnull(ma[1]):
                    kama.append(ma[1] + s[1] * (price[1] - ma[1]))
                else:
                    kama.append(None)

        sma["KAMA"] = pd.Series(kama, index=sma.index, name="{0} period KAMA.".format(period)) ## apply the kama list to existing index
        return sma["KAMA"]
graphs.py (project: WellApplication, author: inkenbrandt)
def markGaps(self):
        """Produces dictionary of list of gaps in time series data based on the presence of nan values;
        used for gantt plotting

        :returns: dateranges; a dictionary with station names as keys and lists of begin and end dates as values
        """
        df = self.data
        stations = self.stations

        dateranges = {}
        for station in stations:
            dateranges[station] = []
            first = df.loc[:, station].first_valid_index()
            last = df.loc[:, station].last_valid_index()
            records = df.loc[first:last, station]
            #dateranges[station].append(pd.to_datetime(first))
            for i in range(len(records) - 1):
                if pd.isnull(records.iloc[i + 1]) and pd.notnull(records.iloc[i]):
                    dateranges[station].append(pd.to_datetime(records.index)[i])
                elif pd.isnull(records.iloc[i]) and pd.notnull(records.iloc[i + 1]):
                    dateranges[station].append(pd.to_datetime(records.index)[i])
            dateranges[station].append(pd.to_datetime(last))
        return dateranges
history_container.py (project: zipline-chinese, author: zhanghan1990)
def update_last_known_values(self):
        """
        Store the non-NaN values from our oldest frame in each frequency.
        """
        ffillable = self.ffillable_fields
        if not len(ffillable):
            return

        for frequency in self.unique_frequencies:
            digest_panel = self.digest_panels.get(frequency, None)
            if digest_panel:
                oldest_known_values = digest_panel.oldest_frame(raw=True)
            else:
                oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

            oldest_vals = oldest_known_values
            oldest_columns = self.fields
            for field in ffillable:
                f_idx = oldest_columns.get_loc(field)
                field_vals = oldest_vals[f_idx]
                # isnan would be fast, possible to use?
                non_nan_sids = np.where(pd.notnull(field_vals))
                key = (frequency.freq_str, field)
                key_loc = self.last_known_prior_values.index.get_loc(key)
                self.last_known_prior_values.values[
                    key_loc, non_nan_sids
                ] = field_vals[non_nan_sids]
uniprot.py (project: ssbio, author: SBRG)
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
backend.py (project: berlin-devfest-2016-backend, author: giansegato)
def processData(data):
    df = pd.DataFrame.transpose(pd.read_json(json.dumps(data)))
    df = df.dropna(subset = [key for key in df.keys() if "x_" in key])
    df = df[pd.notnull(df['y_observed'])]

    X = df[[key for key in df.keys() if "x_" in key]].values
    y = df["y_observed"].values

    return X, y

# 5th: initial model
solution.py (project: Kaggle, author: lawlite19)
def pre_processData(train_data, file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age'] = np.mean(train_data.Age)  # fill missing Age values with the mean age
    train_data.loc[(train_data.Cabin.notnull(), 'Cabin')] = 'yes'  # non-null Cabin becomes 'yes'
    train_data.loc[(train_data.Cabin.isnull(), 'Cabin')] = 'no'
    '''0/1 one-hot encoding'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'], prefix='Cabin')  # get_dummies expands a column into 0/1 indicator columns; prefix prepends 'Cabin' to their names
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')
    train_data = pd.concat([train_data, dummies_cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)  # concatenate the dataframes column-wise (axis=1)
    train_data.drop(['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket'], axis=1, inplace=True)  # drop the original, now-encoded columns
    header_string = ','.join(train_data.columns.tolist())  # build the CSV header string from the column names
    np.savetxt(file_path + r'/pre_processData1.csv', train_data, delimiter=',', header=header_string)  # save the intermediate result
    '''standardize the numeric features (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # if Fare has missing values, fill them with the mean
        train_data.loc[(train_data.Fare.isnull(), 'Fare')] = np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])
    header_string = ','.join(train_data.columns.tolist())  # build the CSV header string from the column names
    np.savetxt(file_path + r'/pre_processData_scaled.csv', train_data, delimiter=',', header=header_string)  # save the scaled result
    return train_data






## feature engineering
apply_matcher.py (project: py_stringsimjoin, author: anhaidgroup)
def generate_tokens(table, key_attr, join_attr, tokenizer):
    table_nonnull = table[pd.notnull(table[join_attr])]
    return dict(zip(table_nonnull[key_attr],
                    table_nonnull[join_attr].apply(tokenizer.tokenize)))
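A hedged usage sketch: real py_stringsimjoin workflows pass a py_stringmatching tokenizer, so `SimpleTokenizer` below is our own hypothetical stand-in that only mimics the `tokenize` method the function relies on. The row with a null join attribute is filtered out by pd.notnull before tokenization.

import numpy as np
import pandas as pd

class SimpleTokenizer:
    """Hypothetical stand-in tokenizer: splits on whitespace."""
    def tokenize(self, value):
        return str(value).split()

table = pd.DataFrame({'id': [1, 2, 3],
                      'name': ['big data', np.nan, 'data science']})
print(generate_tokens(table, 'id', 'name', SimpleTokenizer()))
# {1: ['big', 'data'], 3: ['data', 'science']}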
titanic.py (project: tensorflow, author: KirovVerst)
def preprocess_data(path, is_test=False):
    data = pd.read_csv(path, index_col='PassengerId')
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    if is_test:
        data = data.replace([None], [0])
    else:
        data = data[pd.notnull(data['Age'])]
        data = data[pd.notnull(data['Embarked'])]
    data.replace(["female", "male"], [0, 1], inplace=True)
    data.replace(["Q", "C", "S"], [0, 1, 2], inplace=True)
    if "Survived" in data:
        data = data[pd.notnull(data['Survived'])]
    data_norm = (data - data.mean()) / (data.max() - data.min())
    return data_norm
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Line plot:
        #self.vmax = max(self.vmax, ct.values.max())
        #ct.plot(ax=plt.gca(), color=self.get_palette())
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Stacked area plot:
        #if len(self._groupby) == 2:
            #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
        #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## percentage area plot:
        ## if there is only one grouping variable (the time column),
        ## the cross table produces a Series, not a data frame. It
        ## isn't really very informative to plot it, but we provide
        ## for this special case anyway.
        #if type(ct) == pd.Series:
            #ct = ct.apply(lambda x: 100)
        #else:
            #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
        #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
base.py (project: eemeter, author: openeemeter)
def _save_series(self, series):
        data = [
            [
                d.strftime(self.cache_date_format), t
                if pd.notnull(t) else None
            ]
            for d, t in series.iteritems()
        ]
        self.json_store.save_json(self._get_cache_key(), data)
noaa.py (project: eemeter, author: openeemeter)
def save_series(self, year, series):
        key = self._get_cache_key(year)
        data = [
            [
                d.strftime(self.cache_date_format), t
                if pd.notnull(t) else None
            ]
            for d, t in series.iteritems()
        ]
        self.json_store.save_json(key, data)

