Python read_csv() example source code
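The snippets below are collected from open-source projects and show pd.read_csv() with different separators, header handling, date parsing, chunked reading and file-like inputs. As a baseline, here is a minimal sketch; the file name data.csv and the date column are hypothetical, not taken from any of the projects:

import pandas as pd

# Read a comma-separated file with a header row; parse the "date" column
# as datetimes and use it as the index.
df = pd.read_csv("data.csv", sep=",", header=0, parse_dates=["date"], index_col="date")

# Large files can be streamed in chunks instead of loaded all at once.
rows = 0
for chunk in pd.read_csv("data.csv", chunksize=10000):
    rows += len(chunk)
print(rows)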

eval_word_sim.py (project: KATE, author: hugochan)
def calc_word_sim(model, eval_file):
    df = pd.read_csv(eval_file, sep=',', header=0) # eval dataset
    col1, col2, score = df.columns.values
    model_vocab = model.vocab.keys()
    ground = []
    sys = []
    for idx, row in df.iterrows():
        if row[col1] in model_vocab and row[col2] in model_vocab:
            ground.append(float(row[score]))
            sys.append(model.similarity(row[col1], row[col2]))

    # compute Spearman's rank correlation coefficient (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
    print(sys)
    # import pdb;pdb.set_trace()
    corr, p_val = stats.spearmanr(sys, ground)
    logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
    logger.info("correlation: %s" % corr)
    return corr, p_val
tsplot.py (project: rca-evaluation, author: sieve-microservices)
def draw(path, srv):
     filename = os.path.join(path, srv["preprocessed_filename"])
     df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
     bins = defaultdict(list)
     for i, col in enumerate(df.columns):
         serie = df[col].dropna()
         if pd.algos.is_monotonic_float64(serie.values, False)[0]:
             serie = serie.diff()[1:]
         p_value = adfuller(serie, autolag='AIC')[1]
         if math.isnan(p_value): continue
         nearest = 0.05 * round(p_value/0.05)
         bins[nearest].append(serie)
     for bin, members in bins.items():
         series = [serie.name for serie in members]
         if len(members) <= 10:
             columns = series
         else:
             columns = random.sample(series, 10)

         subset = df[columns]
         name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
         print(name)
         axes = subset.plot(subplots=True)
         plt.savefig(os.path.join(path, name))
         plt.close("all")
test_addepar.py (project: pyaddepar, author: lobnek)
def test_addepar2frame(self):
        r = {'meta': {'columns': [{'key': 'node_id', 'display_name': 'Entity ID', 'output_type': 'Word'},
                                  {'key': '_custom_13_custodian_name_166730', 'display_name': '15. Custodian Name', 'output_type': 'Word'},
                                  {'key': '_custom_15_reference_currency_165485', 'display_name': '17. Reference Currency', 'output_type': 'Currency'},
                                  {'key': '_custom_16_lwm_risk_profile_114480', 'display_name': '18. LWM Risk Profile', 'output_type': 'Word'},
                                  {'key': '_custom_23_lwm_aum_type_293536', 'display_name': '23. LWM - AUM Type', 'output_type': 'Word'},
                                  {'key': 'inception_event_date', 'display_name': 'Inception Date', 'output_type': 'Date'}],
                      'groupings': [{'key': 'top_level_owner', 'display_name': 'Top Level Owner'}]},
             'data': {'type': 'portfolio_views', 'attributes':
                 {'total': {'name': 'Total', 'columns':
                                {'_custom_15_reference_currency_165485': None, 'inception_event_date': '2013-12-31', '_custom_23_lwm_aum_type_293536': None, '_custom_16_lwm_risk_profile_114480': None, '_custom_13_custodian_name_166730': None, 'node_id': None},
                            'children': [{'entity_id': 1146188, 'name': 'A', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-10-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'X', 'node_id': 1146188}, 'children': []},
                                         {'entity_id': 1231399, 'name': 'B', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-09-21', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'Y', 'node_id': 1231399}, 'children': []},
                                         {'entity_id': 1511499, 'name': 'C', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2017-03-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Conservative', '_custom_13_custodian_name_166730': 'Z', 'node_id': 1511499}, 'children': []},
                                        ]}}, 'links': {'self': '/v1/portfolio_views/null'}}}

        pdt.assert_frame_equal(addepar2frame(r), pd.read_csv("/pyaddepar/test/resources/frame.csv", parse_dates=True), check_dtype=False)
data2tensor.py (project: deep-summarization, author: harpribot)
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the empty string "" as the last word of the vocabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = ""
kitehistory.py (project: kiteHistory, author: mr-karan)
def plot_csv(stock_data, symbol):
    """
    params:
        - stock_data(list) : list of dict objects containing stock data
        - symbol(str) : stock symbol; used for the CSV/HTML file names and plot legend.
    """

    try:
        df = pd.read_csv('{}.csv'.format(symbol))

    except FileNotFoundError:
        write_to_csv(stock_data, symbol)
        df = pd.read_csv('{}.csv'.format(symbol))

    p1 = figure(x_axis_type="datetime", title="Stock Closing Price")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'

    p1.line(pd.to_datetime(df['date']), list(df['close']),
            color='#A6CEE3', legend=symbol)
    output_file("{}.html".format(symbol), title="Stock Closing Prices")

    show(p1)  # open a browser
table.py (project: IgDiscover, author: NBISweden)
def fix_columns(df):
    """
    Changes DataFrame in-place
    """
    # Convert all string columns to str to avoid a PerformanceWarning
    for col in _STRING_COLUMNS:
        if col not in df:
            continue
        # Empty strings have been set to NaN by read_csv. Replacing
        # them with the empty string avoids problems with groupby,
        # which ignores NaN values.
        df[col].fillna('', inplace=True)
        df[col] = df[col].astype('str')
    # Columns that have any NaN values in them cannot be converted to
    # int due to a numpy limitation.
    for col in _INTEGER_COLUMNS:
        if col not in df.columns:
            continue
        if all(df[col].notnull()):
            df[col] = df[col].astype(int)
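As the comment above notes, a numpy-backed column containing NaN cannot hold a plain int dtype, which is why fix_columns() only casts fully non-null columns. A small illustrative sketch (not part of IgDiscover) using pandas' nullable Int64 dtype as an alternative:

import pandas as pd

s = pd.Series([1, 2, None])
print(s.dtype)            # float64: the NaN forces a float dtype
print(s.astype("Int64"))  # nullable integer dtype keeps the missing value as <NA>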
multidiscover.py (project: IgDiscover, author: NBISweden)
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warning('Table read from %r is empty after filtering out sequences with database diff below %s.', path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                #shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
discoverj.py (project: IgDiscover, author: NBISweden)
def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
    # Use only records that have a chance of reaching the required min_count
    records = {info.sequence: info for info in candidates if info.max_count >= min_count}

    # Count full-text occurrences in the genomic_sequence, circumventing
    # inaccurate IgBLAST alignment boundaries
    # TODO limit the search to the gene region (especially for D genes)
    # Speed up search by looking for most common sequences first
    search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
    cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
    for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
        chunk = chunk[chunk[other_errors] == 0]
        for row in chunk.itertuples():
            for needle in search_order:
                if needle in row.genomic_sequence:
                    record = records[needle]
                    record.count += 1
                    record.other_genes.add(getattr(row, other_gene))
                    record.cdr3s.add(row.CDR3_nt)
                    if merge:
                        break
    return records.values()
filter.py (project: IgDiscover, author: NBISweden)
def main(args):
    n = 0
    first = True
    written = 0
    stats = FilteringStatistics()
    for chunk in pd.read_csv(args.table, chunksize=10000, sep='\t'):
        fix_columns(chunk)
        n += len(chunk)
        filtered, chunk_stats = filtered_table(chunk, v_gene_coverage=args.v_coverage,
            j_gene_coverage=args.j_coverage, v_gene_evalue=args.v_evalue)
        stats += chunk_stats
        print(filtered.to_csv(sep='\t', index=False, header=first), end='')
        first = False
        written += len(filtered)

    logger.info('%s rows in input table', stats.n)
    logger.info('%s rows have both V and J assignment', stats.vjassigned)
    logger.info('%s of those do not have a stop codon', stats.stop)
    logger.info('%s of those have an E-value of at most %s', stats.v_evalue, args.v_evalue)
    logger.info('%s of those cover the V gene by at least %s%%', stats.v_coverage, args.v_coverage)
    logger.info('%s of those cover the J gene by at least %s%%', stats.j_coverage, args.j_coverage)
    logger.info('%d rows written', written)
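The chunksize pattern above streams the filtered rows to stdout with the header written only for the first chunk; the same idea can write straight to a file. A minimal sketch with hypothetical file names, not part of IgDiscover:

import pandas as pd

first = True
with open("filtered.tsv", "w") as out:
    for chunk in pd.read_csv("big_table.tsv", sep="\t", chunksize=10000):
        chunk.to_csv(out, sep="\t", index=False, header=first)
        first = False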
treasuries.py (project: zipline-chinese, author: zhanghan1990)
def get_treasury_data(start_date, end_date):
    return pd.read_csv(
        "http://www.federalreserve.gov/datadownload/Output.aspx"
        "?rel=H15"
        "&series=bf17364827e38702b42a58cf8eaa3f78"
        "&lastObs="
        "&from="  # An unbounded query is ~2x faster than specifying dates.
        "&to="
        "&filetype=csv"
        "&label=omit"
        "&layout=seriescolumn"
        "&type=package",
        skiprows=1,  # First row is a useless header.
        parse_dates=['Time Period'],
        na_values=['ND'],  # Presumably this stands for "No Data".
        index_col=0,
    ).loc[
        start_date:end_date
    ].dropna(
        how='all'
    ).rename(
        columns=parse_treasury_csv_column
    ).tz_localize('UTC') * 0.01  # Convert from 2.57% to 0.0257.
mongodb.py (project: zipline-chinese, author: zhanghan1990)
def storageindex(self):
        # get the file list
        onlyfiles = [ f for f in listdir(self.indexdata) if isfile(join(self.indexdata,f)) ]
        # read each file with pandas
        for f in onlyfiles:
            df = pd.read_csv(self.indexdata+"/"+f)
            s=f.split('.')
            name = s[0][2:8]
            records = json.loads(df.T.to_json()).values()
            for row in records:
                row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d")
            print(name)
            self.index[name].insert_many(records)



    # store the stock pool into the database
xlsx_usage.py (project: table-compositor, author: InvestmentSystems)
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
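read_csv() accepts file-like objects as well as paths, which is what makes passing zf.open(zi) work above. A minimal sketch with an in-memory buffer (the sample rows are invented):

import io
import pandas as pd

buf = io.StringIO("name,gender,count\nMary,F,7065\nAnna,F,2604\n")
df = pd.read_csv(buf)
print(df)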
html_usage.py (project: table-compositor, author: InvestmentSystems)
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
soccerstan.py (project: soccerstan, author: Torvaney)
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
                'HomeTeam': 'home_team',
                'AwayTeam': 'away_team',
                'FTHG': 'home_goals',
                'FTAG': 'away_goals'
            })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )

    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)


    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map
main_load_data_DiDi.py (project: didi_competition, author: Heipiao)
def cluster_map_sheet_pre():
    print("------ load cluster_map data ----------")
    cluster_map_sheet_path = os.path.join(LOAD_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("load data from: ", cluster_map_sheet_path)
    save_path = os.path.join(SAVE_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("save data to: ", save_path)
    file = "cluster_map"

    cluster_sheet = os.path.join(cluster_map_sheet_path, file)
    data = pd.read_csv(cluster_sheet, header=None)
    data.columns = ["raw"]
    data["district_hash"] = data["raw"].map(lambda x: x.split("\t")[0])
    data["district_map"] = data['raw'].map(lambda x: x.split("\t")[1])

    del data["raw"]

    save_df_to_file(data, save_path, file)
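The function above reads the tab-separated file as a single raw column and splits it by hand. Assuming the file really is a plain two-column TSV, read_csv could split it directly; a sketch reusing the column names from the code above:

import pandas as pd

data = pd.read_csv("cluster_map", sep="\t", header=None,
                   names=["district_hash", "district_map"])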


# handle the order_info sheet
operate_hash.py (project: didi_competition, author: Heipiao)
def create_hash_district_map_dict():
    file = "cluster_map.csv"
    district_hash_map_path = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, file)

    hash_data = pd.read_csv(district_hash_map_path)
    ## convert the dataframe into dict
    hash_map_rule = dict(zip(hash_data.district_hash, hash_data.district_map))

    # print(type(hash_map_rule))

    saved_file = "cluster_map.pickle"
    map_save_file = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, saved_file)
    ## save into same dir as file
    with open(map_save_file, "wb") as f:
        pickle.dump(hash_map_rule, f)

    #print(hash_map_rule)

# map the district features in the input data_frame into value
test_adapter.py (project: rosie, author: datasciencebr)
def test_prepare_dataset(self, fetch, chamber_of_deputies):
        """
        * Rename columns.
        * Make `document_type` a category column.
        * Rename values for `category`.
        * Create `is_party_expense` column.
        """
        dataset = self.subject.dataset
        self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns)))
        document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
        self.assertEqual(document_types,
                         dataset['document_type'].cat.categories.tolist())
        fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz'))
        meal_rows = fixture \
            .query('subquota_description == "Congressperson meal"').index
        self.assertEqual(['Meal'],
                         dataset.loc[meal_rows, 'category'].unique().tolist())
        party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index
        self.assertEqual([True],
                         dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist())
sample.py (project: lung-cancer-detector, author: YichenGong)
def _load_sets(self):
        print("Loading datasets")

        train_patients = pd.read_csv("data/stage1/" + "stage1_labels.csv")

        for idx, row in train_patients.iterrows():
            if self._check_sample_exists(row['id']):
                self._test_set.append(row['id'])

        for idx, row in train_patients.iterrows():
            if self._check_sample_exists(row['id']):
                self._train_set.append([row['id'], row['cancer']])

        #Create permutation for random loading
        self.shuffle()

        print("Loading datasets: Done!")
stage1.py (project: lung-cancer-detector, author: YichenGong)
def _load_sets(self):
        print("Loading datasets")

        train_patients = pd.read_csv(os.path.join(self._directory, "stage1_labels.csv"))
        test_patients = pd.read_csv(os.path.join(self._directory, "stage1_sample_submission.csv"))

        for idx, row in test_patients.iterrows():
            self._test_set.append(row['id'])

        for idx, row in train_patients.iterrows():
            self._train_set.append([row['id'], row['cancer']])

        #Create permutation for random loading
        self.shuffle()

        print("Loading datasets: Done!")
wordgen_samples.py (project: saapy, author: ashapochka)
def doctable(ctx):
    df = pd.read_csv('./docs/flight-options.csv')

    # open an existing document
    doc = docx.Document('./docs/style-reference.docx')

    as_int = partial(format_decimal, format='#')
    as_usd = partial(format_currency, currency='USD')

    s = doc.sections[0]
    width = s.page_width - s.left_margin - s.right_margin

    doc.add_picture('./docs/diagrams_002.png', width=width)

    formatters = {
        'ticket_price': as_usd,
        'total_hours': as_int,
        'trip': as_int,
        'airline': partial(shorten_long_name, width=20),
        'selected': compose({0: 'No', 1: 'Yes'}.get, int)
    }
    add_table(df, doc, table_style='Plain Table 3', formatters=formatters)

    # save the doc
    doc.save('./docs/test.docx')
eval.py (project: kaggle-review, author: daxiongshu)
def eval(flags):
    name = flags.pred_path
    yp = pd.read_csv(name)
    classes = len([i for i in yp.columns.values if 'class' in i])
    yp = yp[['class%d'%i for i in range(1,classes+1)]].values
    myDB = personalDB(flags,name="full")
    if "stage1" in name:
        y=myDB.data['test_variants_filter']['Class']-1
    else:
        myDB.get_split()
        va = myDB.split[flags.fold][1]
        y = np.argmax(myDB.y[va],axis=1)
    if np.max(y)>classes:
        y = np.argmax(to4c(onehot_encode(y)),axis=1)
    score = cross_entropy(y,yp)
    print(name,score,'\n')
evalx.py (project: kaggle-review, author: daxiongshu)
def eval(name,clip=False,bar=0.9):
    base = pd.read_csv('../input/stage1_solution_filtered.csv')
    base['Class'] = np.argmax(base[['class%d'%i for i in range(1,10)]].values,axis=1)
    sub = pd.read_csv(name)
    #sub = pd.merge(sub,base[['ID','Class']],on="ID",how='right')
    #print(sub.head())
    y = base['Class'].values
    yp = sub[['class%d'%i for i in range(1,10)]].values
    if clip:
        yp = np.clip(yp,(1.0-bar)/8,bar)
        yp = yp/np.sum(yp,axis=1).reshape([yp.shape[0],1])
    print(name,cross_entropy(y,yp),multiclass_log_loss(y,yp))
    for i in range(9):
        y1 = y[y==i]
        yp1 = yp[y==i]
        print(i,y1.shape,cross_entropy(y1,yp1),multiclass_log_loss(y1,yp1))
cnn.py (project: kaggle-review, author: daxiongshu)
def post(self):
        if self.flags.task == "test_cnn_stage1":
            docs = self.DB.clean_doc['test_text_filter']
        elif self.flags.task == "test_cnn_stage2":
            docs = self.DB.clean_doc['stage2_test_text']
        else:
            self.mDB.get_split()
            docs = self.mDB.split[self.flags.fold][1]
        nrows = len(docs)
        p = np.zeros([nrows,9])
        for i in range(self.flags.epochs):
            if i==0:
                skiprows=None
            else:
                skiprows = nrows*i
            p = p + (pd.read_csv(self.flags.pred_path,header=None,nrows=nrows,skiprows=skiprows).values)
        p = p/self.flags.epochs
        if '_cv' in self.flags.task:
            from utils.np_utils.utils import cross_entropy
            y = np.argmax(self.mDB.y,axis=1)
            print("cross entropy", cross_entropy(y[self.mDB.split[self.flags.fold][1]],p))
        s = pd.DataFrame(p,columns=['class%d'%i for i in range(1,10)])
        s['ID'] = np.arange(nrows)+1
        s.to_csv(self.flags.pred_path.replace(".csv","_sub.csv"),index=False,float_format="%.5f")
xgb.py (project: kaggle-review, author: daxiongshu)
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s"%(path,name)))

    s = pd.concat(s,axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d'%i for i in range(1,classes+1)]].values
    y=s['real'].values
    print(cross_entropy(y,yp))
    s.to_csv("%s/cv.csv"%path,index=False)
replace.py (project: kaggle-review, author: daxiongshu)
def replace(s,n):
    seen = pd.read_csv(s)
    unseen = pd.read_csv(n)
    te = pd.read_csv('../input/stage2_test_variants.csv')
    tr = pd.read_csv('../input/training_variants')
    unseen = pd.merge(unseen,te,on='ID',how='right')
    seen = pd.merge(seen,te,on='ID',how='right')
    mask = seen.Gene.isin(tr.Gene)
    cols = ['class%d'%i for i in range(1,10)]
    seen.loc[~mask,cols] = 0

    mask = unseen.Gene.isin(tr.Gene)
    unseen.loc[mask,cols] = 0

    assert (unseen['ID']==seen['ID']).all()
    seen[cols] = seen[cols] + unseen[cols]

    seen[cols+['ID']].to_csv('mix.csv',index=False)
test_lb_split.py (project: kaggle-review, author: daxiongshu)
def test2():
    s1 = pd.read_csv('../input/test_variants')
    s3 = pd.read_csv('../input/test_variants_filter')
    s1 = pd.merge(s1,s3[['ID','Class']],on='ID',how='left').fillna(1)

    s2 = pd.read_csv('../input/stage2_test_variants.csv')
    s1 = pd.merge(s1,s2,on= ["Gene", "Variation"],how='inner')
    s1['ID'] = s1['ID_y']
    s2 = pd.merge(s1[['ID','Class']],s2,on='ID',how='right').fillna(1)
    yp = onehot_encode(s2['Class'].values-1)

    for i in range(1,10):
        s2['class%d'%i] = yp[:,i-1]
    cols = ['class%d'%i for i in range(1,10)]
    mask = s2['ID'].isin(s1['ID_y'])
    s2.loc[~mask,cols] = 0.1

    s2['ID'] = s2['ID'].astype(int)
    cols = ['ID']+['class%d'%i for i in range(1,10)]
    s2[cols].to_csv('sub.csv',index=False)
model.py (project: powerAI, author: dreameng28)
def x_label(feature_path, pred=False):
    X_list = []
    for each in feature_path:
        X = pd.read_csv(feature_paths.format(str(each)))
        X_list.append(X)
    X = pd.DataFrame(pd.concat(X_list, axis=0)).reset_index().drop('index', axis=1)
    if not pred:
        y = X[power_consumption].tolist()
        X = X.drop([record_date, user_id, power_consumption], axis=1)
        columns = X.columns
        X = X.values
        return X, y, columns
    else:
        X = X.drop([record_date, user_id], axis=1)
        columns = X.columns
        X = X.values
        return X, columns
data.py (project: GOS, author: crcresearch)
def neighbors():
    """
    Read the neighbors for each country.
    """
    neighbors_csv = pd.read_csv(csv_path("mledoze-countries.csv"), sep=';',
                                usecols=[4, 17])
    neighbors_csv.columns = ["Code", "neighbors"]
    neighbors_csv["neighbors"] = neighbors_csv["neighbors"].str.split(',')
    for row in neighbors_csv.loc[neighbors_csv.neighbors.isnull(), 'neighbors'].index:
        neighbors_csv.at[row, 'neighbors'] = []
    # Island nations are a weird exception
    neighbors_csv.loc[neighbors_csv.Code == "MDG", "neighbors"] = [["MOZ", "ZAF", "TZA"]]
    neighbors_csv.loc[neighbors_csv.Code == "TWN", "neighbors"] = [["CHN", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "AUS", "neighbors"] = [["NZL"]]
    neighbors_csv.loc[neighbors_csv.Code == "NZL", "neighbors"] = [["AUS"]]
    neighbors_csv.loc[neighbors_csv.Code == "JPN", "neighbors"] = [["TWN", "KOR", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "PHL", "neighbors"] = [["TWN", "KOR", "JPN"]]
    neighbors_csv.loc[neighbors_csv.Code == "PRI", "neighbors"] = [["DOM"]]
    neighbors_csv.loc[neighbors_csv.Code == "SGP", "neighbors"] = [["MYS", "IDN"]]
    neighbors_csv.loc[neighbors_csv.Code == "JAM", "neighbors"] = [["CUB", "DOM"]]
    return neighbors_csv
webcrawling0202.py (project: webcrawling, author: etilelab)
def loadFile(fileName):
    # Use checkFileName to validate the file name and get the CSV file to load
    outputFileName = checkFileName(fileName)

    if outputFileName != -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]
        print(company)

        print("csv File Load Success")
    else:
        print("Error csv File")

# checkFileName: validates the given file name and returns -1 if no matching
# CSV file exists; otherwise it returns the name of the CSV file to load.
webcrawling0203.py (project: webcrawling, author: etilelab)
def loadFile(fileName, analyzeValue):
    # Use checkFileName to validate the file name and get the CSV file to load
    outputFileName = checkFileName(fileName)

    if outputFileName != -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]

        print("csv File Load Success")

        if analyzeValue == 1:
            # analyze(title)
            analyze(content)

    else:
        print("Error csv File")

# checkFileName: validates the given file name and returns -1 if no matching
# CSV file exists; otherwise it returns the name of the CSV file to load.

