Python read_csv(): example source code from open-source projects

Source file: notice_dag.py (project: scrapy_projects, author: morefreeze)
def filter_data(csv_file, start_day=28, end_day=90, interest=780, state=None, **kwargs):
    f = pd.read_csv(csv_file)
    f['sub_title'] = f['sub_title'].fillna('')
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: v['period'] <= datetime.timedelta(days=20) and v['benefit'] > 6, ok_stop=True, weight=5)
    filter.install_rule(lambda v: v['benefit'] >= 8 and v['period'] < datetime.timedelta(days=230))
    filter.install_rule(lambda v: not v['sub_title'].startswith('????'))
    for row in f.iterrows():
        idx, v = row
        money = money2float(v['money'])
        period = period2timedelta(v['period'])
        # remove percent sign(%)
        benefit = float(v['expected_benefit'][:-1])
        item = {
            'title': v['title'],
            'sub_title': v['sub_title'],
            'money': money,
            'period': period,
            'benefit': benefit,
        }
        if filter.check(item):
            candidate.append(item)
    return candidate
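
The `Filter` helper used above is defined elsewhere in scrapy_projects. A minimal sketch consistent with how it is called here (install_rule takes a predicate plus optional ok_stop and weight, check reports whether an item passes) might look like this; the exact semantics are an assumption:

class Filter(object):
    """Chain of predicate rules; an item passes when no mandatory rule rejects it."""
    def __init__(self):
        self.rules = []

    def install_rule(self, pred, ok_stop=False, weight=1):
        # ok_stop: if this rule matches, accept the item immediately
        # weight: relative importance (unused in this sketch)
        self.rules.append((pred, ok_stop, weight))

    def check(self, item):
        for pred, ok_stop, _ in self.rules:
            if ok_stop:
                if pred(item):
                    return True      # shortcut acceptance
            elif not pred(item):
                return False         # a mandatory rule failed
        return True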
Source file: notice_dag.py (project: scrapy_projects, author: morefreeze)
def filter_data(csv_file, **kwargs):
    f = pd.read_csv(csv_file)
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: not v['title'].startswith('test'))
    for row in f.iterrows():
        idx, v = row
        item = {
            'title': v['title'],
        }
        if filter.check(item):
            candidate.append(item)
    return candidate


# If len(candidate) > 0, the result is sent to Slack; the text is stored as slack_txt_file
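
The Slack hand-off itself is not part of this snippet. A minimal sketch using a Slack incoming webhook; the webhook URL, the file handling, and the message format are all assumptions:

import json
import requests

def notify_slack(candidate, slack_txt_file, webhook_url):
    if not candidate:          # only notify when the filter kept something
        return
    text = '\n'.join('%s %.2f%%' % (item['title'], item['benefit'])
                     for item in candidate)
    with open(slack_txt_file, 'w') as f:   # keep a copy of what was sent
        f.write(text)
    requests.post(webhook_url, data=json.dumps({'text': text}))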
Source file: quality.py (project: ssbio, author: SBRG)
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    # note: str.strip removes a *set* of characters, so .strip('.pdb') can over-strip;
    # replace is used instead to drop the './' prefix and '.pdb' suffix
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).replace('./', '').replace('.pdb', ''))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact', 4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
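
A hypothetical call site (the file name is made up):

psqs_df = parse_psqs('psqs_results.txt')
print(psqs_df.sort_values('psqs_total').head())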
Source file: DataIO.py (project: kaggle, author: RankingAI)
def LoadFromTextFile(InputDir):

        ## raw data
        TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
        TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
        TestData['parcelid'] = TestData['ParcelId']
        TestData.drop('ParcelId', axis=1, inplace=True)
        PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir,header=0)
        for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
            if dtype == np.float64:
                PropertyData[c] = PropertyData[c].astype(np.float32)

        ## join dynamic data with static data
        TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
        TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')

        return TrainData,TestData

    ## class method, save data with pkl format
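
The pkl-saving class method that the trailing comment announces is not included in this excerpt. A minimal sketch matching that description; the method name and output file names are assumptions:

import os
import pickle

def SaveToPklFile(Data, OutputDir):
    TrainData, TestData = Data
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)
    with open('%s/train.pkl' % OutputDir, 'wb') as f:
        pickle.dump(TrainData, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open('%s/test.pkl' % OutputDir, 'wb') as f:
        pickle.dump(TestData, f, protocol=pickle.HIGHEST_PROTOCOL)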
Source file: MicrobeBotResources.py (project: scheduled-bots, author: SuLab)
def get_microbe_taxids(force_download=False):
    """
    Download the latest bacterial genome assembly summary from the NCBI genome ftp site
    and generate a pd.DataFrame of relevant data for strain items based on taxids of the bacterial reference genomes.
    :return: pandas dataframe of bacteria reference genome data
    """
    if force_download or not os.path.exists("reference_genomes.csv"):
        assembly = urllib.request.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
        df = pd.read_csv(assembly[0], sep="\t", dtype=object, skiprows=1, header=0)
        df = df[df['refseq_category'].isin(['reference genome', 'representative genome'])]

        all_tax_wdid = id_mapper('P685')

        df['wdid'] = df['taxid'].apply(lambda x: all_tax_wdid.get(x, None))
        df = df.rename(columns={'# assembly_accession': 'assembly_accession'})
        df.to_csv('reference_genomes.csv', sep="\t")
        df.taxid = df.taxid.astype(int)
        return df
    else:  # use predownloaded and parsed flatfile
        df = pd.read_csv("reference_genomes.csv", sep="\t", dtype=object, index_col=0)
        df.taxid = df.taxid.astype(int)
        return df
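
A hypothetical call, using only columns the function itself creates or renames:

ref_df = get_microbe_taxids()
# wdid is None for taxids with no Wikidata item (property P685)
print(ref_df[['assembly_accession', 'taxid', 'wdid']].head())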
Source file: ChromosomeBot.py (project: scheduled-bots, author: SuLab)
def get_assembly_report(self, taxid):
        if self.ass_sum is None:
            self.get_assembly_summaries()
        df = self.ass_sum.query("taxid == {} & refseq_category == 'reference genome'".format(taxid))
        if len(df) == 0:
            # try "representative genome" (needed for mouse and rat)
            df = self.ass_sum.query("taxid == {} & refseq_category == 'representative genome'".format(taxid))
        if len(df) != 1:
            raise ValueError("unknown reference: {}".format(df))
        print(df)
        ftp_path = list(df.ftp_path)[0]
        assembly = os.path.split(ftp_path)[1]
        url = os.path.join(ftp_path, assembly + "_assembly_report.txt")
        print(url)
        # read the column names from the file
        table = request.urlopen(request.Request(url)).read().decode()
        names = [x for x in table.split("\n") if x.startswith("#")][-1].strip().replace("# ", "").split("\t")
        self.chr_df[taxid] = pd.read_csv(StringIO(table), sep="\t", names=names, comment='#')
        self.chr_df[taxid] = self.chr_df[taxid].rename(columns={'Sequence-Name': 'SequenceName', 'Sequence-Role': 'SequenceRole',
                                                                'Assigned-Molecule': 'AssignedMolecule',
                                                                'Assigned-Molecule-Location/Type': 'AssignedMoleculeLocationType',
                                                                'GenBank-Accn': 'GenBankAccn', 'RefSeq-Accn': 'RefSeqAccn',
                                                                'UCSC-style-name': 'UCSCstylename'})
        #print(self.chr_df[taxid].query("SequenceRole == 'assembled-molecule'"))
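
A hypothetical call, assuming a ChromosomeBot instance whose get_assembly_summaries() fills self.ass_sum (9606 is the human taxid):

bot = ChromosomeBot()
bot.get_assembly_report(9606)
chroms = bot.chr_df[9606].query("SequenceRole == 'assembled-molecule'")
print(chroms[['SequenceName', 'RefSeqAccn']].head())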
Source file: preprocess_birds.py (project: how_to_convert_text_to_images, author: llSourcell)
def load_bbox(data_dir):
    bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
    df_bounding_boxes = pd.read_csv(bbox_path,
                                    delim_whitespace=True,
                                    header=None).astype(int)
    #
    filepath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
    df_filenames = pd.read_csv(filepath, delim_whitespace=True, header=None)
    filenames = df_filenames[1].tolist()
    print('Total filenames: ', len(filenames), filenames[0])
    #
    filename_bbox = {img_file[:-4]: [] for img_file in filenames}
    numImgs = len(filenames)
    for i in xrange(0, numImgs):
        # bbox = [x-left, y-top, width, height]
        bbox = df_bounding_boxes.iloc[i][1:].tolist()

        key = filenames[i][:-4]
        filename_bbox[key] = bbox
    #
    return filename_bbox
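
A hypothetical call; the data directory is whatever folder contains CUB_200_2011/:

filename_bbox = load_bbox('Data/birds')   # path is an assumption
some_key = next(iter(filename_bbox))
print(some_key, filename_bbox[some_key])  # [x-left, y-top, width, height]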
Source file: sample_item_file.py (project: evaluation_tools, author: JSALT-Rosetta)
def get_sample_item_file(wav_file_names_sample, item_file, output):
    """
    From a sampled dataset, get an item file for running an ABX task
    Parameters
    ----------
    item file : text file containing at least as columns : #filename, onset, offset, 
    #phoneme and context and side information such as image ID
    item_file : string,
         path to the item file of the whole dataset
    output: string, 
        path where the sample item file will be stored
    """
    wav_names = []
    temp = np.load(wav_file_names_sample)
    for s in temp:
        wav_names.append(s.split(".")[0])

    df = pd.read_csv(item_file, sep="\t", index_col="#filename")
    df_sample = df.loc[wav_names]

    df_sample.to_csv(output, sep="\t", header=True, index=False)

    return df_sample
Source file: plots_and_stats.py (project: BadParser, author: stanojevic)
def meansOfMeans(datafile):

    df = pd.read_csv(datafile, delimiter=",")
    df = df.loc[df["swapsEager"]>0]
    grouped = df.groupby("words", as_index=True)
    idx = grouped.groups.keys()

    all_means=grouped.mean()
    mean_of_means = all_means.mean()
    std_of_means = all_means.std()

    # Print results in LaTeX format:
    print "& Average number of swaps & Average jump size \\\\"
    print "\\hline"
    for laziness in ("Eager", "Lazy", "Lazier"):
        print "{} & {}({}) & {}({})\\\\".format(laziness, \
                                                mean_of_means["swaps%s"%laziness], \
                                                std_of_means["swaps%s"%laziness], \
                                                mean_of_means["avgAltBlockSize%s"%laziness], \
                                                std_of_means["avgAltBlockSize%s"%laziness])
Source file: sm2hdf.py (project: pyrsss, author: butala)
def read_sm_csv(csv_fname):
    """
    Parse the SuperMAG CSV format data record *csv_fname*. For each
    station, store the information in pandas
    :class:`DataFrame`. Return a mapping between the station
    identifier and data frame.
    """
    df = PD.read_csv(csv_fname,
                     header=0,
                     parse_dates=[0],
                     date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'),
                     index_col=0)
    df_map = {name: group for name, group in df.groupby('IAGA')}
    for df in df_map.itervalues():
        del df['IAGA']
        df.rename(columns={'N': 'B_N',
                           'E': 'B_E',
                           'Z': 'B_Z'},
                  inplace=True)
    return df_map
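
A hypothetical call (the station code 'BOU' is just an example of an IAGA identifier):

df_map = read_sm_csv('supermag_data.csv')
print(df_map['BOU'][['B_N', 'B_E', 'B_Z']].describe())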
Source file: test_built_models.py (project: bambi, author: bambinos)
def crossed_data():
    '''
    Random effects:
    10 subjects, 12 items, 5 sites
    Subjects crossed with items, nested in sites
    Items crossed with sites

    Fixed effects:
    A continuous predictor, a numeric dummy, and a three-level category
    (levels a,b,c)

    Structure:
    Subjects nested in dummy (e.g., gender), crossed with threecats
    Items crossed with dummy, nested in threecats
    Sites partially crossed with dummy (4/5 see a single dummy, 1/5 sees both
    dummies)
    Sites crossed with threecats
    '''
    from os.path import dirname, join
    data_dir = join(dirname(__file__), 'data')
    data = pd.read_csv(join(data_dir, 'crossed_random.csv'))
    return data
Source file: calcu_3year_average_pe.py (project: chinese-stock-Financial-Index, author: lfh2016)
def calcu_all_stocks_3year_average_profit(year):  # compute the 3-year average profit for every stock
    path = os.path.join(current_folder, '????%s.csv' % today)
    if not os.path.exists(path):
        data = ts.get_stock_basics()
        lie = ['??', '??', '??', '???', '????', '???',
               '???(?)', '????', '????', '???', '?????', '????', '????',
               '???', '????', '????', '?????', '????(%)', '????(%)',
               '???(%)', '????(%)', '????']
        data.columns = lie
        data.index.names = ['??']
        data.to_csv(path, encoding='utf-8')

    data = pd.read_csv(path, encoding='utf-8', index_col=0)
    # print(data)
    data['????'] = 0
    for index, row in data.iterrows():
        try:
            data.loc[index, '????'] = calcu_3year_average_profit('%06d' % index, year)
        except Exception as e:
            print(e)
            data.loc[index, '????'] = 0

        print('??%s' % index)
    data.to_csv(os.path.join(current_folder, '3????????????%s.csv' % today), encoding='utf-8')
Source file: views.py (project: ml-rest, author: apinf)
def save_csv_as_dataframe(request):
    print("Save CSV as DataFrame")

    if request.POST:
        # Get CSV URL from post; default to None if not provided
        csv_url = request.POST.get('csv_url', None)

        if csv_url:
            csv_data = pd.read_csv(csv_url)

            print(csv_data)

            # Create Data Frame instance
            data = Data()

            # Add CSV Data to data_frame field
            data.data_frame = csv_data
            data.source_url = csv_url

            # Save Data Frame
            data.save()
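
The Data model is imported from elsewhere in ml-rest. Since the view assigns a DataFrame straight to data_frame, the field must be able to serialize arbitrary Python objects; a minimal sketch using django-picklefield (field choices are assumptions):

from django.db import models
from picklefield.fields import PickledObjectField

class Data(models.Model):
    data_frame = PickledObjectField()   # stores the pickled DataFrame
    source_url = models.URLField()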
Source file: sequenceNet.py (project: deep-summarization, author: harpribot)
def store_test_predictions(self, prediction_id='_final'):
        """
        Stores the test predictions in a CSV file

        :param prediction_id: A simple id appended to the name of the summary for uniqueness
        :return: None
        """
        # prediction id is usually the step count
        print 'Storing predictions on Test Data...'
        review = []
        true_summary = []
        generated_summary = []
        for i in range(self.test_size):
            if not self.checkpointer.is_output_file_present():
                review.append(self._index2sentence(self.test_review[i]))
                true_summary.append(self._index2sentence(self.true_summary[i]))
            if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
                generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
            else:
                generated_summary.append('')

        prediction_nm = 'generated_summary' + prediction_id
        if self.checkpointer.is_output_file_present():
            df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
            df[prediction_nm] = np.array(generated_summary)
        else:
            df = pd.DataFrame()
            df['review'] = np.array(review)
            df['true_summary'] = np.array(true_summary)
            df[prediction_nm] = np.array(generated_summary)
        df.to_csv(self.checkpointer.get_result_location(), index=False)
        print 'Stored the predictions. Moving Forward'
        if prediction_id == '_final':
            print 'All done. Exiting..'
            print 'Exited'
Source file: metric.py (project: deep-summarization, author: harpribot)
def load_result(self, result_file):
        """
        Load a results CSV and scrape the reference and hypothesis summaries from it.

        :param result_file: path to a results CSV, such as the one written by store_test_predictions() above
        :return: None
        """
        self.result = pd.read_csv(result_file, header=0)
        self.__scrape_reference()
        self.__scrape_all_hypotheses()
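
The two private scrapers are not shown. A plausible sketch, inferred from the columns that store_test_predictions() in sequenceNet.py above writes (the real implementation may differ):

def __scrape_reference(self):
    self.reference = self.result['true_summary'].tolist()

def __scrape_all_hypotheses(self):
    hyp_cols = [c for c in self.result.columns if c.startswith('generated_summary')]
    self.hypotheses = {c: self.result[c].tolist() for c in hyp_cols}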
Source file: round.py (project: numerai, author: gansanay)
def training_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.train_file_name))
Source file: round.py (project: numerai, author: gansanay)
def test_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.test_file_name))
Source file: round.py (project: numerai, author: gansanay)
def sorted_training_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.sorted_file_name))
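
All three accessors assume the CSVs ship inside the numerai.data package and that pkg_resources is imported at module level:

from pkg_resources import resource_filename
# hypothetical usage, given a Round instance r with the *_file_name attributes set:
# train_df = r.training_set()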
Source file: dal.py (project: toll_road, author: idosekely)
def _reader(self):
        if not self.does_exist():
            return
        dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d %H:%M:%S.%f')
        # parse_dates must be a list of column names, not a bare string
        df = pd.read_csv(self.data_file, parse_dates=['timestamp'], index_col='timestamp', date_parser=dateparse)
        return df
Source file: data.py (project: DREAM, author: LaceyChen17)
def get_orders(self):
        '''
            get order context information
        '''
        orders = pd.read_csv(self.raw_data_dir + 'orders.csv')
        orders = orders.fillna(0.0)
        orders['days'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
        orders['days_last'] = orders.groupby(['user_id'])['days'].transform(max)
        orders['days_up_to_last'] = orders['days_last'] - orders['days']
        del orders['days_last']
        del orders['days']
        return orders
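
The days arithmetic is easiest to see on a toy frame (values are made up):

import pandas as pd

toy = pd.DataFrame({'user_id': [1, 1, 1],
                    'days_since_prior_order': [0.0, 10.0, 5.0]})
toy['days'] = toy.groupby('user_id')['days_since_prior_order'].cumsum()   # 0, 10, 15
toy['days_up_to_last'] = toy.groupby('user_id')['days'].transform(max) - toy['days']
print(toy['days_up_to_last'].tolist())  # [15.0, 5.0, 0.0]: days before each user's last order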

