def filter_data(csv_file, start_day=28, end_day=90, interest=780, state=None, **kwargs):
    f = pd.read_csv(csv_file)
    f['sub_title'] = f['sub_title'].fillna('')
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: v['period'] <= datetime.timedelta(days=20) and v['benefit'] > 6, ok_stop=True, weight=5)
    filter.install_rule(lambda v: v['benefit'] >= 8 and v['period'] < datetime.timedelta(days=230))
    filter.install_rule(lambda v: not v['sub_title'].startswith('????'))
    for row in f.iterrows():
        idx, v = row
        money = money2float(v['money'])
        period = period2timedelta(v['period'])
        # remove percent sign (%)
        benefit = float(v['expected_benefit'][:-1])
        item = {
            'title': v['title'],
            'sub_title': v['sub_title'],
            'money': money,
            'period': period,
            'benefit': benefit,
        }
        if filter.check(item):
            candidate.append(item)
    return candidate
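# Filter, money2float, and period2timedelta are not defined in this snippet. Below is a
# minimal sketch of what a rule-based Filter with install_rule()/check() might look like;
# the semantics chosen for ok_stop (accept immediately when that rule passes) and weight
# (stored but unused) are assumptions, not the original implementation.
class Filter(object):
    """Hypothetical rule container used by filter_data()."""

    def __init__(self):
        self._rules = []

    def install_rule(self, rule, ok_stop=False, weight=1):
        # rule is a callable taking an item dict and returning True/False
        self._rules.append((rule, ok_stop, weight))

    def check(self, item):
        for rule, ok_stop, _weight in self._rules:
            if rule(item):
                if ok_stop:
                    return True   # assumed: a passing ok_stop rule accepts the item right away
            elif not ok_stop:
                return False      # a failing mandatory rule rejects the item
        return True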
def filter_data(csv_file, **kwargs):
    f = pd.read_csv(csv_file)
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: not v['title'].startswith('test'))
    for row in f.iterrows():
        idx, v = row
        item = {
            'title': v['title'],
        }
        if filter.check(item):
            candidate.append(item)
    return candidate
# If len(candidate) > 0, the candidates are sent to Slack; the message text is stored in slack_txt_file
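# The Slack delivery mentioned above is not part of the snippet. A minimal sketch of that
# step, assuming a Slack incoming-webhook URL; notify_slack, webhook_url, and slack_txt_file
# are illustrative names, and requests is an assumed dependency.
import requests

def notify_slack(candidates, webhook_url, slack_txt_file):
    """Format candidate items, save the text to slack_txt_file, and post it to a webhook."""
    if not candidates:
        return
    text = "\n".join(
        "{title} {money} {period} {benefit}%".format(**item) for item in candidates
    )
    with open(slack_txt_file, "w") as fh:
        fh.write(text)
    requests.post(webhook_url, json={"text": text})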
def parse_psqs(psqs_results_file):
    """Parse a PSQS results file and return a pandas DataFrame of the results.

    Args:
        psqs_results_file: Path to the PSQS results file

    Returns:
        Pandas DataFrame: Summary of the PSQS results
    """
    # TODO: generalize column names for all results, save as dict instead
    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    # note: str.strip() removes any of the listed characters from both ends, not a literal prefix/suffix
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact', 4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x) == 4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x) > 4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]
    return psqs_results
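# A hypothetical call, assuming a tab-separated PSQS results file named psqs_results.txt:
psqs_df = parse_psqs('psqs_results.txt')  # file name is an assumed example
print(psqs_df.sort_values('psqs_total').head())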
def LoadFromTextFile(InputDir):
    ## raw data
    TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
    TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
    TestData['parcelid'] = TestData['ParcelId']
    TestData.drop('ParcelId', axis=1, inplace=True)
    PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir, header=0)
    for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
        if dtype == np.float64:
            PropertyData[c] = PropertyData[c].astype(np.float32)
    ## join dynamic data with static data
    TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
    TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')
    return TrainData, TestData
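# A hypothetical call, assuming the competition files (train_2016_v2.csv,
# sample_submission.csv, properties_2016.csv) live in ./input:
TrainData, TestData = LoadFromTextFile('./input')  # directory is an assumed example
print(TrainData.shape)
print(TestData.shape)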
## class method: save data in pkl format
def get_microbe_taxids(force_download=False):
    """
    Download the latest bacterial genome assembly summary from the NCBI genome ftp site
    and generate a pd.DataFrame of relevant data for strain items based on taxids of the bacterial reference genomes.
    :return: pandas dataframe of bacteria reference genome data
    """
    if force_download or not os.path.exists("reference_genomes.csv"):
        assembly = urllib.request.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
        df = pd.read_csv(assembly[0], sep="\t", dtype=object, skiprows=1, header=0)
        df = df[df['refseq_category'].isin(['reference genome', 'representative genome'])]
        all_tax_wdid = id_mapper('P685')
        df['wdid'] = df['taxid'].apply(lambda x: all_tax_wdid.get(x, None))
        df = df.rename(columns={'# assembly_accession': 'assembly_accession'})
        df.to_csv('reference_genomes.csv', sep="\t")
        df.taxid = df.taxid.astype(int)
        return df
    else:  # use predownloaded and parsed flatfile
        df = pd.read_csv("reference_genomes.csv", sep="\t", dtype=object, index_col=0)
        df.taxid = df.taxid.astype(int)
        return df
def get_assembly_report(self, taxid):
    if self.ass_sum is None:
        self.get_assembly_summaries()
    df = self.ass_sum.query("taxid == {} & refseq_category == 'reference genome'".format(taxid))
    if len(df) == 0:
        # try "representative genome" (needed for mouse and rat)
        df = self.ass_sum.query("taxid == {} & refseq_category == 'representative genome'".format(taxid))
    if len(df) != 1:
        raise ValueError("unknown reference: {}".format(df))
    print(df)
    ftp_path = list(df.ftp_path)[0]
    assembly = os.path.split(ftp_path)[1]
    url = os.path.join(ftp_path, assembly + "_assembly_report.txt")
    print(url)
    # read the column names from the file
    table = request.urlopen(request.Request(url)).read().decode()
    names = [x for x in table.split("\n") if x.startswith("#")][-1].strip().replace("# ", "").split("\t")
    self.chr_df[taxid] = pd.read_csv(StringIO(table), sep="\t", names=names, comment='#')
    self.chr_df[taxid] = self.chr_df[taxid].rename(columns={'Sequence-Name': 'SequenceName', 'Sequence-Role': 'SequenceRole',
                                                            'Assigned-Molecule': 'AssignedMolecule',
                                                            'Assigned-Molecule-Location/Type': 'AssignedMoleculeLocationType',
                                                            'GenBank-Accn': 'GenBankAccn', 'RefSeq-Accn': 'RefSeqAccn',
                                                            'UCSC-style-name': 'UCSCstylename'})
    # print(self.chr_df[taxid].query("SequenceRole == 'assembled-molecule'"))
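# The column names above come from the last '#' comment line of the assembly report. A small
# self-contained illustration of that parsing step, using a made-up two-line report rather
# than a real NCBI file:
from io import StringIO
import pandas as pd

table = (
    "# Some header text\n"
    "# Sequence-Name\tSequence-Role\tGenBank-Accn\n"
    "chr1\tassembled-molecule\tCM000001.1\n"
)
# The last '#' line holds the column names once the leading '# ' is removed.
names = [x for x in table.split("\n") if x.startswith("#")][-1].strip().replace("# ", "").split("\t")
df = pd.read_csv(StringIO(table), sep="\t", names=names, comment="#")
print(names)  # ['Sequence-Name', 'Sequence-Role', 'GenBank-Accn']
print(df)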
# preprocess_birds.py (project: how_to_convert_text_to_images, author: llSourcell)
def load_bbox(data_dir):
    bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
    df_bounding_boxes = pd.read_csv(bbox_path,
                                    delim_whitespace=True,
                                    header=None).astype(int)
    #
    filepath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
    df_filenames = pd.read_csv(filepath, delim_whitespace=True, header=None)
    filenames = df_filenames[1].tolist()
    print('Total filenames: ', len(filenames), filenames[0])
    #
    filename_bbox = {img_file[:-4]: [] for img_file in filenames}
    numImgs = len(filenames)
    for i in xrange(0, numImgs):
        # bbox = [x-left, y-top, width, height]
        bbox = df_bounding_boxes.iloc[i][1:].tolist()
        key = filenames[i][:-4]
        filename_bbox[key] = bbox
    #
    return filename_bbox
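# A hypothetical lookup, assuming the CUB_200_2011 archive has been extracted under ./data;
# keys are image paths without the '.jpg' extension, values are [x-left, y-top, width, height].
filename_bbox = load_bbox('./data')  # directory is an assumed example
some_key = next(iter(filename_bbox))
print('{}: {}'.format(some_key, filename_bbox[some_key]))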
def get_sample_item_file(wav_file_names_sample, item_file, output):
    """
    From a sampled dataset, get an item file for running an ABX task

    The item file is a text file containing at least the columns #filename, onset,
    offset, #phoneme and context, plus side information such as image ID.

    Parameters
    ----------
    wav_file_names_sample : string,
        path to the .npy file listing the sampled wav file names
    item_file : string,
        path to the item file of the whole dataset
    output : string,
        path where the sample item file will be stored
    """
    wav_names = []
    temp = np.load(wav_file_names_sample)
    for s in temp:
        wav_names.append(s.split(".")[0])
    df = pd.read_csv(item_file, sep="\t", index_col="#filename")
    df_sample = df.loc[wav_names]
    df_sample.to_csv(output, sep="\t", header=True, index=False)
    return df_sample
def meansOfMeans(datafile):
    df = pd.read_csv(datafile, delimiter=",")
    df = df.loc[df["swapsEager"] > 0]
    grouped = df.groupby("words", as_index=True)
    idx = grouped.groups.keys()
    all_means = grouped.mean()
    mean_of_means = all_means.mean()
    std_of_means = all_means.std()
    # Print in latex format:
    print "& Average number of swaps & Average jump size \\\\"
    print "\hline"
    for laziness in ("Eager", "Lazy", "Lazier"):
        print "{} & {}({}) & {}({})\\\\".format(laziness,
            mean_of_means["swaps%s" % laziness],
            std_of_means["swaps%s" % laziness],
            mean_of_means["avgAltBlockSize%s" % laziness],
            std_of_means["avgAltBlockSize%s" % laziness])
def read_sm_csv(csv_fname):
    """
    Parse the SuperMAG CSV format data record *csv_fname*. For each
    station, store the information in a pandas
    :class:`DataFrame`. Return a mapping between the station
    identifier and data frame.
    """
    df = PD.read_csv(csv_fname,
                     header=0,
                     parse_dates=[0],
                     date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'),
                     index_col=0)
    df_map = {name: group for name, group in df.groupby('IAGA')}
    for df in df_map.itervalues():
        del df['IAGA']
        df.rename(columns={'N': 'B_N',
                           'E': 'B_E',
                           'Z': 'B_Z'},
                  inplace=True)
    return df_map
def crossed_data():
    '''
    Random effects:
    10 subjects, 12 items, 5 sites
    Subjects crossed with items, nested in sites
    Items crossed with sites

    Fixed effects:
    A continuous predictor, a numeric dummy, and a three-level category
    (levels a, b, c)

    Structure:
    Subjects nested in dummy (e.g., gender), crossed with threecats
    Items crossed with dummy, nested in threecats
    Sites partially crossed with dummy (4/5 see a single dummy, 1/5 sees both
    dummies)
    Sites crossed with threecats
    '''
    from os.path import dirname, join
    data_dir = join(dirname(__file__), 'data')
    data = pd.read_csv(join(data_dir, 'crossed_random.csv'))
    return data
# calcu_3year_average_pe.py (project: chinese-stock-Financial-Index, author: lfh2016)
def calcu_all_stocks_3year_average_profit(year):  # compute the 3-year average profit for all stocks
    path = os.path.join(current_folder, '????%s.csv' % today)
    if not os.path.exists(path):
        data = ts.get_stock_basics()
        lie = ['??', '??', '??', '???', '????', '???',
               '???(?)', '????', '????', '???', '?????', '????', '????',
               '???', '????', '????', '?????', '????(%)', '????(%)',
               '???(%)', '????(%)', '????']
        data.columns = lie
        data.index.names = ['??']
        data.to_csv(path, encoding='utf-8')
    data = pd.read_csv(path, encoding='utf-8', index_col=0)
    # print(data)
    data['????'] = 0
    for index, row in data.iterrows():
        try:
            data.loc[index, '????'] = calcu_3year_average_profit('%06d' % index, year)
        except Exception as e:
            print(e)
            data.loc[index, '????'] = 0
        print('??%s' % index)
    data.to_csv(os.path.join(current_folder, '3????????????%s.csv' % today), encoding='utf-8')
def save_csv_as_dataframe(request):
    print("Save CSV as DataFrame")
    if request.POST:
        # Get CSV URL from post; default to None if not provided
        csv_url = request.POST.get('csv_url', None)
        if csv_url:
            csv_data = pd.read_csv(csv_url)
            print(csv_data)
            # Create Data Frame instance
            data = Data()
            # Add CSV Data to data_frame field
            data.data_frame = csv_data
            data.source_url = csv_url
            # Save Data Frame
            data.save()
def store_test_predictions(self, prediction_id='_final'):
    """
    Stores the test predictions in a CSV file

    :param prediction_id: A simple id appended to the name of the summary for uniqueness
    :return: None
    """
    # prediction id is usually the step count
    print 'Storing predictions on Test Data...'
    review = []
    true_summary = []
    generated_summary = []
    for i in range(self.test_size):
        if not self.checkpointer.is_output_file_present():
            review.append(self._index2sentence(self.test_review[i]))
            true_summary.append(self._index2sentence(self.true_summary[i]))
        if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
            generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
        else:
            generated_summary.append('')
    prediction_nm = 'generated_summary' + prediction_id
    if self.checkpointer.is_output_file_present():
        df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
        df[prediction_nm] = np.array(generated_summary)
    else:
        df = pd.DataFrame()
        df['review'] = np.array(review)
        df['true_summary'] = np.array(true_summary)
        df[prediction_nm] = np.array(generated_summary)
    df.to_csv(self.checkpointer.get_result_location(), index=False)
    print 'Stored the predictions. Moving Forward'
    if prediction_id == '_final':
        print 'All done. Exiting..'
        print 'Exited'
def load_result(self, result_file):
    """
    Load a result CSV and extract the references and hypotheses from it.

    :param result_file: path to the result CSV file
    :return: None
    """
    self.result = pd.read_csv(result_file, header=0)
    self.__scrape_reference()
    self.__scrape_all_hypotheses()
def training_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.train_file_name))

def test_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.test_file_name))

def sorted_training_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.sorted_file_name))
def _reader(self):
    if not self.does_exist():
        return
    dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d %H:%M:%S.%f')
    # parse_dates expects a list of column names, not a bare string
    df = pd.read_csv(self.data_file, parse_dates=['timestamp'], index_col='timestamp', date_parser=dateparse)
    return df
def get_orders(self):
    '''
    get order context information
    '''
    orders = pd.read_csv(self.raw_data_dir + 'orders.csv')
    orders = orders.fillna(0.0)
    orders['days'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
    orders['days_last'] = orders.groupby(['user_id'])['days'].transform(max)
    orders['days_up_to_last'] = orders['days_last'] - orders['days']
    del orders['days_last']
    del orders['days']
    return orders
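# The cumulative-sum trick above is easier to see on a toy frame. A small self-contained
# illustration with made-up orders data (not the real orders.csv):
import pandas as pd

# Two users; days_since_prior_order is NaN for each user's first order, as in the real file.
orders = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'order_number': [1, 2, 3, 1, 2],
    'days_since_prior_order': [float('nan'), 7.0, 3.0, float('nan'), 30.0],
}).fillna(0.0)

# Running total of days per user, then the distance from each order to that user's last order.
orders['days'] = orders.groupby('user_id')['days_since_prior_order'].cumsum()
orders['days_up_to_last'] = orders.groupby('user_id')['days'].transform('max') - orders['days']
print(orders[['user_id', 'order_number', 'days_up_to_last']])
# user 1 -> 10, 3, 0   user 2 -> 30, 0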