def __init__(self, up_basket, up_r_basket = None, up_his = None):
if (up_r_basket is not None) and (up_his is not None):
self.is_reordered_included = True
else:
self.is_reordered_included = False
up_basket['num_baskets'] = up_basket.basket.apply(len)
self.user_id = list(up_basket.user_id)
self.num_baskets = [int(n) for n in list(up_basket.num_baskets)]
self.basket = [[[int(p) for p in b]for b in u] for u in list(up_basket.basket)]
if self.is_reordered_included is True:
up_basket = pd.merge(up_basket, up_r_basket, on = ['user_id'], how = 'left')
up_basket = pd.merge(up_basket, up_his, on = ['user_id'], how = 'left')
self.reorder_basket = [[[int(p) for p in b]for b in u] for u in list(up_basket.reorder_basket)]
self.history_item = [[[int(p) for p in b]for b in u] for u in list(up_basket.history_items)]
def insert_actor_ids(commit_frame, actor_frame, drop_name_email=True):
actor_columns = ['author_name', 'author_email',
'committer_name', 'committer_email']
cf = commit_frame[actor_columns]
af = actor_frame[['name', 'email', 'actor_id']]
author = pd.merge(
cf, af, left_on=actor_columns[:2],
right_on=('name', 'email'),
how='left')['actor_id']
committer = pd.merge(
cf, af, left_on=actor_columns[2:],
right_on=('name', 'email'),
how='left')['actor_id']
commit_frame.insert(3, 'author', author)
commit_frame.insert(4, 'committer', committer)
if drop_name_email:
commit_frame.drop(actor_columns, axis=1, inplace=True)
return commit_frame
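# Hedged usage sketch for insert_actor_ids above, with made-up commit and
# actor frames (all values are assumptions, not project data). It shows how
# the two left merges map (name, email) pairs onto actor ids.
import pandas as pd

commits = pd.DataFrame({
    'hexsha': ['a1', 'b2'],
    'tree': ['t1', 't2'],
    'message': ['fix bug', 'add feature'],
    'author_name': ['Ada', 'Bob'],
    'author_email': ['ada@example.org', 'bob@example.org'],
    'committer_name': ['Ada', 'Ada'],
    'committer_email': ['ada@example.org', 'ada@example.org'],
})
actors = pd.DataFrame({
    'name': ['Ada', 'Bob'],
    'email': ['ada@example.org', 'bob@example.org'],
    'actor_id': [1, 2],
})

print(insert_actor_ids(commits, actors))
# author becomes [1, 2] and committer [1, 1]; the name/email columns are dropped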
def eval(name,clip=False,bar=0.9):
base = pd.read_csv('../input/stage1_solution_filtered.csv')
base['Class'] = np.argmax(base[['class%d'%i for i in range(1,10)]].values,axis=1)
sub = pd.read_csv(name)
#sub = pd.merge(sub,base[['ID','Class']],on="ID",how='right')
#print(sub.head())
y = base['Class'].values
yp = sub[['class%d'%i for i in range(1,10)]].values
if clip:
yp = np.clip(yp,(1.0-bar)/8,bar)
yp = yp/np.sum(yp,axis=1).reshape([yp.shape[0],1])
print(name,cross_entropy(y,yp),multiclass_log_loss(y,yp))
for i in range(9):
y1 = y[y==i]
yp1 = yp[y==i]
print(i,y1.shape,cross_entropy(y1,yp1),multiclass_log_loss(y1,yp1))
def replace(s,n):
seen = pd.read_csv(s)
unseen = pd.read_csv(n)
te = pd.read_csv('../input/stage2_test_variants.csv')
tr = pd.read_csv('../input/training_variants')
unseen = pd.merge(unseen,te,on='ID',how='right')
seen = pd.merge(seen,te,on='ID',how='right')
mask = seen.Gene.isin(tr.Gene)
cols = ['class%d'%i for i in range(1,10)]
seen.loc[~mask,cols] = 0
mask = unseen.Gene.isin(tr.Gene)
unseen.loc[mask,cols] = 0
assert (unseen['ID']==seen['ID']).all()
seen[cols] = seen[cols] + unseen[cols]
seen[cols+['ID']].to_csv('mix.csv',index=False)
def user_info(df):
grouped = df[[user_id, power_consumption]].groupby([user_id], as_index=False)
user_power_mean = grouped.mean()
user_power_median = grouped.median()
user_power_var = grouped.var()
user_power_max = grouped.max()
user_power_min = grouped.min()
user_power_mean = user_power_mean.rename(columns={power_consumption: 'user_power_mean'})
user_power_median = user_power_median.rename(columns={power_consumption: 'user_power_median'})
user_power_var = user_power_var.rename(columns={power_consumption: 'user_power_var'})
user_power_max = user_power_max.rename(columns={power_consumption: 'user_power_max'})
user_power_min = user_power_min.rename(columns={power_consumption: 'user_power_min'})
return pd.merge(user_power_mean, user_power_median).merge(user_power_var).\
merge(user_power_max).merge(user_power_min)
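# Toy illustration of the chained merges above: when pd.merge is called
# without `on=`, it joins on the columns the frames have in common, which
# here is the user id column. The sample data and the module-level column
# names below are assumptions for demonstration only.
import pandas as pd

user_id, power_consumption = 'user_id', 'power_consumption'

sample = pd.DataFrame({
    user_id: [1, 1, 2, 2, 2],
    power_consumption: [10.0, 14.0, 3.0, 5.0, 7.0],
})
print(user_info(sample))
# one row per user with user_power_mean/median/var/max/min columns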
# Per-user power consumption statistics for the previous month (mean, median, variance, max, min)
def user_info_m_p(df):
date2 = df[record_date].map(lambda x: str2time(x)).max()
date1 = datetime.datetime(date2.year, date2.month, 1).date()
grouped = DataView(df).filter_by_record_date2(date1, date2)[[user_id, 'month', power_consumption]].groupby([user_id, 'month'], as_index=False)
user_power_mean_m = grouped.mean()
user_power_median_m = grouped.median()
user_power_var_m = grouped.var()
user_power_max_m = grouped.max()
user_power_min_m = grouped.min()
user_power_mean_m = user_power_mean_m.rename(columns={power_consumption: 'user_power_mean_m_p'})
user_power_median_m = user_power_median_m.rename(columns={power_consumption: 'user_power_median_m_p'})
user_power_var_m = user_power_var_m.rename(columns={power_consumption: 'user_power_var_m_p'})
user_power_max_m = user_power_max_m.rename(columns={power_consumption: 'user_power_max_m_p'})
user_power_min_m = user_power_min_m.rename(columns={power_consumption: 'user_power_min_m_p'})
return pd.merge(user_power_mean_m, user_power_median_m).merge(user_power_var_m).\
merge(user_power_max_m).merge(user_power_min_m).drop('month', axis=1)
def LoadFromTextFile(InputDir):
## raw data
TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
TestData['parcelid'] = TestData['ParcelId']
TestData.drop('ParcelId', axis=1, inplace=True)
PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir,header=0)
for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
if dtype == np.float64:
PropertyData[c] = PropertyData[c].astype(np.float32)
## join dynamic data with static data
TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')
return TrainData,TestData
## class method, save data with pkl format
def planetAndStar(how='inner'):
"""Read the SWEET-Cat and ExoplanetEU databases and merge them.
Input
-----
how : str (default: 'inner')
How to merge the two DataFrames. See pd.merge for documentation
Output
------
d : pd.DataFrame
The merged SWEET-Cat/ExoplanetEU DataFrame
c : list
The columns that can be used for plotting
"""
df, columns = readSC()
deu = readExoplanetEU()
cols = ['stName', 'plMass', 'plRadius', 'period', 'sma', 'eccentricity',
'inclination', 'discovered', 'dist', 'b',
'mag_v', 'mag_i', 'mag_j', 'mag_h', 'mag_k', 'plDensity']
d = pd.merge(df, deu, left_on='Star', right_on='stName', how=how)
d['radius'] = list(map(stellar_radius, d['mass'], d['logg']))
d['teq0'] = d.teff * np.sqrt((d.radius*700000)/(2*d.sma*150000000))
c = columns + cols[1:]
return d, c
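# Standalone sketch of the left_on/right_on join used in planetAndStar, with
# invented star and planet tables (the real readSC/readExoplanetEU data are
# not reproduced here).
import pandas as pd

stars = pd.DataFrame({'Star': ['HD 1', 'HD 2'], 'teff': [5777, 6100]})
planets = pd.DataFrame({'stName': ['HD 1', 'HD 1', 'HD 3'],
                        'plMass': [1.0, 0.5, 2.0]})

merged = pd.merge(stars, planets, left_on='Star', right_on='stName', how='inner')
print(merged)  # the two 'HD 1' planets matched to their host star; 'HD 3' is dropped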
def initialize(self):
""" Perform basic checks on provided attributes.
"""
if isinstance(self.on, str):
self.on = [self.on]
elif not isinstance(self.on,list):
raise Exception('on is not a list of strings. Exit.')
assert len(self.on) > 0, 'not specified on which keys to merge.'
assert (self.how == 'inner' or self.how == 'outer' or self.how == 'left' or self.how == 'right'), \
'how to merge not specified correctly.'
assert len(self.output_collection), 'output_collection not specified.'
# add back on to kwargs, so it's picked up by pandas.
if self.on is not None:
self.kwargs['on'] = self.on
if self.how is not None:
self.kwargs['how'] = self.how
self.log().info('kwargs passed on to pandas merge function are: %s' % self.kwargs )
return StatusCode.Success
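# Sketch of how the validated `on`/`how` kwargs collected above might be
# forwarded to pandas later on (the frames and keys below are made up; the
# link's actual execute step is not shown here).
import pandas as pd

kwargs = {'on': ['key'], 'how': 'inner'}
left = pd.DataFrame({'key': [1, 2], 'a': ['x', 'y']})
right = pd.DataFrame({'key': [2, 3], 'b': ['u', 'v']})
print(pd.merge(left, right, **kwargs))  # a single row, for key == 2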
def merge(self, other):
"""
Merge overlapped guides
For example::
from ggplot import *
gg = ggplot(aes(x='cut', fill='cut', color='cut'), data=diamonds)
gg + stat_bin()
This would create similar guides for fill and color where only
a single guide would do
"""
self.key = pd.merge(self.key, other.key)
duplicated = set(self.override_aes) & set(other.override_aes)
if duplicated:
warn("Duplicated override_aes is ignored.")
self.override_aes.update(other.override_aes)
for ae in duplicated:
self.override_aes.pop(ae)
return self
def cross_join(df1, df2):
"""
Return a dataframe that is a cross between dataframes
df1 and df2
ref: https://github.com/pydata/pandas/issues/5401
"""
if len(df1) == 0:
return df2
if len(df2) == 0:
return df1
# Add as lists so that the new index keeps the items in
# the order that they are added together
all_columns = pd.Index(list(df1.columns) + list(df2.columns))
df1['key'] = 1
df2['key'] = 1
return pd.merge(df1, df2, on='key').loc[:, all_columns]
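# Usage sketch for cross_join above with two tiny made-up frames. Note that
# the helper adds a temporary 'key' column to its inputs; recent pandas
# (>= 1.2) can produce the same result with pd.merge(df1, df2, how='cross').
import pandas as pd

colours = pd.DataFrame({'colour': ['red', 'blue']})
sizes = pd.DataFrame({'size': ['S', 'M', 'L']})
print(cross_join(colours, sizes))  # 2 x 3 = 6 rows, every colour/size pair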
def enrich(self, column, projects):
""" This method adds a new column named as 'project'
that contains information about the associated project
that the event in 'column' belongs to.
:param column: column with information related to the project
:type column: string
:param projects: information about item - project
:type projects: pandas.DataFrame
:returns: original data frame with a new column named 'project'
:rtype: pandas.DataFrame
"""
if column not in self.data.columns:
return self.data
self.data = pandas.merge(self.data, projects, how='left', on=column)
return self.data
def enrich(self, columns):
""" Merges the original dataframe with corresponding entity uuids based
on the given columns. Also merges other additional information
associated to uuids provided in the uuids dataframe, if any.
:param columns: columns to match for merging
:type columns: list of strings
:return: original dataframe with at least one new column:
* uuid: identity unique identifier
:rtype: pandas.DataFrame
"""
for column in columns:
if column not in self.data.columns:
return self.data
self.data = pandas.merge(self.data, self.uuids_df, how='left', on=columns)
self.data = self.data.fillna("notavailable")
return self.data
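# Standalone sketch of the enrichment pattern above: left-merge identity
# information onto events and fill unmatched rows. Column names and values
# are invented for illustration.
import pandas as pd

events = pd.DataFrame({'username': ['alice', 'bob', 'carol']})
uuids_df = pd.DataFrame({'username': ['alice', 'bob'], 'uuid': ['u-1', 'u-2']})

enriched = pd.merge(events, uuids_df, how='left', on=['username'])
enriched = enriched.fillna('notavailable')
print(enriched)  # carol has no identity record, so uuid == 'notavailable'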
def touch_open_indexes(self):
"""
Flag the codes whose current quotes trigger the open-position indicator.
:return:
"""
quo = self.quotation
assert isinstance(quo, pd.DataFrame)
if self.open_indexes == self.INDEXES_TYPE_MA:
# moving-average based open signal
ma = pd.merge(self.codes[["open_ma", "close"]], quo[["bid1"]], left_index=True, right_index=True)
# open a position when the current bid (bid1) is above the open moving average
ma["open_position"] = ma.bid1 > ma.open_ma
self.codes.open_position = ma.open_position
if self.debug:
open_num = ma.open_position.value_counts()[True]
self.log.debug("%s codes hit the open-position indicator" % open_num)
def get_sell_order(self):
"""
Build the list of holdings that should be sold.
:param assert_balance: asset balance
:return:
"""
codes = self.codes[self.codes.times != self.codes.exc_times]
codes["change"] = codes.exc_times - codes.times
sell_codes = codes[codes.change < 0]
# join the sell candidates with the current quotation
sell_codes = pd.merge(sell_codes, self.quotation, left_index=True, right_index=True)
# rank candidates with exc_times < -1, highest holding count first
sell_priority_index = sell_codes[sell_codes.exc_times < -1].times.argsort()[::-1]
# reorder by that ranking; take() returns a new frame, so assign the result
sell_codes = sell_codes.take(sell_priority_index)
return sell_codes
def export_data_unresolved():
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()
connection = db_work_view._db_connection
df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)
df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')
df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)
sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on= ['source_article_name','target_article_name'], right_on=['prev', 'curr_unresolved'])
sample_unresoleved['n'].fillna(0, inplace=True)
sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t',encoding="utf-8")
def pickle_correlations_zeros_january():
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
print 'read'
df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
print 'loaded links'
df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
print 'loaded counts'
result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
print 'merged counts'
print result
article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
article_counts['counts'].fillna(0.0, inplace=True)
print article_counts
print 'write to file'
article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
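# Toy illustration of the merge/aggregation step above: links with no
# clickstream match get NaN counts after the left merge, and groupby().sum()
# already treats those NaNs as 0. The data below is made up.
import pandas as pd

links = pd.DataFrame({'source_article_id': [1, 1, 2],
                      'target_article_id': [10, 11, 10]})
clicks = pd.DataFrame({'prev_id': [1], 'curr_id': [10], 'counts': [7]})

result = pd.merge(links, clicks, how='left',
                  left_on=['source_article_id', 'target_article_id'],
                  right_on=['prev_id', 'curr_id'])
article_counts = result.groupby('target_article_id')['counts'].sum().reset_index()
print(article_counts)  # target 10 -> 7.0, target 11 -> 0.0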
def antiSMASH_to_dataFrame(content):
""" Extract an antiSMASH file as a pandas.DataFrame
"""
parsed = parse_antiSMASH(content)
output = pd.DataFrame()
for cs in parsed['SignificantHits']:
clusterSubject = parsed['SignificantHits'][cs].copy()
df = pd.merge(
pd.DataFrame(clusterSubject['BlastHit']),
pd.DataFrame(clusterSubject['TableGenes']),
on='subject_gene', how='outer')
del(clusterSubject['BlastHit'])
del(clusterSubject['TableGenes'])
for v in clusterSubject:
df[v] = clusterSubject[v]
output = output.append(df, ignore_index=True)
return output
def get_basicroom_feature(data):
t1=data[['orderid','basicroomid']].drop_duplicates()[['basicroomid']]
t1['basicroomid_unique_order_cnt']=1
t1=t1.groupby(['basicroomid']).agg('sum').reset_index()
t2=data[['orderdate','basicroomid']].drop_duplicates()[['basicroomid']]
t2['basicroomid_unique_orderdate_cnt']=1
t2=t2.groupby(['basicroomid']).agg('sum').reset_index()
t3=data[['uid','basicroomid']].drop_duplicates()[['basicroomid']]
t3['basicroomid_unique_user_cnt']=1
t3=t3.groupby(['basicroomid']).agg('sum').reset_index()
t4=data[['basicroomid','roomid']].drop_duplicates()[['basicroomid']]
t4['basicroomid_unique_roomid_cnt']=1
t4=t4.groupby(['basicroomid']).agg('sum').reset_index()
# basicroom_feature=pd.merge(t,t1,on='basicroomid')
basicroom_feature=pd.merge(t1,t2,on='basicroomid')
basicroom_feature=pd.merge(basicroom_feature,t3,on='basicroomid')
basicroom_feature=pd.merge(basicroom_feature,t4,on='basicroomid')
return basicroom_feature
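# An equivalent, more direct way to build the same unique-count features as
# get_basicroom_feature, sketched with named aggregation (pandas >= 0.25);
# the column names are the ones assumed by the snippet above.
import pandas as pd

def get_basicroom_feature_nunique(data):
    # distinct orders / order dates / users / physical rooms per basic room
    return data.groupby('basicroomid').agg(
        basicroomid_unique_order_cnt=('orderid', 'nunique'),
        basicroomid_unique_orderdate_cnt=('orderdate', 'nunique'),
        basicroomid_unique_user_cnt=('uid', 'nunique'),
        basicroomid_unique_roomid_cnt=('roomid', 'nunique'),
    ).reset_index()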
def preprocess(rawdatapath):
rawfacts = get_raw(os.path.join(rawdatapath, "facts.json"))
rawmetadata = get_raw(os.path.join(rawdatapath, "metadata.json"))
parsed_facts = rawfacts.join(pd.DataFrame(rawfacts["_source"].to_dict()).T).drop("_source", axis=1)
parsed_metadata = rawmetadata.join(pd.DataFrame(rawmetadata["_source"].to_dict()).T).drop("_source", axis=1)
parsed_metadata.rename(columns={"title":"articleTitle"}, inplace=True)
clean(parsed_facts)
clean(parsed_metadata)
parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journalInfo"].to_dict()).T).drop("journalInfo", axis=1)
clean(parsed_metadata)
parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journal"].to_dict()).T).drop("journal", axis=1)
clean(parsed_metadata)
df = pd.merge(parsed_facts, parsed_metadata, how="inner", on="cprojectID", suffixes=('_fact', '_meta'))
df.rename(columns={"title":"journalTitle"}, inplace=True)
df["sourcedict"] = get_dictionary(df)
df["term"] = df["term"].map(str.lower)
df["wikidataID"] = get_wikidataIDs(df)
df.drop_duplicates("_id_fact", inplace=True)
return df
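# Minimal illustration of the suffixes= behaviour relied on above: both
# inputs carry a 'title' column, and the inner merge disambiguates them as
# title_fact / title_meta. The frames here are invented.
import pandas as pd

facts = pd.DataFrame({'cprojectID': ['p1'], 'title': ['a fact title']})
metadata = pd.DataFrame({'cprojectID': ['p1'], 'title': ['a journal title']})

merged = pd.merge(facts, metadata, how='inner', on='cprojectID',
                  suffixes=('_fact', '_meta'))
print(merged.columns.tolist())  # ['cprojectID', 'title_fact', 'title_meta']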
def get_hs300s():
"""
Get the current HS300 (CSI 300) index constituents and their weights
Return
--------
DataFrame
code :stock code
name :stock name
date :date
weight:index weight
"""
from tushare.stock.fundamental import get_stock_basics
try:
wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
ct.PAGES['hs300w']), parse_cols=[0, 3, 6])
wt.columns = ct.FOR_CLASSIFY_W_COLS
wt['code'] = wt['code'].map(lambda x :str(x).zfill(6))
df = get_stock_basics()[['name']]
df = df.reset_index()
return pd.merge(df,wt)
except Exception as er:
print(str(er))
def get_zz500s():
"""
Get the current CSI 500 (ZZ500) index constituents
Return
--------
DataFrame
code :stock code
name :stock name
"""
from tushare.stock.fundamental import get_stock_basics
try:
# df = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
# ct.PAGES['zz500b']), parse_cols=[0,1])
# df.columns = ct.FOR_CLASSIFY_B_COLS
# df['code'] = df['code'].map(lambda x :str(x).zfill(6))
wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
ct.PAGES['zz500wt']), parse_cols=[0, 3, 6])
wt.columns = ct.FOR_CLASSIFY_W_COLS
wt['code'] = wt['code'].map(lambda x :str(x).zfill(6))
df = get_stock_basics()[['name']]
df = df.reset_index()
return pd.merge(df,wt)
except Exception as er:
print(str(er))
def setUp(self):
self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
self.A = pd.DataFrame([{'l_id': 1, 'l_attr':'ab cd ef aa bb'},
{'l_id': 2, 'l_attr':''},
{'l_id': 3, 'l_attr':'ab'},
{'l_id': 4, 'l_attr':'ll oo pp'},
{'l_id': 5, 'l_attr':'xy xx zz fg'},
{'l_id': 6, 'l_attr':pd.np.NaN}])
self.B = pd.DataFrame([{'r_id': 1, 'r_attr':'mn'},
{'r_id': 2, 'r_attr':'he ll'},
{'r_id': 3, 'r_attr':'xy pl ou'},
{'r_id': 4, 'r_attr':'aa'},
{'r_id': 5, 'r_attr':'fg cd aa ef'},
{'r_id': 6, 'r_attr':None}])
# generate cartesian product A x B to be used as candset
self.A['tmp_join_key'] = 1
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
def label_feature_producer(start_date, end_date, features, labels):
labeller = Labeller(start_date, end_date, labels)
dataset = labeller.get_labels()
dataset['training_end_date'] = start_date
dataset['labeling_end_date'] = end_date
# dataset
feature_grabber = feature_processor.FeatureGrabber(start_date, engine,config_db, con)
feature_name_dictionary = {}
#print dataset
for feature in features:
#print feature
res_training, feature_names_training = feature_grabber.getFeature(feature)
feature_name_dictionary[feature] = feature_names_training
res_training.drop_duplicates(inplace = True)
dataset = pd.merge(dataset, res_training, on = config_db['id_column'],
how = 'left')
return dataset, feature_name_dictionary
#return None, None
def foo1():
# merge the MovieLens ratings, users and movies tables into a single DataFrame
data = pd.merge(pd.merge(ratings, users), movies)
# mean rating per film split by gender, as a title x gender pivot table
mean_rating = data.pivot_table('rating', columns='gender', index='title', aggfunc='mean')
# keep only titles that received at least 250 ratings
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
# restrict the mean ratings to those actively rated titles
mean_rating = mean_rating.loc[active_titles]
# titles ranked by the mean rating from female viewers, highest first
top_female_ratings = mean_rating.sort_values(by='F', ascending=False)
print(top_female_ratings)
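# Self-contained toy version of the analysis above, with made-up MovieLens-
# style frames (column names mirror the snippet; the values are invented).
import pandas as pd

ratings = pd.DataFrame({'user_id': [1, 1, 2, 2],
                        'movie_id': [10, 20, 10, 20],
                        'rating': [5, 3, 4, 2]})
users = pd.DataFrame({'user_id': [1, 2], 'gender': ['F', 'M']})
movies = pd.DataFrame({'movie_id': [10, 20], 'title': ['Alpha', 'Beta']})

data = pd.merge(pd.merge(ratings, users), movies)
mean_rating = data.pivot_table('rating', columns='gender', index='title',
                               aggfunc='mean')
print(mean_rating.sort_values(by='F', ascending=False))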
def to_df(self):
"""
Get a dataframe containing the states and value of all nodes of computation
::
>>> comp = loman.Computation()
>>> comp.add_node('foo', value=1)
>>> comp.add_node('bar', value=2)
>>> comp.to_df()
state value is_expansion
bar States.UPTODATE 2 NaN
foo States.UPTODATE 1 NaN
"""
df = pd.DataFrame(index=nx.topological_sort(self.dag))
df[_AN_STATE] = pd.Series(nx.get_node_attributes(self.dag, _AN_STATE))
df[_AN_VALUE] = pd.Series(nx.get_node_attributes(self.dag, _AN_VALUE))
df_timing = pd.DataFrame.from_dict(nx.get_node_attributes(self.dag, 'timing'), orient='index')
df = pd.merge(df, df_timing, left_index=True, right_index=True, how='left')
return df
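# Standalone sketch of the index-on-index left merge used in to_df, with
# hypothetical node names and timing records.
import pandas as pd

df = pd.DataFrame({'state': ['UPTODATE', 'UPTODATE'], 'value': [2, 1]},
                  index=['bar', 'foo'])
df_timing = pd.DataFrame({'duration': [0.5]}, index=['foo'])

merged = pd.merge(df, df_timing, left_index=True, right_index=True, how='left')
print(merged)  # 'bar' keeps a NaN duration because it has no timing record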
def hashtags_multi(search_id):
ids = [search_id]
ids.extend(request.args.getlist('id'))
in_clause = ','.join([str(i) for i in ids])
searches = query("""
SELECT id, date_path, text
FROM searches WHERE id in (%s)
""" % in_clause)
summary = []
search = searches[0]
summary.append({'id': search['id'], 'date_path': search['date_path'],
'text': search['text'],
'colname': 'count_%s' % search['id']})
d = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
d = d.rename(columns={'count': 'count_%s' % search['id']})
for search in searches[1:]:
summary.append({'id': search['id'], 'date_path': search['date_path'],
'text': search['text'],
'colname': 'count_%s' % search['id']})
e = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
e = e.rename(columns={'count': 'count_%s' % search['id']})
d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
d.sort_values(by='count_%s' % search_id, inplace=True, ascending=False)
result = {'summary': summary, 'hashtags': d.to_dict(orient='records')}
return jsonify(result)
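# Toy sketch of the outer-merge accumulation in hashtags_multi: one count
# column per search, with unmatched hashtags filled with 0 (invented data).
import pandas as pd

d = pd.DataFrame({'hashtag': ['#a', '#b'], 'count_1': [5, 2]})
e = pd.DataFrame({'hashtag': ['#b', '#c'], 'count_2': [7, 1]})

d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
d.sort_values(by='count_1', ascending=False, inplace=True)
print(d.to_dict(orient='records'))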
def censor_diagnosis(genotype_file,phenotype_file,final_pfile, field ='na',start_time=float('nan'),end_time=float('nan')):
import pandas as pd
import numpy as np
genotypes = pd.read_csv(genotype_file)
phenotypes = pd.read_csv(phenotype_file)
mg=pd.merge(phenotypes,genotypes,on='id')
if np.isnan(start_time) and np.isnan(end_time):
print("Choose appropriate time period")
if field=='na':
if np.isfinite(start_time) and np.isnan(end_time):
final = mg[mg['AgeAtICD']>start_time]
elif np.isnan(start_time) and np.isfinite(end_time):
final = mg[mg['AgeAtICD']<end_time]
else:
final = mg[(mg['AgeAtICD']>start_time)&(mg['AgeAtICD']<end_time)]
else:
mg['diff']=mg[field]-mg['AgeAtICD']
if np.isfinite(start_time) and np.isnan(end_time):
final = mg[(mg['diff']>start_time)|(np.isnan(mg['diff']))]
elif np.isnan(start_time) and np.isfinite(end_time):
final = mg[(mg['diff']<end_time)|(np.isnan(mg['diff']))]
else:
final = mg[(mg['diff']>start_time)&(mg['diff']<end_time)|(np.isnan(mg['diff']))]
final[['id','icd9','AgeAtICD']].to_csv(final_pfile)
def main():
# read and preprocess the movie data
movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
movie = movie_preprocessing(movie)
# read the ratings data and merge it with movie data
rating = pd.read_table("ratings.dat", sep="::",
names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
data = pd.merge(rating, movie, on="movie_id")
# extract feature from our data set
streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
user_feature.to_csv("user_feature.csv", sep='\t')
pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
reward_list.to_csv("reward_list.csv", sep='\t', index=False)
action_context = movie[movie['movie_id'].isin(actions)]
action_context.to_csv("action_context.csv", sep='\t', index = False)