def to_excel(self,filename,predictColumns=[],statsColumns=[],models=[]):
'''Export model predictions and statistics to an Excel workbook with one
worksheet for each model. Preferred method of creating readable output.
Input:
filename (str) = name of Excel workbook to create
predictColumns (list) = labels from the pandas dataframes to override
automatic alphabetization of all dataframe labels (default behavior)
statsColumns (list) = labels from the stats pandas dataframes to override write
(same behavior as predictColumns)
models (list) = models to export, if [], to_excel writes all predicted models by default
Output:
A Excel workbook with model predictions and statistics.'''
assert isinstance(filename,str), "Filename provided, {}, for export() needs to be a string.".format(filename)
if not models:
models = self.predictions.keys()
else:
for m in models:
assert m in self.predictions.keys(), "Model {} was not tested.".format(m)
if filename[-4:] == ".xlsx":
fn = filename
else:
fn = filename + ".xlsx"
writer = pandas.ExcelWriter(fn)
if predictColumns:
for model in models:
self.predictions[model].to_excel(writer,sheet_name=model,columns=predictColumns)
if statsColumns:
for model in models:
if model in self.statistics.keys():
self.statistics[model].to_excel(writer,sheet_name="{}-stats".format(model),columns=statsColumns)
else:
print "Functional form for {} was not specified. Not writing stats.".format(model)
writer.save()
python类ExcelWriter()的实例源码
def close(self, format='csv'):
import pandas as pd
if format == 'csv':
pd.DataFrame.from_dict(self._data, orient='index').to_csv(self._file)
elif format == 'json':
pd.DataFrame.from_dict(self._data, orient='index').to_json(self._file)
else:
writer = pd.ExcelWriter(self._file)
pd.DataFrame.from_dict(self._data, orient='index').to_excel(writer)
writer.save()
def main(path_to_data, column_to_deal, output_file, input_ncores):
'''
??logging??
'''
logger = logging.getLogger('mylogger')
logger.setLevel(logging.INFO)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('[%(levelname)-3s]%(asctime)s %(filename)s[line:%(lineno)d]:%(message)s')
console.setFormatter(formatter)
logger.addHandler(console)
'''
??????
'''
data = pd.read_excel(path_to_data)
logger.info("??????...")
re_sub_vec = np.vectorize(re_sub) # ?????
data[column_to_deal] = re_sub_vec(data[column_to_deal])
logger.info("??????...")
data['content_list'] = data[column_to_deal].map(sentence_split)
seg_word = jieba4null(n_core = input_ncores)
data.loc[:,'seg_words'] = data['content_list'].map(seg_word.cut_sentence)
logger.info("????????...")
pool = Pool(input_ncores)
worker = polar_classifier()
data['sentiment'] = pool.map(worker.multi_list_classify, data['seg_words'])
data = data.drop(['content_list','seg_words'], axis = 1)
logger.info("????????...")
writer = pd.ExcelWriter(output_file)
data.to_excel(writer, sheet_name='sheet1', encoding='utf-8', index=False)
writer.save()
logger.info("Task done!")
def _export_xlsx(self):
try:
import openpyxl
except ImportError:
msg = QtWidgets.QMessageBox()
msg.setIcon(QtWidgets.QMessageBox.Critical)
msg.setText("Feature Not Available")
msg.setInformativeText("The Python package openpyxl must be "
"installed to enable Excel export. Use "
"CSV export instead.")
msg.setWindowTitle("Error")
msg.exec_()
else:
from pandas import ExcelWriter
fp, _ = QtWidgets.QFileDialog.getSaveFileName(self.widget,
'Export XLSX')
if not fp:
return
# Write each event stream to a different spreadsheet in one
# Excel document.
writer = ExcelWriter(fp)
tables = {d['name']: self._db.get_table(self._header,
stream_name=d['name'])
for d in self._header.descriptors}
for name, df in tables.items():
df.to_excel(writmer, name)
writer.save()
def generate_report(result_dict, target_report_csv_path):
from six import StringIO
output_path = os.path.join(target_report_csv_path, result_dict["summary"]["strategy_name"])
try:
os.mkdir(output_path)
except:
pass
xlsx_writer = pd.ExcelWriter(os.path.join(output_path, "report.xlsx"), engine='xlsxwriter')
# summary.csv
csv_txt = StringIO()
summary = result_dict["summary"]
csv_txt.write(u"\n".join(sorted("{},{}".format(key, value) for key, value in six.iteritems(summary))))
df = pd.DataFrame(data=[{"val": val} for val in summary.values()], index=summary.keys()).sort_index()
df.to_excel(xlsx_writer, sheet_name="summary")
with open(os.path.join(output_path, "summary.csv"), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
for name in ["total_portfolios", "stock_portfolios", "future_portfolios",
"stock_positions", "future_positions", "trades"]:
try:
df = result_dict[name]
except KeyError:
continue
# replace all date in dataframe as string
if df.index.name == "date":
df = df.reset_index()
df["date"] = df["date"].apply(lambda x: x.strftime("%Y-%m-%d"))
df = df.set_index("date")
csv_txt = StringIO()
csv_txt.write(df.to_csv(encoding='utf-8'))
df.to_excel(xlsx_writer, sheet_name=name)
with open(os.path.join(output_path, "{}.csv".format(name)), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
# report.xls <--- ??sheet???
xlsx_writer.save()
def generate_report(result_dict, target_report_csv_path):
from six import StringIO
output_path = os.path.join(target_report_csv_path, result_dict["summary"]["strategy_name"])
try:
os.mkdir(output_path)
except:
pass
xlsx_writer = pd.ExcelWriter(os.path.join(output_path, "report.xlsx"), engine='xlsxwriter')
# summary.csv
csv_txt = StringIO()
summary = result_dict["summary"]
csv_txt.write(u"\n".join(sorted("{},{}".format(key, value) for key, value in six.iteritems(summary))))
df = pd.DataFrame(data=[{"val": val} for val in summary.values()], index=summary.keys()).sort_index()
df.to_excel(xlsx_writer, sheet_name="summary")
with open(os.path.join(output_path, "summary.csv"), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
for name in ["total_portfolios", "stock_portfolios", "future_portfolios",
"stock_positions", "future_positions", "trades"]:
try:
df = result_dict[name]
except KeyError:
continue
# replace all date in dataframe as string
if df.index.name == "date":
df = df.reset_index()
df["date"] = df["date"].apply(lambda x: x.strftime("%Y-%m-%d"))
df = df.set_index("date")
csv_txt = StringIO()
csv_txt.write(df.to_csv(encoding='utf-8'))
df.to_excel(xlsx_writer, sheet_name=name)
with open(os.path.join(output_path, "{}.csv".format(name)), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
# report.xls <--- ??sheet???
xlsx_writer.save()
def add_editor_list_to_excel(case=None):
'''save editor tool list order to the excel input file, "proposals.xlsx".
The list order will be saved to a new worksheet, "edit". Subsequent saved
lists will overwrite previous worksheets. Change the worksheet name of
previously saved worksheets from "edit" to something else prior to running
this function if they are to be preserved within the workbook.
The routine reads the case_dill.pkl file - this provides a write path to
the correct case study folder and excel "proposals.xlsx" file.
Then the routine reads the editor-produced p_new_order.pkl file and writes
it to the new worksheet "edit" in the proposals.xlsx file.
input
case (string)
The case study name (and consequently, the write file path).
This variable will default to the stored case study name contained
within the "dill/case_dill.pkl" file if no input is supplied by
the user.
'''
if not case:
try:
case = pd.read_pickle('dill/case_dill.pkl').case.value
except OSError:
print('case variable not found,',
'tried to find it in "dill/case_dill.pkl"',
'without success\n')
return
xl_str = 'excel/' + case + '/proposals.xlsx'
df = pd.read_pickle('dill/p_new_order.pkl')
df = df.reset_index()[['empkey']]
df.index = df.index + 1
df.index.name = 'order'
ws_dict = pd.read_excel(xl_str, index_col=0, sheetname=None)
ws_dict['edit'] = df
with pd.ExcelWriter(xl_str, engine='xlsxwriter') as writer:
for ws_name, df_sheet in ws_dict.items():
df_sheet.to_excel(writer, sheet_name=ws_name)
# Pretty print a dictionary...
def update_excel(case,
file,
ws_dict={},
sheets_to_remove=None):
'''Read an excel file, optionally remove worksheet(s), add worksheets
or overwrite worksheets with a dictionary of ws_name, dataframe key, value
pairs, and write the excel file back to disk
inputs
case (string)
the data model case name
file (string)
the excel file name without the .xlsx extension
ws_dict (dictionary)
dictionary of worksheet names as keys and pandas dataframes as
values. The items in this dictionary will be passed into the
excel file as worksheets. The worksheet name keys may be the
same as some or all of the worksheet names in the excel file.
In the case of matching names, the data from the input dict will
overwrite the existing data (worksheet) in the excel file.
Non-overlapping worksheet names/dataframe values will be added
as new worksheets.
sheets_to_remove (list)
a list of worksheet names (strings) representing worksheets to
remove from the excel workbook. It is not necessary to remove
sheets which are being replaced by worksheet with the same name.
'''
# read a single or multi-sheet excel file
# (returns dict of sheetname(s), dataframe(s))
path = 'excel/' + case + '/' + file + '.xlsx'
# make a copy of file before modifying
copy_excel_file(case, file, verbose=False)
# get a dictionary from the excel file consisting of worksheet name keys
# and worksheet contents as values (as dataframes)
try:
dict0 = pd.read_excel(path, sheetname=None)
except OSError:
print('Error: Unable to find "' + path + '"')
return
# all worksheets are now accessible as dataframes.
# drop worksheets which match an element in the sheets_to_remove:
if sheets_to_remove is not None:
for ws_name in sheets_to_remove:
dict0.pop(ws_name, None)
# update worksheet dictionary with ws_dict (ws_dict values will override
# existing values in the case of matching worksheet name keys):
dict0.update(ws_dict)
# write the updated dictionary back to excel...
with pd.ExcelWriter(path,
engine='xlsxwriter',
datetime_format='yyyy-mm-dd',
date_format='yyyy-mm-dd') as writer:
for sheet_name, df_sheet in dict0.items():
df_sheet.to_excel(writer, sheet_name=sheet_name)
def get(self, request):
user = request.user
bytes_io = io.BytesIO()
writer = pd.ExcelWriter(bytes_io, engine='xlsxwriter', options={'remove_timezone': True})
# supplement events
supplement_events_worksheet_name = 'SupplementEvents'
supplement_events = SupplementLog.objects.filter(user=user)
df_builder = SupplementEventsDataframeBuilder(supplement_events)
supplement_events_df = df_builder.get_flat_daily_dataframe()
self._write_to_workbook(writer, supplement_events_df, supplement_events_worksheet_name)
# sleep events
sleep_activities_worksheet_name = 'SleepActivities'
sleep_activities = SleepLog.objects.filter(user=user)
df_builder = SleepActivityDataframeBuilder(sleep_activities)
sleep_activities_series = df_builder.get_sleep_history_series()
self._write_to_workbook(writer, sleep_activities_series, sleep_activities_worksheet_name)
# user activity events
user_activity_events_sheet_name = 'UserActivityEvents'
user_activity_events = UserActivityLog.objects.filter(user=user)
df_builder = UserActivityEventDataframeBuilder(user_activity_events)
user_activity_events_df = df_builder.get_flat_daily_dataframe()
self._write_to_workbook(writer, user_activity_events_df, user_activity_events_sheet_name)
# productivity logs
productivity_log_sheet_name = 'DailyProductivityLog'
productivity_log = DailyProductivityLog.objects.filter(user=user)
df_builder = ProductivityLogEventsDataframeBuilder(productivity_log)
# odd why this one isn't sorted the right way
productivity_log_df = df_builder.get_flat_daily_dataframe().sort_index(ascending=True)
self._write_to_workbook(writer, productivity_log_df, productivity_log_sheet_name)
all_dataframes = [productivity_log_df, supplement_events_df, user_activity_events_df]
concat_dataframe = pd.concat(all_dataframes, axis=1)
# include sleep which is a series and not a dataframe
cumulative_log_sheet_name = 'Aggregate Log'
concat_dataframe[SLEEP_MINUTES_COLUMN] = sleep_activities_series
self._write_to_workbook(writer, concat_dataframe, cumulative_log_sheet_name)
cumulative_14_day_dataframe_sheet_name = 'Aggregate 14 Log'
cumulative_14_day_dataframe = concat_dataframe.rolling(window=14, min_periods=1).sum()[14:]
self._write_to_workbook(writer, cumulative_14_day_dataframe, cumulative_14_day_dataframe_sheet_name)
cumulative_28_day_dataframe_sheet_name = 'Aggregate 28 Log'
cumulative_28_day_dataframe = concat_dataframe.rolling(window=28, min_periods=1).sum()[28:]
self._write_to_workbook(writer, cumulative_28_day_dataframe, cumulative_28_day_dataframe_sheet_name)
# make sure all the output gets writen to bytes io
writer.close()
# http response because we are providing data and not doing any template / rendering
response = HttpResponse(
bytes_io.getvalue(),
content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
response['Content-Disposition'] = 'attachment; filename=user_export_data.xlsx'
return response
def road_runner(roadbox, exectime, savepath, timestep='sec', st=True, sm=True, bar=True, ownfun=None):
print _tips.INFO('Process start...', 'GREEN')
if ownfun != None and bar is False:
raise ValueError(_tips.INFO("ownfun isn't None while bar is False", 'RED'))
if bar is True:
loop = trange(exectime)
info_d = loop.set_description
info_p = loop.set_postfix
info_r = loop.refresh
info_w = loop.write
else:
loop = range(exectime)
info_d = __empty
info_p = __empty
info_r = __empty
info_w = __empty
roadstbox = []
summarydata = pd.DataFrame(KEY)
tsdata = pd.DataFrame({'ROAD_HASH_ID':[], 'LANE_ID':[], 'TIME_STAMP':[], 'LOCATE':[]})
writer = pd.ExcelWriter(savepath+'.xlsx')
for road in roadbox:
rds = RoadStatus(road,timestep=timestep)
roadstbox.append(rds)
info_p(stdata=str(st), summarydata=str(sm))
info_d(_tips.INFO('Collecting data', 'GREEN'))
for i in loop:
try:
for road in roadbox:
road.reflush_status()
for stat in roadstbox:
if sm is True:
temp = stat.summary()
if len(temp) != 0:
summarydata = summarydata.append(temp)
if st is True:
temp = stat.get_time_space()
tsdata = tsdata.append(temp)
if ownfun is not None:
info_w(ownfun())
except:
info_d(_tips.INFO('FAILED!', 'RED'))
info_r()
raise KeyError
print _tips.INFO('Start writing data...', 'BLUE')
summarydata.to_excel(writer, 'SummaryData', index=False)
tsdata.to_excel(writer, 'SpaceTimeData', index=False)
writer.save()
print _tips.INFO('Done', 'GREEN')
def make_minimal_neoepitope_report(
ranked_variants_with_vaccine_peptides,
num_epitopes_per_peptide=None,
excel_report_path=None):
"""
Creates a simple Excel spreadsheet containing one neoepitope per row
Parameters
----------
ranked_variants_with_vaccine_peptides :
Ranked list of (variant, list of its vaccine peptides)
num_epitopes_per_peptide : int
The number of epitopes to include for each vaccine peptide; these are sorted before cutoff.
If None, all epitopes will be included in the output
excel_report_path : str
Path to which to write the output Excel file
"""
rows = []
# each row in the spreadsheet is a neoepitope
for (variant, vaccine_peptides) in ranked_variants_with_vaccine_peptides:
for vaccine_peptide in vaccine_peptides:
# only include mutant epitopes
for epitope_prediction in vaccine_peptide.mutant_epitope_predictions:
row = OrderedDict([
('Allele', epitope_prediction.allele),
('Mutant peptide sequence', epitope_prediction.peptide_sequence),
('Score', vaccine_peptide.mutant_epitope_score),
('Predicted mutant pMHC affinity', '%.2f nM' % epitope_prediction.ic50),
('Variant allele RNA read count',
vaccine_peptide.mutant_protein_fragment.n_alt_reads),
('Wildtype sequence', epitope_prediction.wt_peptide_sequence),
('Predicted wildtype pMHC affinity',
'%.2f nM' % epitope_prediction.wt_ic50),
('Gene name', vaccine_peptide.mutant_protein_fragment.gene_name),
('Genomic variant', variant.short_description),
])
rows.append(row)
if len(rows) > 0:
df = pd.DataFrame.from_dict(rows)
writer = pd.ExcelWriter(excel_report_path, engine='xlsxwriter')
df.to_excel(writer, sheet_name='Neoepitopes', index=False)
# resize columns to be not crappy
worksheet = writer.sheets['Neoepitopes']
worksheet.set_column('%s:%s' % ('B', 'B'), 23)
worksheet.set_column('%s:%s' % ('D', 'D'), 27)
worksheet.set_column('%s:%s' % ('E', 'E'), 26)
worksheet.set_column('%s:%s' % ('F', 'F'), 17)
worksheet.set_column('%s:%s' % ('G', 'G'), 30)
worksheet.set_column('%s:%s' % ('H', 'H'), 9)
worksheet.set_column('%s:%s' % ('I', 'I'), 18)
writer.save()
logger.info('Wrote XLSX neoepitope report file to %s', excel_report_path)
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
float_format=None, columns=None, header=True, index=True,
index_label=None, startrow=0, startcol=0, engine=None,
merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
**kwargs):
"""
Monkeypatched DataFrame.to_excel by xl_link!
Changes:
--------
Returns
-------
XLMap
corresponding to position of frame as it appears in excel (see XLMap for details)
See Also
--------
Pandas.DataFrame.to_excel for info on parameters
Note
----
When providing a path as excel_writer, default engine used is 'xlsxwriter', as xlsxwriter workbooks can only be
saved once, xl_link suppresses calling `excel_writer.save()`, as a result, `xlmap.writer.save()` should be
called once no further changes are to be made to the spreadsheet.
"""
if isinstance(excel_writer, pd.ExcelWriter):
need_save = False
else:
excel_writer = pd.ExcelWriter(_stringify_path(excel_writer), engine=engine)
need_save = True if excel_writer.engine != 'xlsxwriter' else False # xlsxwriter can only save once!
super().to_excel(excel_writer, sheet_name=sheet_name, na_rep=na_rep,
float_format=float_format, columns=columns, header=header, index=index,
index_label=index_label, startrow=startrow, startcol=startcol, engine=engine,
merge_cells=merge_cells, encoding=encoding, inf_rep=inf_rep, verbose=verbose,
**kwargs)
if need_save:
excel_writer.save()
data_range, index_range, col_range, _ = get_xl_ranges(self.index, self.columns,
sheet_name=sheet_name,
columns=columns,
header=header,
index=index,
index_label=index_label,
startrow=startrow,
startcol=startcol,
merge_cells=merge_cells)
f = self.copy()
if isinstance(columns, list) or isinstance(columns, tuple):
f = f[columns]
return XLMap(data_range, index_range, col_range, f, writer=excel_writer)
def make_interpolate_curve(self):
""" interpolate contour curves """
lcx = np.zeros(0)
lcy = np.zeros(0)
lcx = np.append(lcx, np.array(self.lower_concave_in_x)[::-1])
lcx = np.append(lcx, self.lower_arc_x)
lcx = np.append(lcx, np.array(self.lower_concave_out_x))
lcy = np.append(lcy, np.array(self.lower_concave_in_y)[::-1])
lcy = np.append(lcy, self.lower_arc_y)
lcy = np.append(lcy, np.array(self.lower_concave_out_y))
self.lower_curve_x = lcx
self.lower_curve_y = lcy
self.lower_curve_x_shift = lcx
self.lower_curve_y_shift = lcy + self.shift
ucx = np.zeros(0)
ucy = np.zeros(0)
ucx = np.append(ucx, np.array(self.edge_straight_in_x))
ucx = np.append(ucx, np.array(self.upper_straight_in_x))
ucx = np.append(ucx, np.array(self.upper_convex_in_x)[::-1])
ucx = np.append(ucx, self.upper_arc_x)
ucx = np.append(ucx, np.array(self.upper_convex_out_x))
ucx = np.append(ucx, np.array(self.upper_straight_out_x))
ucx = np.append(ucx, np.array(self.edge_straight_out_x))
ucy = np.append(ucy, np.array(self.edge_straight_in_y))
ucy = np.append(ucy, np.array(self.upper_straight_in_y))
ucy = np.append(ucy, np.array(self.upper_convex_in_y)[::-1])
ucy = np.append(ucy, self.upper_arc_y)
ucy = np.append(ucy, np.array(self.upper_convex_out_y))
ucy = np.append(ucy, np.array(self.upper_straight_out_y))
ucy = np.append(ucy, np.array(self.edge_straight_out_y))
self.upper_curve_x = ucx
self.upper_curve_y = ucy
print(len(ucx),len(ucy))
x = np.linspace(ucx.min(), ucx.max(), self.num_output_points)
lcy_func = interp1d(self.lower_curve_x, self.lower_curve_y)
lcy_shift_func = interp1d(self.lower_curve_x_shift, self.lower_curve_y_shift)
ucy_func = interp1d(self.upper_curve_x, self.upper_curve_y)
self.lower_curve_x_interp = x
self.lower_curve_y_interp = lcy_func(x)
self.lower_curve_x_shift_interp = x
self.lower_curve_y_shift_interp = lcy_shift_func(x)
self.upper_curve_x_interp = x
self.upper_curve_y_interp = ucy_func(x)
# make pandas DataFrame to save contour
tmp = [x, self.lower_curve_y_shift_interp,
self.upper_curve_y_interp, self.lower_curve_y_interp]
self.dfc = pd.DataFrame(tmp, index = ["x", "lower curve1", "upper curve", "lower curve2"])
if(self.is_save_excel):
""" save contour in Excel file """
writer = pd.ExcelWriter("result/turbine_contour_" + self.name + ".xlsx")
self.dfc.T.to_excel(writer, "contour")
writer.save()
def render(self, format, kind=None, buffer=None):
# Variable aliases
bucket = self.bucket
df = self.dataframe
# Compute group name for HDF5 and NetCDF formats
# TODO: Optionally prefix with "realm" from "bucket.tdata"
group_name = bucket.title.short
if format == 'xlsx':
# http://pandas.pydata.org/pandas-docs/stable/io.html#io-excel-writer
# https://stackoverflow.com/questions/28058563/write-to-stringio-object-using-pandas-excelwriter
with pandas.ExcelWriter('temp.xlsx', engine='xlsxwriter') as excel_writer:
excel_writer.book.filename = buffer
df.to_excel(excel_writer, sheet_name=bucket.title.compact[:31], index=False)
elif format in ['hdf', 'hdf5', 'h5']:
# Create index from "time" column
df = dataframe_index_and_sort(df, 'time')
# http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables
t = tempfile.NamedTemporaryFile(suffix='.hdf5')
try:
df.to_hdf(t.name, group_name, format='table', data_columns=True, index=False)
buffer.write(t.read())
except Exception as ex:
return self.request.error_response(bucket, with_traceback=True)
elif format in ['nc', 'cdf']:
# Create index from "time" column
df = dataframe_index_and_sort(df, 'time')
# http://xarray.pydata.org/
# http://xarray.pydata.org/en/stable/io.html#netcdf
t = tempfile.NamedTemporaryFile(suffix='.nc')
try:
#df.to_xarray().to_netcdf(path=t.name, group=group_name)
#df.to_xarray().to_netcdf(path=t.name, format='NETCDF4', engine='h5netcdf', group=group_name)
df.to_xarray().to_netcdf(path=t.name, format='NETCDF4', engine='netcdf4', group=group_name)
buffer.write(t.read())
except Exception as ex:
return self.request.error_response(bucket, with_traceback=True)
elif format in ['dt', 'datatables']:
# https://datatables.net/
# Compute data_uri, forward "from" and "to" parameters
data_uri = get_data_uri(bucket, 'data.html')
# Render HTML snippet containing DataTable widget
page = DatatablesPage(data_uri=data_uri, bucket=bucket)
bucket.request.setHeader('Content-Type', 'text/html; charset=utf-8')
return renderElement(bucket.request, page)
def process(self):
fields = [
"Chrom", "Position", "Covmp", "Ref", "Var", "Cons", "Fasta",
"Qdepth", "Reads1", "Reads2", "Freq", "P-value",
"StrandFilter", "R1+", "R1-", "R2+", "R2-"
]
self.output_fasta = []
shift = int(self.shift.pop(0))
input_orig = self.input_orig.pop(0)
input_shifted = self.input_shifted.pop(0)
all_orig = pd.read_csv(input_orig, sep='\t', header=[0,1], na_values='-').fillna(-1)
all_shifted = pd.read_csv(input_shifted, sep='\t', header=[0,1], na_values='-').fillna(-1)
contents = {}
for sample_id in all_orig.columns.levels[0]:
fasta = ''
df_orig = all_orig[sample_id]
df_shifted = all_shifted[sample_id]
contents[sample_id] = pd.DataFrame(columns=df_orig.columns)
nrows = df_orig.index.size
for idx in df_orig.index:
shifted_idx = (idx+shift)%nrows
# Check the two are aligned. Take into account missing positions in one of them
if df_orig.loc[idx, 'Ref'] != df_shifted.loc[shifted_idx, 'Ref']:
if df_orig.loc[idx, 'Ref'] != -1 and df_shifted.loc[shifted_idx, 'Ref'] != -1:
print 'index =', idx, df_orig.loc[idx:idx+3, 'Ref'], df_shifted.loc[shifted_idx:shifted_idx+3, 'Ref']
raise Exception("Shifted and non-shifted summaries are not aligned")
if df_orig.loc[idx, 'Qdepth'] > df_shifted.loc[shifted_idx, 'Qdepth']:
contents[sample_id].loc[idx] = df_orig.loc[idx]
else:
contents[sample_id].loc[idx] = df_shifted.loc[shifted_idx]
contents[sample_id].loc[idx, 'Position'] = df_orig.loc[idx, 'Position']
if contents[sample_id].loc[idx, 'Fasta']>-1:
fasta += contents[sample_id].loc[idx, 'Fasta']
else:
contents[sample_id].loc[idx, 'Fasta'] = ''
output_fasta = sample_id+'.fasta'
self.output_fasta.append(output_fasta)
with open(output_fasta, 'w') as fh:
fh.write('>'+sample_id+'\n')
fh.write(textwrap.fill(fasta, width=60))
#THIS DOES NOT ALWAYS WORK: EXCEL FAILS TO READ OUTPUT FILE
#with pd.ExcelWriter(self.output_summary, engine='openpyxl') as writer:
# df.to_excel(writer, sheet_name=sample_id, index=False, columns=fields)
# writer.save()
all_samples = pd.concat(contents.values(), keys=contents.keys(), axis=1)
all_samples.dropna(axis=0, how='all', inplace=True)
ordered = all_samples.reindex(columns=fields, level=1)
ordered.to_csv(self.output_summary, sep='\t', index=False)
keyterm_clustering.py 文件源码
项目:contextual-advertising-deploy
作者: andreicnica
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def compute_suggested_adv_keyterms_dataset(relative_dataset_filename, min_members_per_cluster = 5):
filepath = "dataset/keyterm_clustering/" + relative_dataset_filename + ".json"
top_adv_clusters_filepath = "dataset/keyterm_clustering/top_adv_keyterm_clusters.dump"
## load dataset and embedding model
print "Loading Embedding model ..."
embedding_model = load_embedding_model(True)
vocabulary = embedding_model.vocab
df = None
top_adv_clusters = None
print "Loading datasets ..."
with open(top_adv_clusters_filepath) as fp:
top_adv_clusters = np.load(fp)
with open(filepath) as fp:
df = pd.read_json(fp)
## compute
result_dataset = []
print "Starting computation ..."
for index, row in df.iterrows():
url = row['url']
print "Processing clusters for URL: " + url + " ..."
clusters = row['clusters']
for cl_data in clusters:
if cl_data['len'] >= min_members_per_cluster:
suggested_keyterms = suggest_top_adv_keyterms(cl_data, top_adv_clusters, embedding_model)
entry = {
'url': url,
'cl_idx': cl_data['idx'],
'cl_center': cl_data['center'],
'cl_len': cl_data['len'],
'suggested_keyterms': suggested_keyterms
}
result_dataset.append(entry)
df_matching = pd.DataFrame.from_records(result_dataset)
writer = pd.ExcelWriter("dataset/keyterm_clustering/" + relative_dataset_filename + "_suggested_adv" + ".xlsx")
df_matching.to_excel(writer, "adv_matching")
writer.save()
return df_matching
def writeDiffsToXLSX(outFile, referenceFile, otherFiles, skiprows=1, interpolate=False,
years=None, startYear=0):
"""
Compute the differences between the data in a reference .CSV file and one or more other
.CSV files as (other - reference), optionally interpolating annual values between
timesteps, storing the results in a single .XLSX file with each difference matrix
on a separate worksheet, and with an index worksheet with links to the other worksheets.
See also :py:func:`writeDiffsToCSV` and :py:func:`writeDiffsToFile`.
:param outFile: (str) the name of the .XLSX file to create
:param referenceFile: (str) the name of a .CSV file containing reference results
:param otherFiles: (list of str) the names of other .CSV file for which to
compute differences.
:param skiprows: (int) should be 1 for GCAM files, to skip header info before column names
:param interpolate: (bool) if True, linearly interpolate annual values between timesteps
in all data files and compute the differences for all resulting years.
:param years: (iterable of 2 values coercible to int) the range of years to include in
results.
:param startYear: (int) the year at which to begin interpolation, if interpolate is True.
Defaults to the first year in `years`.
:return: none
"""
import pandas as pd
with pd.ExcelWriter(outFile, engine='xlsxwriter') as writer:
sheetNum = 1
_logger.debug("Reading reference file:", referenceFile)
refDF = readCsv(referenceFile, skiprows=skiprows, interpolate=interpolate,
years=years, startYear=startYear)
for otherFile in otherFiles:
otherFile = ensureCSV(otherFile) # add csv extension if needed
_logger.debug("Reading other file:", otherFile)
otherDF = readCsv(otherFile, skiprows=skiprows, interpolate=interpolate,
years=years, startYear=startYear)
sheetName = 'Diff%d' % sheetNum
sheetNum += 1
diff = computeDifference(refDF, otherDF)
diff.to_excel(writer, index=None, sheet_name=sheetName, startrow=2, startcol=0)
worksheet = writer.sheets[sheetName]
label = "[%s] minus [%s]" % (otherFile, referenceFile)
worksheet.write_string(0, 0, label)
startRow = diff.shape[0] + 4
worksheet.write_string(startRow, 0, otherFile)
startRow += 2
otherDF.reset_index(inplace=True)
otherDF.to_excel(writer, index=None, sheet_name=sheetName, startrow=startRow, startcol=0)
dropExtraCols(refDF, inplace=True)
_logger.debug("writing DF to excel file", outFile)
refDF.to_excel(writer, index=None, sheet_name='Reference', startrow=0, startcol=0)
def generate_report(result_dict, target_report_csv_path):
from six import StringIO
output_path = os.path.join(target_report_csv_path, result_dict["summary"]["strategy_name"])
try:
os.mkdir(output_path)
except:
pass
xlsx_writer = pd.ExcelWriter(os.path.join(output_path, "report.xlsx"), engine='xlsxwriter')
# summary.csv
csv_txt = StringIO()
summary = result_dict["summary"]
csv_txt.write(u"\n".join(sorted("{},{}".format(key, value) for key, value in six.iteritems(summary))))
df = pd.DataFrame(data=[{"val": val} for val in summary.values()], index=summary.keys()).sort_index()
df.to_excel(xlsx_writer, sheet_name="summary")
with open(os.path.join(output_path, "summary.csv"), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
for name in ["portfolio", "stock_account", "future_account",
"stock_positions", "future_positions", "trades"]:
try:
df = result_dict[name]
except KeyError:
continue
# replace all date in dataframe as string
if df.index.name == "date":
df = df.reset_index()
df["date"] = df["date"].apply(lambda x: x.strftime("%Y-%m-%d"))
df = df.set_index("date")
csv_txt = StringIO()
csv_txt.write(df.to_csv(encoding='utf-8'))
df.to_excel(xlsx_writer, sheet_name=name)
with open(os.path.join(output_path, "{}.csv".format(name)), 'w') as csvfile:
csvfile.write(csv_txt.getvalue())
# report.xls <--- ??sheet???
xlsx_writer.save()
def get_context_data(self, **kwargs):
context = {}
params_data = self.request.GET
subject = Subject.objects.get(id=params_data['subject_id'])
context['title'] = _('Interaction Data')
context['subject_name'] = subject.name
if params_data['topic'] == _("All"):
context['topic_name'] = params_data['topic']
else:
context['topic_name'] = Topic.objects.get(id=int(params_data['topic'])).name
context['init_date'] = params_data['init_date']
context['end_date'] = params_data['end_date']
context['subject'] = subject
#I used getlist method so it can get more than one tag and one resource class_name
resources = params_data.getlist('resource')
tags = params_data.getlist('tag')
self.from_mural = params_data['from_mural']
self.from_messages = params_data['from_messages']
context['data'], context['header'] = self.get_mural_data(subject, params_data['topic'], params_data['init_date'], params_data['end_date'],
resources, tags )
#this is to save the csv for further download
df = pd.DataFrame.from_dict(context['data'], orient='index')
df.columns = context['header']
#so it does not exist more than one report CSV available for that user to download
if ReportCSV.objects.filter(user= self.request.user).count() > 0:
report = ReportCSV.objects.get(user=self.request.user)
report.delete()
report = ReportCSV(user= self.request.user, csv_data = df.to_csv())
report.save()
#for excel files
if ReportXLS.objects.filter(user= self.request.user).count() > 0:
report = ReportXLS.objects.get(user=self.request.user)
report.delete()
folder_path = join(settings.MEDIA_ROOT, 'files')
#check if the folder already exists
if not os.path.isdir(folder_path):
os.makedirs(folder_path)
path = join(settings.MEDIA_ROOT, 'files' , 'report'+str(self.request.user.id)+'.xls')
writer = pd.ExcelWriter(path)
df.to_excel(writer, sheet_name='first_sheet')
writer.save()
report = ReportXLS(user= self.request.user )
report.xls_data.name = path
report.save()
return context
def create_excel(transactions, name_file):
"""
Create an Excel file using transactions
This function is responsible for creating an Excel file using all the
transactions we have until today.
Creates a table order by date of transaction and highlights the positive
and negative transactions.
Args:
name_file (string): name of the excel file.
transactions (list): list of transactions
"""
df = pd.DataFrame({'Data': transactions.keys(),
'Montante': transactions.values()})
df['Data'] = pd.to_datetime(df.Data, dayfirst=[True])
df = df.sort_values(['Data', 'Montante'], ascending=[True, False])
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('{}.xlsx'.format(name_file), engine='xlsxwriter',
datetime_format='dd-mm-yyyy')
df.to_excel(writer, sheet_name='Transactions', index=False)
# Get the xlsxwriter objects from the dataframe writer object.
workbook = writer.book
worksheet = writer.sheets['Transactions']
format_mont = workbook.add_format({'num_format': u'#,##0.00 \u20ac'})
format_red = workbook.add_format({'bg_color': '#FFC7CE',
'font_color': '#9C0006'})
format_green = workbook.add_format({'bg_color': '#C6EFCE',
'font_color': '#006100'})
worksheet.set_column('B:B', 10, format_mont)
worksheet.set_column('A:A', 13, None)
worksheet.conditional_format('B2:B{}'.format(len(df.index) + 1), {'type': 'cell',
'criteria': '>',
'value': 0,
'format': format_green})
worksheet.conditional_format('B1:B{}'.format(len(df.index) + 1), {'type': 'cell',
'criteria': '<',
'value': 0,
'format': format_red})
writer.save()