lfvmerge.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:pypers 作者: frankosan 项目源码 文件源码
def process(self):
        """
        Merge the original and the shifted summary tables, sample by sample.

        One entry is popped from each of ``self.shift``, ``self.input_orig``
        and ``self.input_shifted``.  Both inputs are tab-separated tables with
        a two-row header (sample id on the first row, field name on the
        second); ``-`` cells are read as missing and replaced by ``-1``.

        For every sample and every row ``idx`` of the original table, the
        counterpart row in the shifted table is ``(idx + shift) % nrows``
        (row labels are the default 0..nrows-1 produced by ``read_csv``).
        The original row is kept when its ``Qdepth`` is strictly greater;
        otherwise the shifted row is taken and its ``Position`` is overwritten
        with the original one.  The ``Fasta`` values of the selected rows are
        concatenated and written to ``<sample_id>.fasta`` in the current
        working directory, wrapped at 60 characters; a missing ``Fasta`` is
        stored as an empty string in the summary.

        Side effects:
            * ``self.output_fasta`` is reset to the list of FASTA file names
              written (one per sample).
            * The merged table of all samples is written, tab-separated and
              without the row index, to ``self.output_summary``.

        Raises:
            Exception: if the ``Ref`` values of a pair of matching rows differ
                and neither of them is missing (``-1``).
        """
        fields = [
                    "Chrom",  "Position", "Covmp", "Ref", "Var", "Cons", "Fasta",
                    "Qdepth", "Reads1", "Reads2", "Freq", "P-value",
                    "StrandFilter", "R1+", "R1-", "R2+", "R2-"
                 ]
        self.output_fasta = []
        shift = int(self.shift.pop(0))
        input_orig = self.input_orig.pop(0)
        input_shifted = self.input_shifted.pop(0)

        all_orig    = pd.read_csv(input_orig,    sep='\t', header=[0,1], na_values='-').fillna(-1)
        all_shifted = pd.read_csv(input_shifted, sep='\t', header=[0,1], na_values='-').fillna(-1)

        # Parallel lists (instead of a dict) keep the sample order deterministic.
        sample_ids = []
        merged_frames = []
        for sample_id in all_orig.columns.levels[0]:
            df_orig    = all_orig[sample_id]
            df_shifted = all_shifted[sample_id]
            columns = df_orig.columns
            nrows = df_orig.index.size

            # Rows and bases are accumulated in lists and assembled once at the
            # end: appending to a DataFrame / string row by row is quadratic.
            merged_rows = []
            fasta_parts = []
            for idx in df_orig.index:
                shifted_idx = (idx + shift) % nrows
                orig_row = df_orig.loc[idx]
                shifted_row = df_shifted.loc[shifted_idx]

                # Check the two are aligned. Take into account missing positions in one of them
                orig_ref = orig_row['Ref']
                shifted_ref = shifted_row['Ref']
                if orig_ref != shifted_ref and orig_ref != -1 and shifted_ref != -1:
                    # A single pre-joined string prints identically on
                    # Python 2 and Python 3.
                    print(' '.join(str(part) for part in (
                        'index =', idx,
                        df_orig.loc[idx:idx+3, 'Ref'],
                        df_shifted.loc[shifted_idx:shifted_idx+3, 'Ref'],
                    )))
                    raise Exception("Shifted and non-shifted summaries are not aligned")

                if orig_row['Qdepth'] > shifted_row['Qdepth']:
                    merged = orig_row.astype(object)
                else:
                    # Align on the original's columns, then restore the
                    # original coordinate.
                    merged = shifted_row.reindex(columns).astype(object)
                    merged['Position'] = orig_row['Position']

                base = merged['Fasta']
                # Missing values were filled with -1 above.  Test for that
                # explicitly: ordering a str against an int is a TypeError on
                # Python 3.
                if pd.isnull(base) or base == -1:
                    merged['Fasta'] = ''
                else:
                    fasta_parts.append(base)
                merged_rows.append(merged.tolist())

            # dtype=object keeps every cell exactly as selected (no per-column
            # type inference).
            sample_ids.append(sample_id)
            merged_frames.append(
                pd.DataFrame(merged_rows, index=df_orig.index, columns=columns, dtype=object)
            )

            output_fasta = sample_id+'.fasta'
            self.output_fasta.append(output_fasta)
            with open(output_fasta, 'w') as fh:
                fh.write('>'+sample_id+'\n')
                fh.write(textwrap.fill(''.join(fasta_parts), width=60))

            #THIS DOES NOT ALWAYS WORK: EXCEL FAILS TO READ OUTPUT FILE
            #with pd.ExcelWriter(self.output_summary, engine='openpyxl') as writer:
            #    df.to_excel(writer, sheet_name=sample_id, index=False, columns=fields)
            #    writer.save()

        all_samples = pd.concat(merged_frames, keys=sample_ids, axis=1)
        all_samples.dropna(axis=0, how='all', inplace=True)
        ordered = all_samples.reindex(columns=fields, level=1)
        ordered.to_csv(self.output_summary, sep='\t', index=False)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号