def filter_data(self, mind=0.2, recalculate=True):
    """
    Remove samples whose missing-data proportion exceeds ``mind``.

    The per-sample missing-data values come from ``self._calculate_mind()``;
    every sample whose value is strictly greater than ``mind`` is dropped
    from the ``"calls"`` entry of each SNP. ``self.data`` itself is left
    untouched: a new mapping with fresh per-SNP entries is returned.

    TODO: re-write with Pandas.

    :param mind: missing-data threshold; ``None`` disables filtering and
        returns ``self.data`` and ``self.attributes`` unchanged.
    :param recalculate: when true, pass the filtered data through
        ``SNPModule(...).get_data(threshold=None)`` before returning.
    :return: tuple ``(data, attributes)``.
    """
    if mind is None:
        stamp("Returning data without filtering.")
        return self.data, self.attributes

    stamp("Filtering samples with missing data >", mind)
    stamp("Missing data calculated over", len(self.data), "SNPs")

    # presumably a pandas Series indexed by sample position -- confirm
    # against _calculate_mind
    mind_prop = self._calculate_mind()
    to_remove = mind_prop[mind_prop > mind].index.tolist()
    # Built once so the per-call membership test below is O(1) rather than
    # a list scan for every call of every SNP.
    removed_indices = set(to_remove)

    filtered_data = {}
    for snp, data in self.data.items():
        kept_calls = [
            snp_call
            for i, snp_call in self._iterate_call_indices(data["calls"])
            if i not in removed_indices
        ]
        # Copy the SNP entry instead of assigning into it: writing to
        # data["calls"] directly would overwrite the calls held in self.data.
        entry = dict(data)
        entry["calls"] = kept_calls
        filtered_data[snp] = entry

    attributes = self._adjust_attributes(self.attributes, mind, to_remove)

    # NOTE(review): the denominator is read from the *adjusted* attributes;
    # if _adjust_attributes lowers "sample_size", this percentage is relative
    # to the remaining samples rather than the original total -- confirm.
    percent_removed = format(
        (len(to_remove) / attributes["sample_size"]) * 100, ".2f"
    )
    stamp("Removed {r} samples out of {t} samples ({p}%)"
          .format(r=len(to_remove), t=attributes["sample_size"], p=percent_removed))

    # Recalculating SNP parameters:
    if recalculate:
        stamp("Recalculating MAF, CALL RATE and HWE for SNPs")
        marker = SNPModule(filtered_data, attributes)
        filtered_data, attributes = marker.get_data(threshold=None)

    return filtered_data, attributes
评论列表
文章目录