def construct_pssm(cds, length=90, out_path="", prob=None):
    """
    Construct a Position Specific Scoring Matrix of log-likelihood values.

    For each of the first `length` positions, the observed frequency of each
    base among the retained sequences is divided by its a priori probability
    and the natural-log function `log` (taken from the enclosing module) is
    applied to the ratio.

    cds      : indexable whose first element is a mapping of gene identifier
               to sequence; only that first element is used
    length   : size of the analyzed region from the start, in bp. Sequences
               shorter than this are discarded; longer ones contribute only
               their first `length` bases
    out_path : if non-empty, the matrix is also written to this path as CSV
               (header row of 1-based positions, then one row per base with
               values formatted to two decimals)
    prob     : a dict of bases ("a", "t", "g", "c") with a priori expected
               probabilities; defaults to 0.25 for each base

    Returns a dict mapping each base to a list of `length` scores. A base that
    never occurs at a position scores float("-inf").

    Raises ValueError if no sequence is at least `length` long, and KeyError
    if a sequence contains a symbol other than a/t/g/c (in either case).
    """
    sequences = cds[0]
    if not prob:
        prob = {"a": 0.25, "t": 0.25, "g": 0.25, "c": 0.25}
    bases = ["a", "t", "g", "c"]
    counts = {b: [0] * length for b in bases}

    # Kept as a float so the frequency division below is never integer division.
    tot_gene = 0.0
    for gene in sequences:
        seq = sequences[gene]
        if len(seq) < length:
            continue
        tot_gene += 1
        for i in range(length):
            # Normalise case so upper-case sequences are counted too.
            counts[seq[i].lower()][i] += 1

    if not tot_gene:
        raise ValueError(
            "no sequence of at least %d bp available to build the PSSM" % length
        )

    # log(0) is undefined; use the mathematical limit for unobserved bases
    # instead of letting the log call fail.
    neg_inf = float("-inf")
    m = {}
    for b in bases:
        m[b] = [
            log((v / tot_gene) / prob[b]) if v else neg_inf
            for v in counts[b]
        ]

    if out_path:
        # Context manager guarantees the file is closed even if a write fails.
        with open(out_path, "w") as h:
            h.write("," + ",".join(str(i) for i in range(1, length + 1)) + "\n")
            for b in bases:
                h.write(b + "," + ",".join("%.2f" % v for v in m[b]) + "\n")
    return m
评论列表
文章目录