def parsimony_grouping(g, peps):
''' Group peptides to proteins using the rule of parsimony
Inputs:
g: an undirected graph with peptide <-> protein as edges
peps: the set of peptide sequences, nodes not listed in the peptide set
are protein IDs.
'''
not_peps = set(g.nodes()) - set(peps)
prot_groups = dict()
for cc in nx.connected_component_subgraphs(g):
in_group_peptides = set(cc.nodes()) - not_peps
in_group_proteins = not_peps.intersection(cc.nodes())
if len(in_group_proteins) == 1:
prot_groups[in_group_proteins.pop()] = in_group_peptides
elif len(in_group_proteins) > 1:
reported = set()
while len(in_group_proteins - reported) > 0:
candidate_proteins = sorted(in_group_proteins - reported,
key=lambda p: (
len(set(cc[p].keys()) - reported), p),
reverse=True)
p = candidate_proteins[0]
current_peps = set(cc[p].keys())
plabel = [p]
for i in range(1, len(candidate_proteins)):
_p = candidate_proteins[i]
_peps = set(cc[_p].keys())
if _peps == current_peps:
plabel.append(_p)
if len(_peps - current_peps) == 0:
reported.add(_p)
plabel = ';'.join(sorted(plabel))
if len(current_peps - reported) > 0:
prot_groups[plabel] = current_peps
reported = reported.union(current_peps)
reported.add(p)
return prot_groups
评论列表
文章目录