rev |
line source |
BurkovBA@0
|
1 #!usr/bin/python |
BurkovBA@0
|
2 |
BurkovBA@0
|
3 import sys |
BurkovBA@0
|
4 |
bnagaev@175
|
5 import alignment |
BurkovBA@0
|
6 import sequence |
BurkovBA@0
|
7 import monomer |
bnagaev@116
|
8 import config |
bnagaev@121
|
9 from graph import Graph |
bnagaev@146
|
10 from Bio.PDB import Superimposer |
bnagaev@149
|
11 from tempfile import NamedTemporaryFile |
bnagaev@149
|
12 import os |
bnagaev@151
|
13 from fasta import save_fasta |
BurkovBA@0
|
14 |
BurkovBA@0
|
15 class Block(object): |
bnagaev@147
|
16 """ Block of alignment |
bnagaev@147
|
17 |
BurkovBA@0
|
18 Mandatory data: |
bnagaev@175
|
19 * self.alignment -- alignment object, which the block belongs to |
BurkovBA@1
|
20 * self.sequences - set of sequence objects that contain monomers |
BurkovBA@0
|
21 and/or gaps, that constitute the block |
bnagaev@175
|
22 * self.positions -- list of positions of the alignment.body that |
bnagaev@172
|
23 are included in the block; position[i+1] is always to the right from position[i] |
bnagaev@116
|
24 |
bnagaev@132
|
25 Don't change self.sequences -- it may be a link to other block.sequences |
bnagaev@132
|
26 |
BurkovBA@0
|
27 How to create a new block: |
bnagaev@175
|
28 >>> import alignment |
BurkovBA@0
|
29 >>> import block |
bnagaev@175
|
30 >>> proj = alignment.Alignment(open("test.fasta")) |
bnagaev@114
|
31 >>> block1 = block.Block(proj) |
BurkovBA@0
|
32 """ |
BurkovBA@0
|
33 |
bnagaev@175
|
34 def __init__(self, alignment, sequences=None, positions=None): |
bnagaev@175
|
35 """ Builds new block from alignment |
bnagaev@147
|
36 |
bnagaev@112
|
37 if sequences==None, all sequences are used |
bnagaev@112
|
38 if positions==None, all positions are used |
bnagaev@112
|
39 """ |
bnagaev@112
|
40 if sequences == None: |
bnagaev@175
|
41 sequences = set(alignment.sequences) # copy |
bnagaev@112
|
42 if positions == None: |
bnagaev@175
|
43 positions = range(len(alignment)) |
bnagaev@175
|
44 self.alignment = alignment |
BurkovBA@73
|
45 self.sequences = sequences |
BurkovBA@73
|
46 self.positions = positions |
bnagaev@146
|
47 |
bnagaev@151
|
48 def save_fasta(self, out_file, long_line=70, gap='-'): |
bnagaev@148
|
49 """ Saves alignment to given file in fasta-format |
bnagaev@148
|
50 |
BurkovBA@0
|
51 No changes in the names, descriptions or order of the sequences |
BurkovBA@0
|
52 are made. |
BurkovBA@0
|
53 """ |
BurkovBA@0
|
54 for sequence in self.sequences: |
bnagaev@175
|
55 alignment_monomers = self.alignment.body[sequence] |
bnagaev@115
|
56 block_monomers = [alignment_monomers[i] for i in self.positions] |
bnagaev@113
|
57 string = ''.join([m.type.code1 if m else '-' for m in block_monomers]) |
bnagaev@151
|
58 save_fasta(out_file, string, sequence.name, sequence.description, long_line) |
bnagaev@116
|
59 |
bnagaev@127
|
60 def geometrical_cores(self, max_delta=config.delta, |
bnagaev@129
|
61 timeout=config.timeout, minsize=config.minsize, |
bnagaev@129
|
62 ac_new_atoms=config.ac_new_atoms, |
bnagaev@129
|
63 ac_count=config.ac_count): |
bnagaev@150
|
64 """ Returns length-sorted list of blocks, representing GCs |
bnagaev@126
|
65 |
bnagaev@129
|
66 max_delta -- threshold of distance spreading |
bnagaev@129
|
67 timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) |
bnagaev@129
|
68 minsize -- min size of each core |
bnagaev@129
|
69 ac_new_atoms -- min part or new atoms in new alternative core |
bnagaev@129
|
70 current GC is compared with each of already selected GCs |
bnagaev@129
|
71 if difference is less then ac_new_atoms, current GC is skipped |
bnagaev@129
|
72 difference = part of new atoms in current core |
bnagaev@129
|
73 ac_count -- max number of cores (including main core) |
bnagaev@130
|
74 -1 means infinity |
bnagaev@120
|
75 If more than one pdb chain for some sequence provided, consider all of them |
bnagaev@120
|
76 cost is calculated as 1 / (delta + 1) |
bnagaev@120
|
77 delta in [0, +inf) => cost in (0, 1] |
bnagaev@116
|
78 """ |
bnagaev@117
|
79 nodes = self.positions |
bnagaev@117
|
80 lines = {} |
bnagaev@116
|
81 for i in self.positions: |
bnagaev@116
|
82 for j in self.positions: |
bnagaev@117
|
83 if i < j: |
bnagaev@120
|
84 distances = [] |
bnagaev@120
|
85 for sequence in self.sequences: |
bnagaev@120
|
86 for chain in sequence.pdb_chains: |
bnagaev@175
|
87 m1 = self.alignment.body[sequence][i] |
bnagaev@175
|
88 m2 = self.alignment.body[sequence][j] |
bnagaev@122
|
89 if m1 and m2: |
bnagaev@153
|
90 r1 = sequence.pdb_residues[chain][m1] |
bnagaev@153
|
91 r2 = sequence.pdb_residues[chain][m2] |
bnagaev@153
|
92 ca1 = r1['CA'] |
bnagaev@153
|
93 ca2 = r2['CA'] |
bnagaev@122
|
94 d = ca1 - ca2 # Bio.PDB feature |
bnagaev@122
|
95 distances.append(d) |
bnagaev@122
|
96 if len(distances) >= 2: |
bnagaev@122
|
97 delta = max(distances) - min(distances) |
bnagaev@122
|
98 if delta <= max_delta: |
bnagaev@122
|
99 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta) |
bnagaev@120
|
100 graph = Graph(nodes, lines) |
bnagaev@129
|
101 cliques = graph.cliques(timeout=timeout, minsize=minsize) |
bnagaev@129
|
102 GCs = [] |
bnagaev@129
|
103 for clique in cliques: |
bnagaev@129
|
104 for GC in GCs: |
bnagaev@129
|
105 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique): |
bnagaev@129
|
106 break |
bnagaev@129
|
107 else: |
bnagaev@175
|
108 GCs.append(Block(self.alignment, self.sequences, clique)) |
bnagaev@130
|
109 if ac_count != -1 and len(GCs) >= ac_count: |
bnagaev@129
|
110 break |
bnagaev@129
|
111 return GCs |
bnagaev@123
|
112 |
bnagaev@137
|
113 def xstring(self, x='X', gap='-'): |
bnagaev@148
|
114 """ Returns string consisting of gap chars and chars x at self.positions |
bnagaev@148
|
115 |
bnagaev@175
|
116 Length of returning string = length of alignment |
bnagaev@123
|
117 """ |
bnagaev@175
|
118 monomers = [False] * len(self.alignment) |
bnagaev@123
|
119 for i in self.positions: |
bnagaev@123
|
120 monomers[i] = True |
bnagaev@137
|
121 return ''.join([x if m else gap for m in monomers]) |
bnagaev@134
|
122 |
bnagaev@151
|
123 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70): |
bnagaev@148
|
124 """ Save xstring and name in fasta format """ |
bnagaev@151
|
125 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line) |
bnagaev@146
|
126 |
bnagaev@146
|
127 def monomers(self, sequence): |
bnagaev@146
|
128 """ Iterates monomers of this sequence from this block """ |
bnagaev@175
|
129 alignment_sequence = self.alignment.body[sequence] |
bnagaev@146
|
130 return (alignment_sequence[i] for i in self.positions) |
bnagaev@146
|
131 |
bnagaev@146
|
132 def ca_atoms(self, sequence, pdb_chain): |
bnagaev@146
|
133 """ Iterates Ca-atom of monomers of this sequence from this block """ |
bnagaev@153
|
134 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers()) |
bnagaev@146
|
135 |
bnagaev@146
|
136 def sequences_chains(self): |
bnagaev@146
|
137 """ Iterates pairs (sequence, chain) """ |
bnagaev@175
|
138 for sequence in self.alignment.sequences: |
bnagaev@151
|
139 if sequence in self.sequences: |
bnagaev@151
|
140 for chain in sequence.pdb_chains: |
bnagaev@151
|
141 yield (sequence, chain) |
bnagaev@146
|
142 |
bnagaev@146
|
143 def superimpose(self): |
bnagaev@146
|
144 """ Superimpose all pdb_chains in this block """ |
bnagaev@149
|
145 sequences_chains = list(self.sequences_chains()) |
bnagaev@149
|
146 if len(sequences_chains) >= 1: |
bnagaev@146
|
147 sup = Superimposer() |
bnagaev@146
|
148 fixed_sequence, fixed_chain = sequences_chains.pop() |
bnagaev@146
|
149 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain) |
bnagaev@146
|
150 for sequence, chain in sequences_chains: |
bnagaev@146
|
151 moving_atoms = self.ca_atoms(sequence, chain) |
bnagaev@146
|
152 sup.set_atoms(fixed_atoms, moving_atoms) |
bnagaev@146
|
153 # Apply rotation/translation to the moving atoms |
bnagaev@146
|
154 sup.apply(moving_atoms) |
bnagaev@146
|
155 |
bnagaev@146
|
156 def pdb_save(self, out_file): |
bnagaev@149
|
157 """ Save all sequences |
bnagaev@149
|
158 |
bnagaev@149
|
159 Returns {(sequence, chain): CHAIN} |
bnagaev@149
|
160 CHAIN is chain letter in new file |
bnagaev@149
|
161 """ |
bnagaev@149
|
162 tmp_file = NamedTemporaryFile(delete=False) |
bnagaev@149
|
163 tmp_file.close() |
bnagaev@149
|
164 |
bnagaev@149
|
165 for sequence, chain in self.sequences_chains(): |
bnagaev@149
|
166 sequence.pdb_save(tmp_file.name, chain) |
bnagaev@149
|
167 # TODO: read from tmp_file.name |
bnagaev@149
|
168 # change CHAIN |
bnagaev@149
|
169 # add to out_file |
bnagaev@149
|
170 |
bnagaev@149
|
171 os.unlink(NamedTemporaryFile) |
bnagaev@146
|
172 |