rev |
line source |
me@261
|
1 import sys |
me@261
|
2 import os |
me@262
|
3 import os.path |
me@261
|
4 from tempfile import NamedTemporaryFile |
me@262
|
5 import urllib2 |
me@261
|
6 |
me@261
|
7 import config |
me@284
|
8 import fasta |
me@261
|
9 from graph import Graph |
me@262
|
10 from Bio.PDB.DSSP import make_dssp_dict |
me@260
|
11 import data.codes |
me@260
|
12 |
me@260
|
13 class MonomerType(object): |
me@260
|
14 """Class of monomer types. |
me@260
|
15 |
me@260
|
16 Each MonomerType object represents a known monomer type, e.g. Valine, |
me@260
|
17 and is referenced to by each instance of monomer in a given sequence. |
me@260
|
18 |
me@260
|
19 - `name`: full name of monomer type |
me@260
|
20 - `code1`: one-letter code |
me@260
|
21 - `code3`: three-letter code |
me@260
|
22 - `is_modified`: either of True or False |
me@260
|
23 |
me@260
|
24 class atributes: |
me@260
|
25 |
me@260
|
26 - `by_code1`: a mapping from one-letter code to MonomerType object |
me@260
|
27 - `by_code3`: a mapping from three-letter code to MonomerType object |
me@260
|
28 - `by_name`: a mapping from monomer name to MonomerType object |
me@260
|
29 - `instance_type`: class of Monomer objects to use when creating new |
me@260
|
30 objects; this must be redefined in descendent classes |
me@260
|
31 |
me@260
|
32 All of the class attributes MUST be redefined when subclassing. |
me@260
|
33 """ |
me@260
|
34 |
me@260
|
35 by_code1 = {} |
me@260
|
36 by_code3 = {} |
me@260
|
37 by_name = {} |
me@260
|
38 instance_type = None |
me@260
|
39 |
me@260
|
40 def __init__(self, name="", code1="", code3="", is_modified=False): |
me@260
|
41 self.name = name.capitalize() |
me@260
|
42 self.code1 = code1.upper() |
me@260
|
43 self.code3 = code3.upper() |
me@260
|
44 self.is_modified = bool(is_modified) |
me@260
|
45 if not is_modified: |
me@260
|
46 self.by_code1[self.code1] = self |
me@260
|
47 self.by_code3[code3] = self |
me@260
|
48 self.by_name[name] = self |
me@260
|
49 # We duplicate distinguished long names into MonomerType itself, |
me@260
|
50 # so that we can use MonomerType.from_code3 to create the relevant |
me@260
|
51 # type of monomer. |
me@260
|
52 MonomerType.by_code3[code3] = self |
me@260
|
53 MonomerType.by_name[name] = self |
me@260
|
54 |
me@260
|
55 @classmethod |
me@260
|
56 def _initialize(cls, type_letter, codes=data.codes.codes): |
me@260
|
57 """Create all relevant instances of MonomerType. |
me@260
|
58 |
me@260
|
59 `type_letter` is either of: |
me@260
|
60 |
me@260
|
61 - 'p' for protein |
me@260
|
62 - 'd' for DNA |
me@260
|
63 - 'r' for RNA |
me@260
|
64 |
me@260
|
65 `codes` is a table of monomer codes |
me@260
|
66 """ |
me@260
|
67 for type, code1, is_modified, code3, name in codes: |
me@260
|
68 if type == type_letter: |
me@260
|
69 cls(name, code1, code3, is_modified) |
me@260
|
70 |
me@260
|
71 @classmethod |
me@260
|
72 def from_code1(cls, code1): |
me@260
|
73 """Return monomer type by one-letter code.""" |
me@260
|
74 return cls.by_code1[code1.upper()] |
me@260
|
75 |
me@260
|
76 @classmethod |
me@260
|
77 def from_code3(cls, code3): |
me@260
|
78 """Return monomer type by three-letter code.""" |
me@260
|
79 return cls.by_code3[code3.upper()] |
me@260
|
80 |
me@260
|
81 @classmethod |
me@260
|
82 def from_name(cls, name): |
me@260
|
83 """Return monomer type by name.""" |
me@260
|
84 return cls.by_name[name.capitalize()] |
me@260
|
85 |
me@260
|
86 def instance(self): |
me@260
|
87 """Create a new monomer of given type.""" |
me@260
|
88 return self.instance_type(self) |
me@260
|
89 |
me@260
|
90 def __eq__(self, other): |
me@260
|
91 if hasattr(other, "type"): |
me@260
|
92 return self is other.type |
me@260
|
93 return self is other |
me@260
|
94 |
me@260
|
95 class Monomer(object): |
me@260
|
96 """Monomer object. |
me@260
|
97 |
me@260
|
98 attributes: |
me@260
|
99 |
me@260
|
100 - `type`: type of monomer (a MonomerType object) |
me@260
|
101 |
me@282
|
102 class attributes: |
me@282
|
103 |
me@282
|
104 - `monomer_type`: either MonomerType or one of it's subclasses, it is used |
me@282
|
105 when creating new monomers. It SHOULD be redefined when subclassing |
me@282
|
106 Monomer. |
me@260
|
107 """ |
me@260
|
108 monomer_type = MonomerType |
me@260
|
109 |
me@260
|
110 def __init__(self, type): |
me@260
|
111 self.type = type |
me@260
|
112 |
me@260
|
113 @classmethod |
me@260
|
114 def from_code1(cls, code1): |
me@260
|
115 return cls(cls.monomer_type.by_code1[code1.upper()]) |
me@260
|
116 |
me@260
|
117 @classmethod |
me@260
|
118 def from_code3(cls, code3): |
me@260
|
119 return cls(cls.monomer_type.by_code3[code3.upper()]) |
me@260
|
120 |
me@260
|
121 @classmethod |
me@260
|
122 def from_name(cls, name): |
me@260
|
123 return cls(cls.monomer_type.by_name[name.capitalize()]) |
me@260
|
124 |
me@260
|
125 def __eq__(self, other): |
me@260
|
126 if hasattr(other, "type"): |
me@260
|
127 return self.type is other.type |
me@260
|
128 return self.type is other |
bnagaev@239
|
129 |
bnagaev@239
|
130 class Sequence(list): |
me@274
|
131 """Sequence of Monomers. |
bnagaev@243
|
132 |
me@274
|
133 This behaves like list of monomer objects. In addition to standard list |
me@274
|
134 behaviour, Sequence has the following attributes: |
me@270
|
135 |
me@274
|
136 * name -- str with the name of the sequence |
me@274
|
137 * description -- str with description of the sequence |
me@274
|
138 * source -- str denoting source of the sequence |
me@266
|
139 |
me@274
|
140 Any of them may be empty (i.e. hold empty string) |
me@275
|
141 |
me@275
|
142 Class attributes: |
me@282
|
143 |
me@275
|
144 * monomer_type -- type of monomers in sequence, must be redefined when |
me@275
|
145 subclassing |
me@274
|
146 """ |
me@270
|
147 |
me@275
|
148 monomer_type = Monomer |
me@270
|
149 |
me@275
|
150 name = '' |
me@275
|
151 description = '' |
me@275
|
152 source = '' |
me@275
|
153 |
me@275
|
154 def __init__(self, sequence=[], name=None, description=None, source=None): |
me@275
|
155 super(Sequence, self).__init__(sequence) |
me@275
|
156 if hasattr(sequence, 'name'): |
me@275
|
157 vars(self).update(vars(sequence)) |
me@275
|
158 if name: |
me@275
|
159 self.name = name |
me@275
|
160 if description: |
me@275
|
161 self.description = description |
me@275
|
162 if source: |
me@275
|
163 self.source = source |
me@270
|
164 |
me@262
|
165 def __str__(self): |
me@275
|
166 """Returns sequence in one-letter code.""" |
me@275
|
167 return ''.join(monomer.code1 for monomer in self) |
me@270
|
168 |
me@273
|
169 @classmethod |
me@273
|
170 def from_string(cls, string, name='', description=''): |
me@273
|
171 """Create sequences from string of one-letter codes.""" |
me@273
|
172 monomer = cls.monomer_type.from_code1 |
me@273
|
173 monomers = [monomer(letter) for letter in string] |
me@273
|
174 return cls(monomers, name, description) |
me@262
|
175 |
me@284
|
176 @classmethod |
me@284
|
177 def from_fasta(cls, file): |
me@284
|
178 """Read sequence from FASTA file. |
me@286
|
179 |
me@284
|
180 File must contain exactly one sequence. |
me@284
|
181 """ |
me@284
|
182 sequences = fasta.parse_file(file) |
me@284
|
183 assert len(sequences) == 1 |
me@287
|
184 name, description = sequences.keys()[0] |
me@284
|
185 return cls(sequences[header], name, description, file.name) |
me@284
|
186 |
me@295
|
187 class Alignment(object): |
me@295
|
188 """Alignment. It is a list of Columns.""" |
bnagaev@249
|
189 |
me@287
|
190 sequence_type = Sequence |
me@289
|
191 """Type of sequences in alignment. SHOULD be redefined when subclassing.""" |
me@288
|
192 |
me@289
|
193 sequences = None |
me@289
|
194 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!""" |
bnagaev@249
|
195 |
me@287
|
196 def __init__(self): |
me@287
|
197 """Initialize empty alignment.""" |
me@287
|
198 super(Alignment, self).__init__() |
me@287
|
199 self.sequences = [] |
me@295
|
200 self.columns = [] |
me@282
|
201 |
me@299
|
202 # Alignment modification methods |
me@299
|
203 # ============================== |
me@299
|
204 |
me@294
|
205 def append_sequence(self, sequence): |
me@294
|
206 """Add sequence to alignment. |
me@294
|
207 |
me@294
|
208 If sequence is too short, pad it with gaps on the right. |
me@294
|
209 """ |
me@294
|
210 self.sequences.append(sequence) |
me@294
|
211 for i, monomer in enumerate(sequence): |
me@302
|
212 self.column_at(i)[sequence] = monomer |
me@294
|
213 |
me@294
|
214 def append_gapped_line(self, line, name='', description='', source=''): |
me@287
|
215 """Add row from a line of one-letter codes and gaps.""" |
me@287
|
216 Sequence = cls.sequence_type |
me@287
|
217 not_gap = lambda (i, char): char != "-" |
me@287
|
218 no_gaps = line.replace("-", "") |
me@287
|
219 sequence = Sequence(no_gaps, name, description, source) |
me@303
|
220 # The following line has some simple magic: |
me@303
|
221 # 1. attach natural numbers to monomers |
me@303
|
222 # 2. delete gaps |
me@303
|
223 # 3. attach numbers again |
me@303
|
224 # This way we have a pair of numbers attached to monomer: |
me@303
|
225 # - it's position in alignment (the first attached number, j) |
me@303
|
226 # - it's position in sequence (the second attached number, i) |
me@287
|
227 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))): |
me@302
|
228 self.column_at(j)[seq] = sequence[i] |
me@287
|
229 self.sequences.append(sequence) |
me@287
|
230 |
me@302
|
231 def column_at(self, n): |
me@302
|
232 """Return column by index. Create required new columns if required. |
me@302
|
233 |
me@302
|
234 Do NOT use this method, unless you are sure it is what you want. |
me@302
|
235 """ |
me@302
|
236 for i in range(len(self.columns), n + 1): |
me@302
|
237 self.columns.append(Column()) |
me@302
|
238 return self.columns[n] |
me@302
|
239 |
me@299
|
240 # Alignment IO methods |
me@299
|
241 # ==================== |
me@299
|
242 |
me@287
|
243 @classmethod |
me@287
|
244 def from_fasta(cls, file): |
me@287
|
245 """Create new alignment from FASTA file.""" |
me@287
|
246 self = cls() |
me@287
|
247 for ((name, description), body) in fasta.parse_file(file): |
me@294
|
248 self.append_gapped_line(body, name, description) |
me@287
|
249 return self |
bnagaev@249
|
250 |
me@292
|
251 def to_fasta(self, file): |
me@292
|
252 """Write alignment in FASTA file as sequences with gaps.""" |
me@292
|
253 def char(monomer): |
me@292
|
254 if monomer: |
me@292
|
255 return monomer.code1 |
me@292
|
256 return "-" |
me@292
|
257 for row in self.rows_as_lists(): |
me@292
|
258 seq = row.sequence |
me@292
|
259 line = "".join(map(char, row)) |
me@292
|
260 fasta.save_file(file, line, seq.name, seq.description) |
me@292
|
261 |
me@299
|
262 # Data access methods for alignment |
me@299
|
263 # ================================= |
me@299
|
264 |
me@299
|
265 def rows(self): |
me@299
|
266 """Return list of rows (temporary objects) in alignment. |
me@299
|
267 |
me@299
|
268 Each row is a dictionary of { column : monomer }. |
me@299
|
269 |
me@299
|
270 For gap positions there is no key for the column in row. |
me@299
|
271 |
me@299
|
272 Each row has attribute `sequence` pointing to the sequence the row is |
me@299
|
273 describing. |
me@299
|
274 |
me@299
|
275 Modifications of row have no effect on the alignment. |
me@299
|
276 """ |
me@299
|
277 # For now, the function returns a list rather than iterator. |
me@299
|
278 # It is yet to see, whether memory performance here becomes critical, |
me@299
|
279 # or is random access useful. |
me@299
|
280 rows = [] |
me@299
|
281 for sequence in self.sequences: |
me@299
|
282 row = util.UserDict() |
me@299
|
283 row.sequence = sequence |
me@299
|
284 for column in self.columns: |
me@299
|
285 if sequence in column: |
me@299
|
286 row[column] = column[sequence] |
me@299
|
287 rows.append(row) |
me@299
|
288 return rows |
me@299
|
289 |
me@299
|
290 def rows_as_lists(self): |
me@299
|
291 """Return list of rows (temporary objects) in alignment. |
me@299
|
292 |
me@299
|
293 Each row here is a list of either monomer or None (for gaps). |
me@299
|
294 |
me@299
|
295 Each row has attribute `sequence` pointing to the sequence of row. |
me@299
|
296 |
me@299
|
297 Modifications of row have no effect on the alignment. |
me@299
|
298 """ |
me@299
|
299 rows = [] |
me@299
|
300 for sequence in self.sequences: |
me@299
|
301 row = util.UserList() |
me@299
|
302 row.sequence = sequence |
me@299
|
303 for column in self.columns: |
me@299
|
304 row.append(column.get(sequence)) |
me@299
|
305 rows.append(row) |
me@299
|
306 return rows |
me@299
|
307 |
me@299
|
308 def columns_as_lists(self): |
me@299
|
309 """Return list of columns (temorary objects) in alignment. |
me@299
|
310 |
me@299
|
311 Each column here is a list of either monomer or None (for gaps). |
me@299
|
312 |
me@299
|
313 Items of column are sorted in the same way as alignment.sequences. |
me@299
|
314 |
me@299
|
315 Modifications of column have no effect on the alignment. |
me@299
|
316 """ |
me@299
|
317 columns = [] |
me@299
|
318 for column in self.columns: |
me@299
|
319 col = [] |
me@299
|
320 for sequence in self.sequences: |
me@299
|
321 col.append(column.get(sequence)) |
me@299
|
322 columns.append(col) |
me@299
|
323 return columns |
me@299
|
324 |
me@300
|
325 class Column(dict): |
me@300
|
326 """Column of alignment. |
me@300
|
327 |
me@300
|
328 Column is a dict of { sequence : monomer }. |
me@300
|
329 |
me@300
|
330 For sequences that have gaps in current row, given key is not present in |
me@300
|
331 the column. |
me@300
|
332 """ |
me@300
|
333 pass |
me@300
|
334 |
me@301
|
335 ## Unclean code follows |
me@301
|
336 |
bnagaev@249
|
337 class Block(object): |
me@261
|
338 """ Block of alignment |
me@270
|
339 |
me@261
|
340 Mandatory data: |
me@266
|
341 |
me@261
|
342 * self.alignment -- alignment object, which the block belongs to |
me@261
|
343 * self.sequences - set of sequence objects that contain monomers |
me@261
|
344 and/or gaps, that constitute the block |
me@261
|
345 * self.positions -- list of positions of the alignment.body that |
me@261
|
346 are included in the block; position[i+1] is always to the right from position[i] |
me@270
|
347 |
me@261
|
348 Don't change self.sequences -- it may be a link to other block.sequences |
me@270
|
349 |
me@261
|
350 How to create a new block: |
me@282
|
351 |
me@261
|
352 >>> import alignment |
me@261
|
353 >>> import block |
me@261
|
354 >>> proj = alignment.Alignment(open("test.fasta")) |
me@261
|
355 >>> block1 = block.Block(proj) |
me@261
|
356 """ |
me@270
|
357 |
me@261
|
358 def __init__(self, alignment, sequences=None, positions=None): |
me@261
|
359 """ Builds new block from alignment |
me@270
|
360 |
me@261
|
361 if sequences==None, all sequences are used |
me@261
|
362 if positions==None, all positions are used |
me@261
|
363 """ |
me@261
|
364 if sequences == None: |
me@261
|
365 sequences = set(alignment.sequences) # copy |
me@261
|
366 if positions == None: |
me@261
|
367 positions = range(len(alignment)) |
me@261
|
368 self.alignment = alignment |
me@261
|
369 self.sequences = sequences |
me@261
|
370 self.positions = positions |
me@270
|
371 |
me@261
|
372 def save_fasta(self, out_file, long_line=70, gap='-'): |
me@270
|
373 """ Saves alignment to given file in fasta-format |
me@270
|
374 |
me@261
|
375 No changes in the names, descriptions or order of the sequences |
me@261
|
376 are made. |
me@261
|
377 """ |
me@261
|
378 for sequence in self.sequences: |
me@261
|
379 alignment_monomers = self.alignment.body[sequence] |
me@261
|
380 block_monomers = [alignment_monomers[i] for i in self.positions] |
me@261
|
381 string = ''.join([m.type.code1 if m else '-' for m in block_monomers]) |
me@261
|
382 save_fasta(out_file, string, sequence.name, sequence.description, long_line) |
me@270
|
383 |
me@270
|
384 def geometrical_cores(self, max_delta=config.delta, |
me@270
|
385 timeout=config.timeout, minsize=config.minsize, |
me@261
|
386 ac_new_atoms=config.ac_new_atoms, |
me@261
|
387 ac_count=config.ac_count): |
me@261
|
388 """ Returns length-sorted list of blocks, representing GCs |
me@270
|
389 |
me@282
|
390 * max_delta -- threshold of distance spreading |
me@282
|
391 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) |
me@282
|
392 * minsize -- min size of each core |
me@282
|
393 * ac_new_atoms -- min part or new atoms in new alternative core |
me@282
|
394 current GC is compared with each of already selected GCs if |
me@282
|
395 difference is less then ac_new_atoms, current GC is skipped |
me@261
|
396 difference = part of new atoms in current core |
me@282
|
397 * ac_count -- max number of cores (including main core) |
me@261
|
398 -1 means infinity |
me@282
|
399 |
me@261
|
400 If more than one pdb chain for some sequence provided, consider all of them |
me@270
|
401 cost is calculated as 1 / (delta + 1) |
me@282
|
402 |
me@261
|
403 delta in [0, +inf) => cost in (0, 1] |
me@261
|
404 """ |
me@261
|
405 nodes = self.positions |
me@261
|
406 lines = {} |
me@261
|
407 for i in self.positions: |
me@261
|
408 for j in self.positions: |
me@261
|
409 if i < j: |
me@261
|
410 distances = [] |
me@261
|
411 for sequence in self.sequences: |
me@261
|
412 for chain in sequence.pdb_chains: |
me@261
|
413 m1 = self.alignment.body[sequence][i] |
me@261
|
414 m2 = self.alignment.body[sequence][j] |
me@261
|
415 if m1 and m2: |
me@261
|
416 r1 = sequence.pdb_residues[chain][m1] |
me@261
|
417 r2 = sequence.pdb_residues[chain][m2] |
me@261
|
418 ca1 = r1['CA'] |
me@261
|
419 ca2 = r2['CA'] |
me@261
|
420 d = ca1 - ca2 # Bio.PDB feature |
me@261
|
421 distances.append(d) |
me@261
|
422 if len(distances) >= 2: |
me@261
|
423 delta = max(distances) - min(distances) |
me@261
|
424 if delta <= max_delta: |
me@261
|
425 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta) |
me@261
|
426 graph = Graph(nodes, lines) |
me@261
|
427 cliques = graph.cliques(timeout=timeout, minsize=minsize) |
me@261
|
428 GCs = [] |
me@261
|
429 for clique in cliques: |
me@261
|
430 for GC in GCs: |
me@261
|
431 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique): |
me@261
|
432 break |
me@261
|
433 else: |
me@261
|
434 GCs.append(Block(self.alignment, self.sequences, clique)) |
me@261
|
435 if ac_count != -1 and len(GCs) >= ac_count: |
me@261
|
436 break |
me@261
|
437 return GCs |
me@270
|
438 |
me@261
|
439 def xstring(self, x='X', gap='-'): |
me@261
|
440 """ Returns string consisting of gap chars and chars x at self.positions |
me@270
|
441 |
me@261
|
442 Length of returning string = length of alignment |
me@261
|
443 """ |
me@261
|
444 monomers = [False] * len(self.alignment) |
me@261
|
445 for i in self.positions: |
me@261
|
446 monomers[i] = True |
me@261
|
447 return ''.join([x if m else gap for m in monomers]) |
me@270
|
448 |
me@261
|
449 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70): |
me@261
|
450 """ Save xstring and name in fasta format """ |
me@261
|
451 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line) |
me@270
|
452 |
me@261
|
453 def monomers(self, sequence): |
me@261
|
454 """ Iterates monomers of this sequence from this block """ |
me@261
|
455 alignment_sequence = self.alignment.body[sequence] |
me@261
|
456 return (alignment_sequence[i] for i in self.positions) |
me@270
|
457 |
me@261
|
458 def ca_atoms(self, sequence, pdb_chain): |
me@261
|
459 """ Iterates Ca-atom of monomers of this sequence from this block """ |
me@261
|
460 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers()) |
me@270
|
461 |
me@261
|
462 def sequences_chains(self): |
me@261
|
463 """ Iterates pairs (sequence, chain) """ |
me@261
|
464 for sequence in self.alignment.sequences: |
me@261
|
465 if sequence in self.sequences: |
me@261
|
466 for chain in sequence.pdb_chains: |
me@261
|
467 yield (sequence, chain) |
me@270
|
468 |
me@261
|
469 def superimpose(self): |
me@261
|
470 """ Superimpose all pdb_chains in this block """ |
me@261
|
471 sequences_chains = list(self.sequences_chains()) |
me@261
|
472 if len(sequences_chains) >= 1: |
me@261
|
473 sup = Superimposer() |
me@261
|
474 fixed_sequence, fixed_chain = sequences_chains.pop() |
me@261
|
475 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain) |
me@261
|
476 for sequence, chain in sequences_chains: |
me@261
|
477 moving_atoms = self.ca_atoms(sequence, chain) |
me@261
|
478 sup.set_atoms(fixed_atoms, moving_atoms) |
me@261
|
479 # Apply rotation/translation to the moving atoms |
me@261
|
480 sup.apply(moving_atoms) |
me@270
|
481 |
me@261
|
482 def pdb_save(self, out_file): |
me@270
|
483 """ Save all sequences |
me@270
|
484 |
me@261
|
485 Returns {(sequence, chain): CHAIN} |
me@261
|
486 CHAIN is chain letter in new file |
me@261
|
487 """ |
me@261
|
488 tmp_file = NamedTemporaryFile(delete=False) |
me@261
|
489 tmp_file.close() |
me@270
|
490 |
me@261
|
491 for sequence, chain in self.sequences_chains(): |
me@261
|
492 sequence.pdb_save(tmp_file.name, chain) |
me@261
|
493 # TODO: read from tmp_file.name |
me@261
|
494 # change CHAIN |
me@261
|
495 # add to out_file |
me@270
|
496 |
me@261
|
497 os.unlink(NamedTemporaryFile) |
bnagaev@239
|
498 |
me@260
|
499 # vim: set ts=4 sts=4 sw=4 et: |