Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/d7ad71def6c8/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 19:36:40 2013
Кодировка:
allpy: d7ad71def6c8 allpy/base.py

allpy

view allpy/base.py @ 289:d7ad71def6c8

Experimenting with styles of docstrings in allpy.base.Alignment (to be recognized by Sphinx)
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Thu, 16 Dec 2010 01:25:54 +0300
parents 9c68d8eab8f5
children 751048390c45
line source
1 import sys
2 import os
3 import os.path
4 from tempfile import NamedTemporaryFile
5 import urllib2
7 import config
8 import fasta
9 from graph import Graph
10 from Bio.PDB.DSSP import make_dssp_dict
11 import data.codes
13 class MonomerType(object):
14 """Class of monomer types.
16 Each MonomerType object represents a known monomer type, e.g. Valine,
17 and is referenced to by each instance of monomer in a given sequence.
19 - `name`: full name of monomer type
20 - `code1`: one-letter code
21 - `code3`: three-letter code
22 - `is_modified`: either of True or False
24 class atributes:
26 - `by_code1`: a mapping from one-letter code to MonomerType object
27 - `by_code3`: a mapping from three-letter code to MonomerType object
28 - `by_name`: a mapping from monomer name to MonomerType object
29 - `instance_type`: class of Monomer objects to use when creating new
30 objects; this must be redefined in descendent classes
32 All of the class attributes MUST be redefined when subclassing.
33 """
35 by_code1 = {}
36 by_code3 = {}
37 by_name = {}
38 instance_type = None
40 def __init__(self, name="", code1="", code3="", is_modified=False):
41 self.name = name.capitalize()
42 self.code1 = code1.upper()
43 self.code3 = code3.upper()
44 self.is_modified = bool(is_modified)
45 if not is_modified:
46 self.by_code1[self.code1] = self
47 self.by_code3[code3] = self
48 self.by_name[name] = self
49 # We duplicate distinguished long names into MonomerType itself,
50 # so that we can use MonomerType.from_code3 to create the relevant
51 # type of monomer.
52 MonomerType.by_code3[code3] = self
53 MonomerType.by_name[name] = self
55 @classmethod
56 def _initialize(cls, type_letter, codes=data.codes.codes):
57 """Create all relevant instances of MonomerType.
59 `type_letter` is either of:
61 - 'p' for protein
62 - 'd' for DNA
63 - 'r' for RNA
65 `codes` is a table of monomer codes
66 """
67 for type, code1, is_modified, code3, name in codes:
68 if type == type_letter:
69 cls(name, code1, code3, is_modified)
71 @classmethod
72 def from_code1(cls, code1):
73 """Return monomer type by one-letter code."""
74 return cls.by_code1[code1.upper()]
76 @classmethod
77 def from_code3(cls, code3):
78 """Return monomer type by three-letter code."""
79 return cls.by_code3[code3.upper()]
81 @classmethod
82 def from_name(cls, name):
83 """Return monomer type by name."""
84 return cls.by_name[name.capitalize()]
86 def instance(self):
87 """Create a new monomer of given type."""
88 return self.instance_type(self)
90 def __eq__(self, other):
91 if hasattr(other, "type"):
92 return self is other.type
93 return self is other
95 class Monomer(object):
96 """Monomer object.
98 attributes:
100 - `type`: type of monomer (a MonomerType object)
102 class attributes:
104 - `monomer_type`: either MonomerType or one of it's subclasses, it is used
105 when creating new monomers. It SHOULD be redefined when subclassing
106 Monomer.
107 """
108 monomer_type = MonomerType
110 def __init__(self, type):
111 self.type = type
113 @classmethod
114 def from_code1(cls, code1):
115 return cls(cls.monomer_type.by_code1[code1.upper()])
117 @classmethod
118 def from_code3(cls, code3):
119 return cls(cls.monomer_type.by_code3[code3.upper()])
121 @classmethod
122 def from_name(cls, name):
123 return cls(cls.monomer_type.by_name[name.capitalize()])
125 def __eq__(self, other):
126 if hasattr(other, "type"):
127 return self.type is other.type
128 return self.type is other
130 class Sequence(list):
131 """Sequence of Monomers.
133 This behaves like list of monomer objects. In addition to standard list
134 behaviour, Sequence has the following attributes:
136 * name -- str with the name of the sequence
137 * description -- str with description of the sequence
138 * source -- str denoting source of the sequence
140 Any of them may be empty (i.e. hold empty string)
142 Class attributes:
144 * monomer_type -- type of monomers in sequence, must be redefined when
145 subclassing
146 """
148 monomer_type = Monomer
150 name = ''
151 description = ''
152 source = ''
154 def __init__(self, sequence=[], name=None, description=None, source=None):
155 super(Sequence, self).__init__(sequence)
156 if hasattr(sequence, 'name'):
157 vars(self).update(vars(sequence))
158 if name:
159 self.name = name
160 if description:
161 self.description = description
162 if source:
163 self.source = source
165 def __str__(self):
166 """Returns sequence in one-letter code."""
167 return ''.join(monomer.code1 for monomer in self)
169 @classmethod
170 def from_string(cls, string, name='', description=''):
171 """Create sequences from string of one-letter codes."""
172 monomer = cls.monomer_type.from_code1
173 monomers = [monomer(letter) for letter in string]
174 return cls(monomers, name, description)
176 @classmethod
177 def from_fasta(cls, file):
178 """Read sequence from FASTA file.
180 File must contain exactly one sequence.
181 """
182 sequences = fasta.parse_file(file)
183 assert len(sequences) == 1
184 name, description = sequences.keys()[0]
185 return cls(sequences[header], name, description, file.name)
187 class Alignment(list):
188 """Alignment. Behaves like a list of Columns."""
190 sequence_type = Sequence
191 """Type of sequences in alignment. SHOULD be redefined when subclassing."""
193 sequences = None
194 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
196 def __init__(self):
197 """Initialize empty alignment."""
198 super(Alignment, self).__init__()
199 self.sequences = []
201 def add_gapped_line(self, line, name='', description='', source=''):
202 """Add row from a line of one-letter codes and gaps."""
203 Sequence = cls.sequence_type
204 not_gap = lambda (i, char): char != "-"
205 no_gaps = line.replace("-", "")
206 sequence = Sequence(no_gaps, name, description, source)
207 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):
208 self[j][seq] = sequence[i]
209 self.sequences.append(sequence)
211 @classmethod
212 def from_fasta(cls, file):
213 """Create new alignment from FASTA file."""
214 self = cls()
215 for ((name, description), body) in fasta.parse_file(file):
216 self.add_gapped_line(body, name, description)
217 return self
219 def length(self):
220 """Return width, ie length of each sequence with gaps."""
221 return max([len(line) for line in self.body.values()])
223 def height(self):
224 """ The number of sequences in alignment (it's thickness). """
225 return len(self.body)
227 def identity(self):
228 """ Calculate the identity of alignment positions for colouring.
230 For every (row, column) in alignment the percentage of the exactly
231 same residue in the same column in the alignment is calculated.
232 The data structure is just like the Alignment.body, but istead of
233 monomers it contains float percentages.
234 """
235 # Oh, God, that's awful! Absolutely not understandable.
236 # First, calculate percentages of amino acids in every column
237 contribution = 1.0 / len(self.sequences)
238 all_columns = []
239 for position in range(len(self)):
240 column_percentage = {}
241 for seq in self.body:
242 if self.body[seq][position] is not None:
243 aa = self.body[seq][position].code
244 else:
245 aa = None
246 if aa in allpy.data.amino_acids:
247 if aa in column_percentage.keys():
248 column_percentage[aa] += contribution
249 else:
250 column_percentage[aa] = contribution
251 all_columns.append(column_percentage)
252 # Second, map these percentages onto the alignment
253 self.identity_percentages = {}
254 for seq in self.sequences:
255 self.identity_percentages[seq] = []
256 for seq in self.identity_percentages:
257 line = self.identity_percentages[seq]
258 for position in range(len(self)):
259 if self.body[seq][position] is not None:
260 aa = self.body[seq][position].code
261 else:
262 aa = None
263 line.append(all_columns[position].get(aa))
264 return self.identity_percentages
266 @staticmethod
267 def from_sequences(*sequences):
268 """ Constructs new alignment from sequences
270 Add None's to right end to make equal lengthes of alignment sequences
271 """
272 alignment = Alignment()
273 alignment.sequences = sequences
274 max_length = max(len(sequence) for sequence in sequences)
275 for sequence in sequences:
276 gaps_count = max_length - len(sequence)
277 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
278 return alignment
280 def save_fasta(self, out_file, long_line=70, gap='-'):
281 """ Saves alignment to given file
283 Splits long lines to substrings of length=long_line
284 To prevent this, set long_line=None
285 """
286 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
288 def muscle_align(self):
289 """ Simple align ths alignment using sequences (muscle)
291 uses old Monomers and Sequences objects
292 """
293 tmp_file = NamedTemporaryFile(delete=False)
294 self.save_fasta(tmp_file)
295 tmp_file.close()
296 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
297 sequences, body = Alignment.from_fasta(open(tmp_file.name))
298 for sequence in self.sequences:
299 try:
300 new_sequence = [i for i in sequences if sequence==i][0]
301 except:
302 raise Exception("Align: Cann't find sequence %s in muscle output" % \
303 sequence.name)
304 old_monomers = iter(sequence.monomers)
305 self.body[sequence] = []
306 for monomer in body[new_sequence]:
307 if not monomer:
308 self.body[sequence].append(monomer)
309 else:
310 old_monomer = old_monomers.next()
311 if monomer != old_monomer:
312 raise Exception("Align: alignment errors")
313 self.body[sequence].append(old_monomer)
314 os.unlink(tmp_file.name)
316 def column(self, sequence=None, sequences=None, original=None):
317 """ returns list of columns of alignment
319 sequence or sequences:
320 * if sequence is given, then column is (original_monomer, monomer)
321 * if sequences is given, then column is (original_monomer, {sequence: monomer})
322 * if both of them are given, it is an error
324 original (Sequence type):
325 * if given, this filters only columns represented by original sequence
326 """
327 if sequence and sequences:
328 raise Exception("Wrong usage. read help")
329 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
330 alignment = self.body.items()
331 alignment.sort(key=lambda i: indexes[i[0]])
332 alignment = [monomers for seq, monomers in alignment]
333 for column in zip(*alignment):
334 if not original or column[indexes[original]]:
335 if sequence:
336 yield (column[indexes[original]], column[indexes[sequence]])
337 else:
338 yield (column[indexes[original]],
339 dict([(s, column[indexes[s]]) for s in sequences]))
341 def secstr(self, sequence, pdb_chain, gap='-'):
342 """ Returns string representing secondary structure """
343 return ''.join([
344 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
345 for m in self.body[sequence]])
347 class Block(object):
348 """ Block of alignment
350 Mandatory data:
352 * self.alignment -- alignment object, which the block belongs to
353 * self.sequences - set of sequence objects that contain monomers
354 and/or gaps, that constitute the block
355 * self.positions -- list of positions of the alignment.body that
356 are included in the block; position[i+1] is always to the right from position[i]
358 Don't change self.sequences -- it may be a link to other block.sequences
360 How to create a new block:
362 >>> import alignment
363 >>> import block
364 >>> proj = alignment.Alignment(open("test.fasta"))
365 >>> block1 = block.Block(proj)
366 """
368 def __init__(self, alignment, sequences=None, positions=None):
369 """ Builds new block from alignment
371 if sequences==None, all sequences are used
372 if positions==None, all positions are used
373 """
374 if sequences == None:
375 sequences = set(alignment.sequences) # copy
376 if positions == None:
377 positions = range(len(alignment))
378 self.alignment = alignment
379 self.sequences = sequences
380 self.positions = positions
382 def save_fasta(self, out_file, long_line=70, gap='-'):
383 """ Saves alignment to given file in fasta-format
385 No changes in the names, descriptions or order of the sequences
386 are made.
387 """
388 for sequence in self.sequences:
389 alignment_monomers = self.alignment.body[sequence]
390 block_monomers = [alignment_monomers[i] for i in self.positions]
391 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])
392 save_fasta(out_file, string, sequence.name, sequence.description, long_line)
394 def geometrical_cores(self, max_delta=config.delta,
395 timeout=config.timeout, minsize=config.minsize,
396 ac_new_atoms=config.ac_new_atoms,
397 ac_count=config.ac_count):
398 """ Returns length-sorted list of blocks, representing GCs
400 * max_delta -- threshold of distance spreading
401 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)
402 * minsize -- min size of each core
403 * ac_new_atoms -- min part or new atoms in new alternative core
404 current GC is compared with each of already selected GCs if
405 difference is less then ac_new_atoms, current GC is skipped
406 difference = part of new atoms in current core
407 * ac_count -- max number of cores (including main core)
408 -1 means infinity
410 If more than one pdb chain for some sequence provided, consider all of them
411 cost is calculated as 1 / (delta + 1)
413 delta in [0, +inf) => cost in (0, 1]
414 """
415 nodes = self.positions
416 lines = {}
417 for i in self.positions:
418 for j in self.positions:
419 if i < j:
420 distances = []
421 for sequence in self.sequences:
422 for chain in sequence.pdb_chains:
423 m1 = self.alignment.body[sequence][i]
424 m2 = self.alignment.body[sequence][j]
425 if m1 and m2:
426 r1 = sequence.pdb_residues[chain][m1]
427 r2 = sequence.pdb_residues[chain][m2]
428 ca1 = r1['CA']
429 ca2 = r2['CA']
430 d = ca1 - ca2 # Bio.PDB feature
431 distances.append(d)
432 if len(distances) >= 2:
433 delta = max(distances) - min(distances)
434 if delta <= max_delta:
435 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)
436 graph = Graph(nodes, lines)
437 cliques = graph.cliques(timeout=timeout, minsize=minsize)
438 GCs = []
439 for clique in cliques:
440 for GC in GCs:
441 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):
442 break
443 else:
444 GCs.append(Block(self.alignment, self.sequences, clique))
445 if ac_count != -1 and len(GCs) >= ac_count:
446 break
447 return GCs
449 def xstring(self, x='X', gap='-'):
450 """ Returns string consisting of gap chars and chars x at self.positions
452 Length of returning string = length of alignment
453 """
454 monomers = [False] * len(self.alignment)
455 for i in self.positions:
456 monomers[i] = True
457 return ''.join([x if m else gap for m in monomers])
459 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):
460 """ Save xstring and name in fasta format """
461 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)
463 def monomers(self, sequence):
464 """ Iterates monomers of this sequence from this block """
465 alignment_sequence = self.alignment.body[sequence]
466 return (alignment_sequence[i] for i in self.positions)
468 def ca_atoms(self, sequence, pdb_chain):
469 """ Iterates Ca-atom of monomers of this sequence from this block """
470 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())
472 def sequences_chains(self):
473 """ Iterates pairs (sequence, chain) """
474 for sequence in self.alignment.sequences:
475 if sequence in self.sequences:
476 for chain in sequence.pdb_chains:
477 yield (sequence, chain)
479 def superimpose(self):
480 """ Superimpose all pdb_chains in this block """
481 sequences_chains = list(self.sequences_chains())
482 if len(sequences_chains) >= 1:
483 sup = Superimposer()
484 fixed_sequence, fixed_chain = sequences_chains.pop()
485 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)
486 for sequence, chain in sequences_chains:
487 moving_atoms = self.ca_atoms(sequence, chain)
488 sup.set_atoms(fixed_atoms, moving_atoms)
489 # Apply rotation/translation to the moving atoms
490 sup.apply(moving_atoms)
492 def pdb_save(self, out_file):
493 """ Save all sequences
495 Returns {(sequence, chain): CHAIN}
496 CHAIN is chain letter in new file
497 """
498 tmp_file = NamedTemporaryFile(delete=False)
499 tmp_file.close()
501 for sequence, chain in self.sequences_chains():
502 sequence.pdb_save(tmp_file.name, chain)
503 # TODO: read from tmp_file.name
504 # change CHAIN
505 # add to out_file
507 os.unlink(NamedTemporaryFile)
509 # vim: set ts=4 sts=4 sw=4 et: