Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/bee4d155f526/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 19:36:44 2013
Кодировка:
allpy: bee4d155f526 allpy/base.py

allpy

view allpy/base.py @ 284:bee4d155f526

Added base.Sequence.from_fasta
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Wed, 15 Dec 2010 23:44:20 +0300
parents 59320c160dae
children cf6cdc3b7ec5
line source
1 import sys
2 import os
3 import os.path
4 from tempfile import NamedTemporaryFile
5 import urllib2
7 import config
8 import fasta
9 from graph import Graph
10 from Bio.PDB.DSSP import make_dssp_dict
11 import data.codes
13 class MonomerType(object):
14 """Class of monomer types.
16 Each MonomerType object represents a known monomer type, e.g. Valine,
17 and is referenced to by each instance of monomer in a given sequence.
19 - `name`: full name of monomer type
20 - `code1`: one-letter code
21 - `code3`: three-letter code
22 - `is_modified`: either of True or False
24 class atributes:
26 - `by_code1`: a mapping from one-letter code to MonomerType object
27 - `by_code3`: a mapping from three-letter code to MonomerType object
28 - `by_name`: a mapping from monomer name to MonomerType object
29 - `instance_type`: class of Monomer objects to use when creating new
30 objects; this must be redefined in descendent classes
32 All of the class attributes MUST be redefined when subclassing.
33 """
35 by_code1 = {}
36 by_code3 = {}
37 by_name = {}
38 instance_type = None
40 def __init__(self, name="", code1="", code3="", is_modified=False):
41 self.name = name.capitalize()
42 self.code1 = code1.upper()
43 self.code3 = code3.upper()
44 self.is_modified = bool(is_modified)
45 if not is_modified:
46 self.by_code1[self.code1] = self
47 self.by_code3[code3] = self
48 self.by_name[name] = self
49 # We duplicate distinguished long names into MonomerType itself,
50 # so that we can use MonomerType.from_code3 to create the relevant
51 # type of monomer.
52 MonomerType.by_code3[code3] = self
53 MonomerType.by_name[name] = self
55 @classmethod
56 def _initialize(cls, type_letter, codes=data.codes.codes):
57 """Create all relevant instances of MonomerType.
59 `type_letter` is either of:
61 - 'p' for protein
62 - 'd' for DNA
63 - 'r' for RNA
65 `codes` is a table of monomer codes
66 """
67 for type, code1, is_modified, code3, name in codes:
68 if type == type_letter:
69 cls(name, code1, code3, is_modified)
71 @classmethod
72 def from_code1(cls, code1):
73 """Return monomer type by one-letter code."""
74 return cls.by_code1[code1.upper()]
76 @classmethod
77 def from_code3(cls, code3):
78 """Return monomer type by three-letter code."""
79 return cls.by_code3[code3.upper()]
81 @classmethod
82 def from_name(cls, name):
83 """Return monomer type by name."""
84 return cls.by_name[name.capitalize()]
86 def instance(self):
87 """Create a new monomer of given type."""
88 return self.instance_type(self)
90 def __eq__(self, other):
91 if hasattr(other, "type"):
92 return self is other.type
93 return self is other
95 class Monomer(object):
96 """Monomer object.
98 attributes:
100 - `type`: type of monomer (a MonomerType object)
102 class attributes:
104 - `monomer_type`: either MonomerType or one of it's subclasses, it is used
105 when creating new monomers. It SHOULD be redefined when subclassing
106 Monomer.
107 """
108 monomer_type = MonomerType
110 def __init__(self, type):
111 self.type = type
113 @classmethod
114 def from_code1(cls, code1):
115 return cls(cls.monomer_type.by_code1[code1.upper()])
117 @classmethod
118 def from_code3(cls, code3):
119 return cls(cls.monomer_type.by_code3[code3.upper()])
121 @classmethod
122 def from_name(cls, name):
123 return cls(cls.monomer_type.by_name[name.capitalize()])
125 def __eq__(self, other):
126 if hasattr(other, "type"):
127 return self.type is other.type
128 return self.type is other
130 class Sequence(list):
131 """Sequence of Monomers.
133 This behaves like list of monomer objects. In addition to standard list
134 behaviour, Sequence has the following attributes:
136 * name -- str with the name of the sequence
137 * description -- str with description of the sequence
138 * source -- str denoting source of the sequence
140 Any of them may be empty (i.e. hold empty string)
142 Class attributes:
144 * monomer_type -- type of monomers in sequence, must be redefined when
145 subclassing
146 """
148 monomer_type = Monomer
150 name = ''
151 description = ''
152 source = ''
154 def __init__(self, sequence=[], name=None, description=None, source=None):
155 super(Sequence, self).__init__(sequence)
156 if hasattr(sequence, 'name'):
157 vars(self).update(vars(sequence))
158 if name:
159 self.name = name
160 if description:
161 self.description = description
162 if source:
163 self.source = source
165 def __str__(self):
166 """Returns sequence in one-letter code."""
167 return ''.join(monomer.code1 for monomer in self)
169 @classmethod
170 def from_string(cls, string, name='', description=''):
171 """Create sequences from string of one-letter codes."""
172 monomer = cls.monomer_type.from_code1
173 monomers = [monomer(letter) for letter in string]
174 return cls(monomers, name, description)
176 @classmethod
177 def from_fasta(cls, file):
178 """Read sequence from FASTA file.
180 File must contain exactly one sequence.
181 """
182 sequences = fasta.parse_file(file)
183 assert len(sequences) == 1
184 header = sequences.keys()[0]
185 name, _, description = header.partition(" ")
186 return cls(sequences[header], name, description, file.name)
188 class Alignment(dict):
189 """Alignment.
191 Behaves like a list of Columns.
192 """
193 # _sequences -- list of Sequence objects. Sequences don't contain gaps
194 # - see sequence.py module
196 def __init__(self, *args):
197 """overloaded constructor
199 Alignment()
200 new empty Alignment
202 Alignment(sequences, body)
203 new Alignment with sequences and body initialized from arguments
205 Alignment(fasta_file)
206 new Alignment, read body and sequences from fasta file
207 """
208 if len(args)>1:#overloaded constructor
209 self.sequences=args[0]
210 self.body=args[1]
211 elif len(args)==0:
212 self.sequences=[]
213 self.body={}
214 else:
215 self.sequences, self.body = Alignment.from_fasta(args[0])
217 def length(self):
218 """ Returns width, ie length of each sequence with gaps """
219 return max([len(line) for line in self.body.values()])
221 def height(self):
222 """ The number of sequences in alignment (it's thickness). """
223 return len(self.body)
225 def identity(self):
226 """ Calculate the identity of alignment positions for colouring.
228 For every (row, column) in alignment the percentage of the exactly
229 same residue in the same column in the alignment is calculated.
230 The data structure is just like the Alignment.body, but istead of
231 monomers it contains float percentages.
232 """
233 # Oh, God, that's awful! Absolutely not understandable.
234 # First, calculate percentages of amino acids in every column
235 contribution = 1.0 / len(self.sequences)
236 all_columns = []
237 for position in range(len(self)):
238 column_percentage = {}
239 for seq in self.body:
240 if self.body[seq][position] is not None:
241 aa = self.body[seq][position].code
242 else:
243 aa = None
244 if aa in allpy.data.amino_acids:
245 if aa in column_percentage.keys():
246 column_percentage[aa] += contribution
247 else:
248 column_percentage[aa] = contribution
249 all_columns.append(column_percentage)
250 # Second, map these percentages onto the alignment
251 self.identity_percentages = {}
252 for seq in self.sequences:
253 self.identity_percentages[seq] = []
254 for seq in self.identity_percentages:
255 line = self.identity_percentages[seq]
256 for position in range(len(self)):
257 if self.body[seq][position] is not None:
258 aa = self.body[seq][position].code
259 else:
260 aa = None
261 line.append(all_columns[position].get(aa))
262 return self.identity_percentages
264 @classmethod
265 def from_fasta(file):
266 """ Import data from fasta file
268 >>> import alignment
269 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
270 """
271 import re
273 sequences = []
274 body = {}
276 raw_sequences = file.read().split(">")
277 if len(raw_sequences) <= 1:
278 raise Exception("Wrong format of fasta-file %s" % file.name)
280 raw_sequences = raw_sequences[1:] #ignore everything before the first >
281 for raw in raw_sequences:
282 parsed_raw_sequence = raw.split("\n")
283 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
284 name_and_description = parsed_raw_sequence[0]
285 name_and_description = name_and_description.split(" ",1)
286 if len(name_and_description) == 2:
287 name, description = name_and_description
288 elif len(name_and_description) == 1:
289 #if there is description
290 name = name_and_description[0]
291 description = ''
292 else:
293 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
294 {'name': name, 'file': file.name})
296 if len(parsed_raw_sequence) <= 1:
297 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
298 {'name': name, 'file': file.name})
299 string = ""
300 for piece in parsed_raw_sequence[1:]:
301 piece_without_whitespace_chars = re.sub("\s", "", piece)
302 string += piece_without_whitespace_chars
303 monomers = [] #convert into Monomer objects
304 body_list = [] #create the respective list in body dict
305 for current_monomer in string:
306 if current_monomer not in ["-", ".", "~"]:
307 monomers.append(cls.monomer_type.from_code1(current_monomer))
308 body_list.append(monomers[-1])
309 else:
310 body_list.append(None)
311 s = sequence.Sequence(monomers, name, description)
312 sequences.append(s)
313 body[s] = body_list
314 return sequences, body
316 @staticmethod
317 def from_sequences(*sequences):
318 """ Constructs new alignment from sequences
320 Add None's to right end to make equal lengthes of alignment sequences
321 """
322 alignment = Alignment()
323 alignment.sequences = sequences
324 max_length = max(len(sequence) for sequence in sequences)
325 for sequence in sequences:
326 gaps_count = max_length - len(sequence)
327 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
328 return alignment
330 def save_fasta(self, out_file, long_line=70, gap='-'):
331 """ Saves alignment to given file
333 Splits long lines to substrings of length=long_line
334 To prevent this, set long_line=None
335 """
336 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
338 def muscle_align(self):
339 """ Simple align ths alignment using sequences (muscle)
341 uses old Monomers and Sequences objects
342 """
343 tmp_file = NamedTemporaryFile(delete=False)
344 self.save_fasta(tmp_file)
345 tmp_file.close()
346 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
347 sequences, body = Alignment.from_fasta(open(tmp_file.name))
348 for sequence in self.sequences:
349 try:
350 new_sequence = [i for i in sequences if sequence==i][0]
351 except:
352 raise Exception("Align: Cann't find sequence %s in muscle output" % \
353 sequence.name)
354 old_monomers = iter(sequence.monomers)
355 self.body[sequence] = []
356 for monomer in body[new_sequence]:
357 if not monomer:
358 self.body[sequence].append(monomer)
359 else:
360 old_monomer = old_monomers.next()
361 if monomer != old_monomer:
362 raise Exception("Align: alignment errors")
363 self.body[sequence].append(old_monomer)
364 os.unlink(tmp_file.name)
366 def column(self, sequence=None, sequences=None, original=None):
367 """ returns list of columns of alignment
369 sequence or sequences:
370 * if sequence is given, then column is (original_monomer, monomer)
371 * if sequences is given, then column is (original_monomer, {sequence: monomer})
372 * if both of them are given, it is an error
374 original (Sequence type):
375 * if given, this filters only columns represented by original sequence
376 """
377 if sequence and sequences:
378 raise Exception("Wrong usage. read help")
379 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
380 alignment = self.body.items()
381 alignment.sort(key=lambda i: indexes[i[0]])
382 alignment = [monomers for seq, monomers in alignment]
383 for column in zip(*alignment):
384 if not original or column[indexes[original]]:
385 if sequence:
386 yield (column[indexes[original]], column[indexes[sequence]])
387 else:
388 yield (column[indexes[original]],
389 dict([(s, column[indexes[s]]) for s in sequences]))
391 def secstr(self, sequence, pdb_chain, gap='-'):
392 """ Returns string representing secondary structure """
393 return ''.join([
394 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
395 for m in self.body[sequence]])
397 class Block(object):
398 """ Block of alignment
400 Mandatory data:
402 * self.alignment -- alignment object, which the block belongs to
403 * self.sequences - set of sequence objects that contain monomers
404 and/or gaps, that constitute the block
405 * self.positions -- list of positions of the alignment.body that
406 are included in the block; position[i+1] is always to the right from position[i]
408 Don't change self.sequences -- it may be a link to other block.sequences
410 How to create a new block:
412 >>> import alignment
413 >>> import block
414 >>> proj = alignment.Alignment(open("test.fasta"))
415 >>> block1 = block.Block(proj)
416 """
418 def __init__(self, alignment, sequences=None, positions=None):
419 """ Builds new block from alignment
421 if sequences==None, all sequences are used
422 if positions==None, all positions are used
423 """
424 if sequences == None:
425 sequences = set(alignment.sequences) # copy
426 if positions == None:
427 positions = range(len(alignment))
428 self.alignment = alignment
429 self.sequences = sequences
430 self.positions = positions
432 def save_fasta(self, out_file, long_line=70, gap='-'):
433 """ Saves alignment to given file in fasta-format
435 No changes in the names, descriptions or order of the sequences
436 are made.
437 """
438 for sequence in self.sequences:
439 alignment_monomers = self.alignment.body[sequence]
440 block_monomers = [alignment_monomers[i] for i in self.positions]
441 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])
442 save_fasta(out_file, string, sequence.name, sequence.description, long_line)
444 def geometrical_cores(self, max_delta=config.delta,
445 timeout=config.timeout, minsize=config.minsize,
446 ac_new_atoms=config.ac_new_atoms,
447 ac_count=config.ac_count):
448 """ Returns length-sorted list of blocks, representing GCs
450 * max_delta -- threshold of distance spreading
451 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)
452 * minsize -- min size of each core
453 * ac_new_atoms -- min part or new atoms in new alternative core
454 current GC is compared with each of already selected GCs if
455 difference is less then ac_new_atoms, current GC is skipped
456 difference = part of new atoms in current core
457 * ac_count -- max number of cores (including main core)
458 -1 means infinity
460 If more than one pdb chain for some sequence provided, consider all of them
461 cost is calculated as 1 / (delta + 1)
463 delta in [0, +inf) => cost in (0, 1]
464 """
465 nodes = self.positions
466 lines = {}
467 for i in self.positions:
468 for j in self.positions:
469 if i < j:
470 distances = []
471 for sequence in self.sequences:
472 for chain in sequence.pdb_chains:
473 m1 = self.alignment.body[sequence][i]
474 m2 = self.alignment.body[sequence][j]
475 if m1 and m2:
476 r1 = sequence.pdb_residues[chain][m1]
477 r2 = sequence.pdb_residues[chain][m2]
478 ca1 = r1['CA']
479 ca2 = r2['CA']
480 d = ca1 - ca2 # Bio.PDB feature
481 distances.append(d)
482 if len(distances) >= 2:
483 delta = max(distances) - min(distances)
484 if delta <= max_delta:
485 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)
486 graph = Graph(nodes, lines)
487 cliques = graph.cliques(timeout=timeout, minsize=minsize)
488 GCs = []
489 for clique in cliques:
490 for GC in GCs:
491 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):
492 break
493 else:
494 GCs.append(Block(self.alignment, self.sequences, clique))
495 if ac_count != -1 and len(GCs) >= ac_count:
496 break
497 return GCs
499 def xstring(self, x='X', gap='-'):
500 """ Returns string consisting of gap chars and chars x at self.positions
502 Length of returning string = length of alignment
503 """
504 monomers = [False] * len(self.alignment)
505 for i in self.positions:
506 monomers[i] = True
507 return ''.join([x if m else gap for m in monomers])
509 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):
510 """ Save xstring and name in fasta format """
511 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)
513 def monomers(self, sequence):
514 """ Iterates monomers of this sequence from this block """
515 alignment_sequence = self.alignment.body[sequence]
516 return (alignment_sequence[i] for i in self.positions)
518 def ca_atoms(self, sequence, pdb_chain):
519 """ Iterates Ca-atom of monomers of this sequence from this block """
520 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())
522 def sequences_chains(self):
523 """ Iterates pairs (sequence, chain) """
524 for sequence in self.alignment.sequences:
525 if sequence in self.sequences:
526 for chain in sequence.pdb_chains:
527 yield (sequence, chain)
529 def superimpose(self):
530 """ Superimpose all pdb_chains in this block """
531 sequences_chains = list(self.sequences_chains())
532 if len(sequences_chains) >= 1:
533 sup = Superimposer()
534 fixed_sequence, fixed_chain = sequences_chains.pop()
535 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)
536 for sequence, chain in sequences_chains:
537 moving_atoms = self.ca_atoms(sequence, chain)
538 sup.set_atoms(fixed_atoms, moving_atoms)
539 # Apply rotation/translation to the moving atoms
540 sup.apply(moving_atoms)
542 def pdb_save(self, out_file):
543 """ Save all sequences
545 Returns {(sequence, chain): CHAIN}
546 CHAIN is chain letter in new file
547 """
548 tmp_file = NamedTemporaryFile(delete=False)
549 tmp_file.close()
551 for sequence, chain in self.sequences_chains():
552 sequence.pdb_save(tmp_file.name, chain)
553 # TODO: read from tmp_file.name
554 # change CHAIN
555 # add to out_file
557 os.unlink(NamedTemporaryFile)
559 # vim: set ts=4 sts=4 sw=4 et: