Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/237deca17963/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:03:45 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 307:237deca17963

Converted docstrings and init of base.Block to match our plan
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Thu, 16 Dec 2010 20:42:42 +0300
parents 88631907f23d
children 4d610c277281
rev   line source
me@261 1 import sys
me@261 2 import os
me@262 3 import os.path
me@261 4 from tempfile import NamedTemporaryFile
me@262 5 import urllib2
me@261 6
me@261 7 import config
me@284 8 import fasta
me@261 9 from graph import Graph
me@262 10 from Bio.PDB.DSSP import make_dssp_dict
me@260 11 import data.codes
me@260 12
me@306 13 default_gaps = set((".", "-", "~"))
me@306 14 """Set of characters to recoginze as gaps when parsing alignment."""
me@306 15
me@260 16 class MonomerType(object):
me@260 17 """Class of monomer types.
me@260 18
me@260 19 Each MonomerType object represents a known monomer type, e.g. Valine,
me@260 20 and is referenced to by each instance of monomer in a given sequence.
me@260 21
me@260 22 - `name`: full name of monomer type
me@260 23 - `code1`: one-letter code
me@260 24 - `code3`: three-letter code
me@260 25 - `is_modified`: either of True or False
me@260 26
me@260 27 class atributes:
me@260 28
me@260 29 - `by_code1`: a mapping from one-letter code to MonomerType object
me@260 30 - `by_code3`: a mapping from three-letter code to MonomerType object
me@260 31 - `by_name`: a mapping from monomer name to MonomerType object
me@260 32 - `instance_type`: class of Monomer objects to use when creating new
me@260 33 objects; this must be redefined in descendent classes
me@260 34
me@260 35 All of the class attributes MUST be redefined when subclassing.
me@260 36 """
me@260 37
me@260 38 by_code1 = {}
me@260 39 by_code3 = {}
me@260 40 by_name = {}
me@260 41 instance_type = None
me@260 42
me@260 43 def __init__(self, name="", code1="", code3="", is_modified=False):
me@260 44 self.name = name.capitalize()
me@260 45 self.code1 = code1.upper()
me@260 46 self.code3 = code3.upper()
me@260 47 self.is_modified = bool(is_modified)
me@260 48 if not is_modified:
me@260 49 self.by_code1[self.code1] = self
me@260 50 self.by_code3[code3] = self
me@260 51 self.by_name[name] = self
me@260 52 # We duplicate distinguished long names into MonomerType itself,
me@260 53 # so that we can use MonomerType.from_code3 to create the relevant
me@260 54 # type of monomer.
me@260 55 MonomerType.by_code3[code3] = self
me@260 56 MonomerType.by_name[name] = self
me@260 57
me@260 58 @classmethod
me@260 59 def _initialize(cls, type_letter, codes=data.codes.codes):
me@260 60 """Create all relevant instances of MonomerType.
me@260 61
me@260 62 `type_letter` is either of:
me@260 63
me@260 64 - 'p' for protein
me@260 65 - 'd' for DNA
me@260 66 - 'r' for RNA
me@260 67
me@260 68 `codes` is a table of monomer codes
me@260 69 """
me@260 70 for type, code1, is_modified, code3, name in codes:
me@260 71 if type == type_letter:
me@260 72 cls(name, code1, code3, is_modified)
me@260 73
me@260 74 @classmethod
me@260 75 def from_code1(cls, code1):
me@260 76 """Return monomer type by one-letter code."""
me@260 77 return cls.by_code1[code1.upper()]
me@260 78
me@260 79 @classmethod
me@260 80 def from_code3(cls, code3):
me@260 81 """Return monomer type by three-letter code."""
me@260 82 return cls.by_code3[code3.upper()]
me@260 83
me@260 84 @classmethod
me@260 85 def from_name(cls, name):
me@260 86 """Return monomer type by name."""
me@260 87 return cls.by_name[name.capitalize()]
me@260 88
me@260 89 def instance(self):
me@260 90 """Create a new monomer of given type."""
me@260 91 return self.instance_type(self)
me@260 92
me@260 93 def __eq__(self, other):
me@260 94 if hasattr(other, "type"):
me@260 95 return self is other.type
me@260 96 return self is other
me@260 97
me@260 98 class Monomer(object):
me@260 99 """Monomer object.
me@260 100
me@260 101 attributes:
me@260 102
me@260 103 - `type`: type of monomer (a MonomerType object)
me@260 104
me@282 105 class attributes:
me@282 106
me@282 107 - `monomer_type`: either MonomerType or one of it's subclasses, it is used
me@282 108 when creating new monomers. It SHOULD be redefined when subclassing
me@282 109 Monomer.
me@260 110 """
me@260 111 monomer_type = MonomerType
me@260 112
me@260 113 def __init__(self, type):
me@260 114 self.type = type
me@260 115
me@260 116 @classmethod
me@260 117 def from_code1(cls, code1):
me@260 118 return cls(cls.monomer_type.by_code1[code1.upper()])
me@260 119
me@260 120 @classmethod
me@260 121 def from_code3(cls, code3):
me@260 122 return cls(cls.monomer_type.by_code3[code3.upper()])
me@260 123
me@260 124 @classmethod
me@260 125 def from_name(cls, name):
me@260 126 return cls(cls.monomer_type.by_name[name.capitalize()])
me@260 127
me@260 128 def __eq__(self, other):
me@260 129 if hasattr(other, "type"):
me@260 130 return self.type is other.type
me@260 131 return self.type is other
bnagaev@239 132
bnagaev@239 133 class Sequence(list):
me@274 134 """Sequence of Monomers.
bnagaev@243 135
me@274 136 This behaves like list of monomer objects. In addition to standard list
me@274 137 behaviour, Sequence has the following attributes:
me@270 138
me@274 139 * name -- str with the name of the sequence
me@274 140 * description -- str with description of the sequence
me@274 141 * source -- str denoting source of the sequence
me@266 142
me@274 143 Any of them may be empty (i.e. hold empty string)
me@275 144
me@275 145 Class attributes:
me@282 146
me@275 147 * monomer_type -- type of monomers in sequence, must be redefined when
me@275 148 subclassing
me@274 149 """
me@270 150
me@275 151 monomer_type = Monomer
me@270 152
me@275 153 name = ''
me@275 154 description = ''
me@275 155 source = ''
me@275 156
me@275 157 def __init__(self, sequence=[], name=None, description=None, source=None):
me@275 158 super(Sequence, self).__init__(sequence)
me@275 159 if hasattr(sequence, 'name'):
me@275 160 vars(self).update(vars(sequence))
me@275 161 if name:
me@275 162 self.name = name
me@275 163 if description:
me@275 164 self.description = description
me@275 165 if source:
me@275 166 self.source = source
me@270 167
me@262 168 def __str__(self):
me@275 169 """Returns sequence in one-letter code."""
me@275 170 return ''.join(monomer.code1 for monomer in self)
me@270 171
me@273 172 @classmethod
me@273 173 def from_string(cls, string, name='', description=''):
me@273 174 """Create sequences from string of one-letter codes."""
me@273 175 monomer = cls.monomer_type.from_code1
me@273 176 monomers = [monomer(letter) for letter in string]
me@273 177 return cls(monomers, name, description)
me@262 178
me@284 179 @classmethod
me@284 180 def from_fasta(cls, file):
me@284 181 """Read sequence from FASTA file.
me@286 182
me@284 183 File must contain exactly one sequence.
me@284 184 """
me@284 185 sequences = fasta.parse_file(file)
me@284 186 assert len(sequences) == 1
me@287 187 name, description = sequences.keys()[0]
me@284 188 return cls(sequences[header], name, description, file.name)
me@284 189
me@295 190 class Alignment(object):
me@295 191 """Alignment. It is a list of Columns."""
bnagaev@249 192
me@287 193 sequence_type = Sequence
me@289 194 """Type of sequences in alignment. SHOULD be redefined when subclassing."""
me@288 195
me@289 196 sequences = None
me@289 197 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
bnagaev@249 198
me@287 199 def __init__(self):
me@287 200 """Initialize empty alignment."""
me@287 201 super(Alignment, self).__init__()
me@287 202 self.sequences = []
me@295 203 self.columns = []
me@282 204
me@299 205 # Alignment modification methods
me@299 206 # ==============================
me@299 207
me@294 208 def append_sequence(self, sequence):
me@294 209 """Add sequence to alignment.
me@294 210
me@294 211 If sequence is too short, pad it with gaps on the right.
me@294 212 """
me@294 213 self.sequences.append(sequence)
me@294 214 for i, monomer in enumerate(sequence):
me@302 215 self.column_at(i)[sequence] = monomer
me@294 216
me@306 217 def append_gapped_line(self, line, name='', description='', source='',
me@306 218 gaps=default_gaps):
me@287 219 """Add row from a line of one-letter codes and gaps."""
me@287 220 Sequence = cls.sequence_type
me@306 221 not_gap = lambda (i, char): char not in gaps
me@306 222 without_gaps = util.remove_each(line, gaps)
me@306 223 sequence = Sequence(without_gaps, name, description, source)
me@303 224 # The following line has some simple magic:
me@303 225 # 1. attach natural numbers to monomers
me@303 226 # 2. delete gaps
me@303 227 # 3. attach numbers again
me@303 228 # This way we have a pair of numbers attached to monomer:
me@303 229 # - it's position in alignment (the first attached number, j)
me@303 230 # - it's position in sequence (the second attached number, i)
me@287 231 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):
me@302 232 self.column_at(j)[seq] = sequence[i]
me@287 233 self.sequences.append(sequence)
me@287 234
me@302 235 def column_at(self, n):
me@302 236 """Return column by index. Create required new columns if required.
me@302 237
me@302 238 Do NOT use this method, unless you are sure it is what you want.
me@302 239 """
me@302 240 for i in range(len(self.columns), n + 1):
me@302 241 self.columns.append(Column())
me@302 242 return self.columns[n]
me@302 243
me@299 244 # Alignment IO methods
me@299 245 # ====================
me@299 246
me@287 247 @classmethod
me@306 248 def from_fasta(cls, file, gaps=default_gaps):
me@287 249 """Create new alignment from FASTA file."""
me@287 250 self = cls()
me@287 251 for ((name, description), body) in fasta.parse_file(file):
me@306 252 self.append_gapped_line(body, name, description, gaps)
me@287 253 return self
bnagaev@249 254
me@292 255 def to_fasta(self, file):
me@292 256 """Write alignment in FASTA file as sequences with gaps."""
me@292 257 def char(monomer):
me@292 258 if monomer:
me@292 259 return monomer.code1
me@292 260 return "-"
me@292 261 for row in self.rows_as_lists():
me@292 262 seq = row.sequence
me@292 263 line = "".join(map(char, row))
me@292 264 fasta.save_file(file, line, seq.name, seq.description)
me@292 265
me@299 266 # Data access methods for alignment
me@299 267 # =================================
me@299 268
me@299 269 def rows(self):
me@299 270 """Return list of rows (temporary objects) in alignment.
me@299 271
me@299 272 Each row is a dictionary of { column : monomer }.
me@299 273
me@299 274 For gap positions there is no key for the column in row.
me@299 275
me@299 276 Each row has attribute `sequence` pointing to the sequence the row is
me@299 277 describing.
me@299 278
me@299 279 Modifications of row have no effect on the alignment.
me@299 280 """
me@299 281 # For now, the function returns a list rather than iterator.
me@299 282 # It is yet to see, whether memory performance here becomes critical,
me@299 283 # or is random access useful.
me@299 284 rows = []
me@299 285 for sequence in self.sequences:
me@299 286 row = util.UserDict()
me@299 287 row.sequence = sequence
me@299 288 for column in self.columns:
me@299 289 if sequence in column:
me@299 290 row[column] = column[sequence]
me@299 291 rows.append(row)
me@299 292 return rows
me@299 293
me@299 294 def rows_as_lists(self):
me@299 295 """Return list of rows (temporary objects) in alignment.
me@299 296
me@299 297 Each row here is a list of either monomer or None (for gaps).
me@299 298
me@299 299 Each row has attribute `sequence` pointing to the sequence of row.
me@299 300
me@299 301 Modifications of row have no effect on the alignment.
me@299 302 """
me@299 303 rows = []
me@299 304 for sequence in self.sequences:
me@299 305 row = util.UserList()
me@299 306 row.sequence = sequence
me@299 307 for column in self.columns:
me@299 308 row.append(column.get(sequence))
me@299 309 rows.append(row)
me@299 310 return rows
me@299 311
me@299 312 def columns_as_lists(self):
me@299 313 """Return list of columns (temorary objects) in alignment.
me@299 314
me@299 315 Each column here is a list of either monomer or None (for gaps).
me@299 316
me@299 317 Items of column are sorted in the same way as alignment.sequences.
me@299 318
me@299 319 Modifications of column have no effect on the alignment.
me@299 320 """
me@299 321 columns = []
me@299 322 for column in self.columns:
me@299 323 col = []
me@299 324 for sequence in self.sequences:
me@299 325 col.append(column.get(sequence))
me@299 326 columns.append(col)
me@299 327 return columns
me@299 328
me@300 329 class Column(dict):
me@300 330 """Column of alignment.
me@300 331
me@300 332 Column is a dict of { sequence : monomer }.
me@300 333
me@300 334 For sequences that have gaps in current row, given key is not present in
me@300 335 the column.
me@300 336 """
me@300 337 pass
me@300 338
me@307 339 class Block(object):
me@307 340 """Block of alignment.
me@301 341
me@307 342 Block is intersection of a set of columns & a set of rows. Most of blocks
me@307 343 look like rectangular part of alignment if you shuffle alignment rows the
me@307 344 right way.
me@261 345 """
me@270 346
me@307 347 alignment = None
me@307 348 """Alignment the block belongs to."""
me@270 349
me@307 350 sequences = ()
me@307 351 """List of sequences in block."""
me@307 352
me@307 353 columns = ()
me@307 354 """List of columns in block."""
me@307 355
me@307 356 def __init__(self, alignment, sequences=None, columns=None):
me@307 357 """Build new block from alignment.
me@307 358
me@307 359 If sequences are not given, the block uses all sequences in alignment.
me@307 360
me@307 361 If columns are not given, the block uses all columns in alignment.
me@307 362
me@307 363 In both cases we use exactly the list used in alignment, thus, if new
me@307 364 sequences or columns are added to alignment, the block tracks this too.
me@261 365 """
me@307 366 if sequences is None:
me@307 367 sequences = alignment.sequences
me@307 368 if colums is None:
me@307 369 columns = alignment.columns
me@261 370 self.alignment = alignment
me@261 371 self.sequences = sequences
me@261 372 self.positions = positions
me@270 373
me@307 374 ## Unclean code follows
me@307 375
me@261 376 def save_fasta(self, out_file, long_line=70, gap='-'):
me@270 377 """ Saves alignment to given file in fasta-format
me@270 378
me@261 379 No changes in the names, descriptions or order of the sequences
me@261 380 are made.
me@261 381 """
me@261 382 for sequence in self.sequences:
me@261 383 alignment_monomers = self.alignment.body[sequence]
me@261 384 block_monomers = [alignment_monomers[i] for i in self.positions]
me@261 385 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])
me@261 386 save_fasta(out_file, string, sequence.name, sequence.description, long_line)
me@270 387
me@270 388 def geometrical_cores(self, max_delta=config.delta,
me@270 389 timeout=config.timeout, minsize=config.minsize,
me@261 390 ac_new_atoms=config.ac_new_atoms,
me@261 391 ac_count=config.ac_count):
me@261 392 """ Returns length-sorted list of blocks, representing GCs
me@270 393
me@282 394 * max_delta -- threshold of distance spreading
me@282 395 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)
me@282 396 * minsize -- min size of each core
me@282 397 * ac_new_atoms -- min part or new atoms in new alternative core
me@282 398 current GC is compared with each of already selected GCs if
me@282 399 difference is less then ac_new_atoms, current GC is skipped
me@261 400 difference = part of new atoms in current core
me@282 401 * ac_count -- max number of cores (including main core)
me@261 402 -1 means infinity
me@282 403
me@261 404 If more than one pdb chain for some sequence provided, consider all of them
me@270 405 cost is calculated as 1 / (delta + 1)
me@282 406
me@261 407 delta in [0, +inf) => cost in (0, 1]
me@261 408 """
me@261 409 nodes = self.positions
me@261 410 lines = {}
me@261 411 for i in self.positions:
me@261 412 for j in self.positions:
me@261 413 if i < j:
me@261 414 distances = []
me@261 415 for sequence in self.sequences:
me@261 416 for chain in sequence.pdb_chains:
me@261 417 m1 = self.alignment.body[sequence][i]
me@261 418 m2 = self.alignment.body[sequence][j]
me@261 419 if m1 and m2:
me@261 420 r1 = sequence.pdb_residues[chain][m1]
me@261 421 r2 = sequence.pdb_residues[chain][m2]
me@261 422 ca1 = r1['CA']
me@261 423 ca2 = r2['CA']
me@261 424 d = ca1 - ca2 # Bio.PDB feature
me@261 425 distances.append(d)
me@261 426 if len(distances) >= 2:
me@261 427 delta = max(distances) - min(distances)
me@261 428 if delta <= max_delta:
me@261 429 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)
me@261 430 graph = Graph(nodes, lines)
me@261 431 cliques = graph.cliques(timeout=timeout, minsize=minsize)
me@261 432 GCs = []
me@261 433 for clique in cliques:
me@261 434 for GC in GCs:
me@261 435 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):
me@261 436 break
me@261 437 else:
me@261 438 GCs.append(Block(self.alignment, self.sequences, clique))
me@261 439 if ac_count != -1 and len(GCs) >= ac_count:
me@261 440 break
me@261 441 return GCs
me@270 442
me@261 443 def xstring(self, x='X', gap='-'):
me@261 444 """ Returns string consisting of gap chars and chars x at self.positions
me@270 445
me@261 446 Length of returning string = length of alignment
me@261 447 """
me@261 448 monomers = [False] * len(self.alignment)
me@261 449 for i in self.positions:
me@261 450 monomers[i] = True
me@261 451 return ''.join([x if m else gap for m in monomers])
me@270 452
me@261 453 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):
me@261 454 """ Save xstring and name in fasta format """
me@261 455 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)
me@270 456
me@261 457 def monomers(self, sequence):
me@261 458 """ Iterates monomers of this sequence from this block """
me@261 459 alignment_sequence = self.alignment.body[sequence]
me@261 460 return (alignment_sequence[i] for i in self.positions)
me@270 461
me@261 462 def ca_atoms(self, sequence, pdb_chain):
me@261 463 """ Iterates Ca-atom of monomers of this sequence from this block """
me@261 464 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())
me@270 465
me@261 466 def sequences_chains(self):
me@261 467 """ Iterates pairs (sequence, chain) """
me@261 468 for sequence in self.alignment.sequences:
me@261 469 if sequence in self.sequences:
me@261 470 for chain in sequence.pdb_chains:
me@261 471 yield (sequence, chain)
me@270 472
me@261 473 def superimpose(self):
me@261 474 """ Superimpose all pdb_chains in this block """
me@261 475 sequences_chains = list(self.sequences_chains())
me@261 476 if len(sequences_chains) >= 1:
me@261 477 sup = Superimposer()
me@261 478 fixed_sequence, fixed_chain = sequences_chains.pop()
me@261 479 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)
me@261 480 for sequence, chain in sequences_chains:
me@261 481 moving_atoms = self.ca_atoms(sequence, chain)
me@261 482 sup.set_atoms(fixed_atoms, moving_atoms)
me@261 483 # Apply rotation/translation to the moving atoms
me@261 484 sup.apply(moving_atoms)
me@270 485
me@261 486 def pdb_save(self, out_file):
me@270 487 """ Save all sequences
me@270 488
me@261 489 Returns {(sequence, chain): CHAIN}
me@261 490 CHAIN is chain letter in new file
me@261 491 """
me@261 492 tmp_file = NamedTemporaryFile(delete=False)
me@261 493 tmp_file.close()
me@270 494
me@261 495 for sequence, chain in self.sequences_chains():
me@261 496 sequence.pdb_save(tmp_file.name, chain)
me@261 497 # TODO: read from tmp_file.name
me@261 498 # change CHAIN
me@261 499 # add to out_file
me@270 500
me@261 501 os.unlink(NamedTemporaryFile)
bnagaev@239 502
me@260 503 # vim: set ts=4 sts=4 sw=4 et: