Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/0bd118c2d72b/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:42:23 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 424:0bd118c2d72b

msf io: fix bugs * convert msf to temp fasta before each read and write * convert temp fasta to msf after each write
author boris <bnagaev@gmail.com>
date Mon, 14 Feb 2011 16:19:42 +0300
parents 42a05616a09b
children df68a5bc96fc
rev   line source
me@261 1 import sys
bnagaev@357 2 import re
me@261 3
me@315 4 import util
bnagaev@418 5 import fileio
me@260 6
dendik@382 7 # import this very module as means of having all related classes in one place
dendik@382 8 import base
dendik@382 9
me@306 10 default_gaps = set((".", "-", "~"))
me@306 11 """Set of characters to recoginze as gaps when parsing alignment."""
me@306 12
me@328 13 class Monomer(object):
me@328 14 """Monomer object."""
me@260 15
me@328 16 type = None
me@328 17 """Either of 'dna', 'rna', 'protein'."""
me@260 18
dendik@382 19 types = base
dendik@382 20 """Mapping of related types. SHOULD be redefined in subclasses."""
dendik@382 21
me@260 22 by_code1 = {}
me@328 23 """A mapping from 1-letter code to Monomer subclass."""
me@328 24
me@260 25 by_code3 = {}
me@328 26 """A mapping from 3-letter code to Monomer subclass."""
me@328 27
me@260 28 by_name = {}
me@328 29 """A mapping from full monomer name to Monomer subclass."""
me@260 30
me@260 31 @classmethod
me@328 32 def _subclass(cls, name='', code1='', code3='', is_modified=False):
me@328 33 """Create new subclass of Monomer for given monomer type."""
me@328 34 class TheMonomer(cls):
me@328 35 pass
me@328 36 name = name.strip().capitalize()
me@328 37 code1 = code1.upper()
me@328 38 code3 = code3.upper()
bnagaev@357 39 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
me@328 40 TheMonomer.name = name
me@328 41 TheMonomer.code1 = code1
me@328 42 TheMonomer.code3 = code3
me@328 43 TheMonomer.is_modified = is_modified
me@328 44 if not is_modified:
me@328 45 cls.by_code1[code1] = TheMonomer
me@328 46 cls.by_code3[code3] = TheMonomer
me@328 47 cls.by_name[name] = TheMonomer
me@328 48 # We duplicate distinguished long names into Monomer itself, so that we
me@328 49 # can use Monomer.from_code3 to create the relevant type of monomer.
me@328 50 Monomer.by_code3[code3] = TheMonomer
me@328 51 Monomer.by_name[name] = TheMonomer
me@260 52
me@328 53 @classmethod
me@353 54 def _initialize(cls, codes=None):
me@328 55 """Create all relevant subclasses of Monomer."""
bnagaev@378 56 for code1, is_modified, code3, name in codes:
bnagaev@378 57 cls._subclass(name, code1, code3, is_modified)
me@260 58
me@260 59 @classmethod
me@260 60 def from_code1(cls, code1):
me@328 61 """Create new monomer from 1-letter code."""
me@328 62 return cls.by_code1[code1.upper()]()
me@260 63
me@260 64 @classmethod
me@260 65 def from_code3(cls, code3):
me@328 66 """Create new monomer from 3-letter code."""
me@328 67 return cls.by_code3[code3.upper()]()
me@260 68
me@260 69 @classmethod
me@260 70 def from_name(cls, name):
me@328 71 """Create new monomer from full name."""
me@328 72 return cls.by_name[name.strip().capitalize()]()
me@260 73
me@329 74 def __repr__(self):
me@329 75 return '<Monomer %s>' % self.code3
me@329 76
me@329 77 def __str__(self):
me@329 78 """Returns one-letter code"""
me@329 79 return self.code1
me@329 80
me@260 81 def __eq__(self, other):
me@328 82 """Monomers within same monomer type are compared by code1."""
me@328 83 assert self.type == other.type
me@328 84 return self.code1 == other.code1
bnagaev@239 85
bnagaev@422 86 def __ne__(self, other):
bnagaev@422 87 return not (self == other)
bnagaev@422 88
bnagaev@239 89 class Sequence(list):
me@274 90 """Sequence of Monomers.
bnagaev@243 91
me@274 92 This behaves like list of monomer objects. In addition to standard list
me@274 93 behaviour, Sequence has the following attributes:
me@270 94
me@274 95 * name -- str with the name of the sequence
me@274 96 * description -- str with description of the sequence
me@274 97 * source -- str denoting source of the sequence
me@266 98
me@274 99 Any of them may be empty (i.e. hold empty string)
me@274 100 """
me@270 101
dendik@382 102 types = base
dendik@382 103 """Mapping of related types. SHOULD be redefined in subclasses."""
me@270 104
me@275 105 name = ''
me@275 106 description = ''
me@275 107 source = ''
me@275 108
me@347 109 @classmethod
me@347 110 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
me@347 111 """Create sequence from a list of monomer objecst."""
bnagaev@378 112 result = cls(monomers)
me@275 113 if name:
me@347 114 result.name = name
me@275 115 if description:
me@347 116 result.description = description
me@275 117 if source:
me@347 118 result.source = source
me@347 119 return result
me@347 120
me@347 121 @classmethod
me@347 122 def from_string(cls, string, name='', description='', source=''):
me@347 123 """Create sequences from string of one-letter codes."""
dendik@382 124 monomer = cls.types.Monomer.from_code1
me@347 125 monomers = [monomer(letter) for letter in string]
me@347 126 return cls.from_monomers(monomers, name, description, source)
me@270 127
me@329 128 def __repr__(self):
me@329 129 return '<Sequence %s>' % str(self)
me@329 130
me@262 131 def __str__(self):
me@329 132 """Returns sequence of one-letter codes."""
me@275 133 return ''.join(monomer.code1 for monomer in self)
me@270 134
me@316 135 def __hash__(self):
me@316 136 """Hash sequence by identity."""
me@316 137 return id(self)
me@316 138
me@295 139 class Alignment(object):
me@295 140 """Alignment. It is a list of Columns."""
bnagaev@249 141
dendik@382 142 types = base
dendik@382 143 """Mapping of related types. SHOULD be redefined in subclasses."""
me@288 144
me@289 145 sequences = None
me@289 146 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
bnagaev@249 147
me@287 148 def __init__(self):
me@287 149 """Initialize empty alignment."""
me@287 150 self.sequences = []
me@295 151 self.columns = []
me@282 152
me@362 153 # Alignment grow & IO methods
me@299 154 # ==============================
me@299 155
me@294 156 def append_sequence(self, sequence):
me@365 157 """Add sequence to alignment. Return self.
me@294 158
me@294 159 If sequence is too short, pad it with gaps on the right.
me@294 160 """
me@294 161 self.sequences.append(sequence)
dendik@388 162 self._pad_to_width(len(sequence))
dendik@388 163 for column, monomer in zip(self.columns, sequence):
dendik@388 164 column[sequence] = monomer
me@365 165 return self
me@294 166
me@364 167 def append_row_from_string(self, string,
me@364 168 name='', description='', source='', gaps=default_gaps):
me@364 169 """Add row from a string of one-letter codes and gaps. Return self."""
dendik@382 170 Sequence = self.types.Sequence
me@349 171 without_gaps = util.remove_each(string, gaps)
me@321 172 sequence = Sequence.from_string(without_gaps, name, description, source)
dendik@388 173 self._pad_to_width(len(string))
dendik@388 174 non_gap_columns = [column
dendik@390 175 for column, char in zip(self.columns, string)
dendik@388 176 if char not in gaps
dendik@388 177 ]
dendik@388 178 for monomer, column in zip(sequence, non_gap_columns):
dendik@388 179 column[sequence] = monomer
me@287 180 self.sequences.append(sequence)
me@364 181 return self
me@287 182
dendik@388 183 def _pad_to_width(self, n):
dendik@388 184 """Pad alignment with empty columns on the right to width n."""
dendik@388 185 for i in range(len(self.columns), n):
me@302 186 self.columns.append(Column())
me@302 187
me@362 188 def append_file(self, file, format='fasta', gaps=default_gaps):
me@365 189 """Append sequences from file to alignment. Return self.
me@299 190
me@362 191 If sequences in file have gaps (detected as characters belonging to
me@362 192 `gaps` set), treat them accordingly.
me@362 193 """
bnagaev@418 194 sequences = []
bnagaev@418 195 if format == 'fasta':
bnagaev@418 196 sequences = fileio.FastaIo(file).get_all_strings()
bnagaev@418 197 elif format == 'msf':
bnagaev@418 198 sequences = fileio.MsfIo(file).get_all_strings()
bnagaev@418 199 else:
bnagaev@418 200 raise Exception("We don't support other formats yet")
bnagaev@418 201 for (name, description, body) in sequences:
bnagaev@378 202 self.append_row_from_string(body, name, description, file.name, gaps)
me@287 203 return self
bnagaev@249 204
bnagaev@418 205 def to_file(self, file, format='fasta', gap='-'):
me@292 206 """Write alignment in FASTA file as sequences with gaps."""
me@367 207 assert format == "fasta", "We don't support other formats yet"
me@292 208 def char(monomer):
me@292 209 if monomer:
me@292 210 return monomer.code1
bnagaev@418 211 return gap
bnagaev@418 212 if format == 'fasta':
bnagaev@418 213 io = fileio.FastaIo(file)
bnagaev@418 214 elif format == 'msf':
bnagaev@418 215 io = fileio.MsfIo(file)
bnagaev@418 216 else:
bnagaev@418 217 raise Exception("We don't support other formats yet")
me@292 218 for row in self.rows_as_lists():
me@292 219 seq = row.sequence
me@292 220 line = "".join(map(char, row))
bnagaev@418 221 io.save_string(line, seq.name, seq.description)
me@292 222
me@299 223 # Data access methods for alignment
me@299 224 # =================================
me@299 225
me@299 226 def rows(self):
me@299 227 """Return list of rows (temporary objects) in alignment.
me@299 228
me@299 229 Each row is a dictionary of { column : monomer }.
me@363 230
me@299 231 For gap positions there is no key for the column in row.
me@299 232
me@299 233 Each row has attribute `sequence` pointing to the sequence the row is
me@299 234 describing.
me@299 235
me@299 236 Modifications of row have no effect on the alignment.
me@299 237 """
me@299 238 # For now, the function returns a list rather than iterator.
me@299 239 # It is yet to see, whether memory performance here becomes critical,
me@299 240 # or is random access useful.
me@299 241 rows = []
me@299 242 for sequence in self.sequences:
me@299 243 row = util.UserDict()
me@299 244 row.sequence = sequence
me@299 245 for column in self.columns:
me@299 246 if sequence in column:
me@299 247 row[column] = column[sequence]
me@299 248 rows.append(row)
me@299 249 return rows
me@299 250
me@299 251 def rows_as_lists(self):
me@299 252 """Return list of rows (temporary objects) in alignment.
me@299 253
me@299 254 Each row here is a list of either monomer or None (for gaps).
me@299 255
me@299 256 Each row has attribute `sequence` pointing to the sequence of row.
me@299 257
me@299 258 Modifications of row have no effect on the alignment.
me@299 259 """
me@299 260 rows = []
me@299 261 for sequence in self.sequences:
me@299 262 row = util.UserList()
me@299 263 row.sequence = sequence
me@299 264 for column in self.columns:
me@299 265 row.append(column.get(sequence))
me@299 266 rows.append(row)
me@299 267 return rows
me@299 268
me@299 269 def columns_as_lists(self):
me@299 270 """Return list of columns (temorary objects) in alignment.
me@299 271
me@299 272 Each column here is a list of either monomer or None (for gaps).
me@299 273
me@299 274 Items of column are sorted in the same way as alignment.sequences.
me@299 275
me@299 276 Modifications of column have no effect on the alignment.
me@299 277 """
me@299 278 columns = []
me@299 279 for column in self.columns:
me@299 280 col = []
me@299 281 for sequence in self.sequences:
me@299 282 col.append(column.get(sequence))
me@299 283 columns.append(col)
me@299 284 return columns
me@299 285
me@368 286 # Alignment / Block editing methods
me@368 287 # =================================
me@368 288
me@368 289 def _flush_row(self, row, whence='left'):
me@368 290 """Helper for `flush`: flush to one side all monomers in one row."""
me@368 291 row = filter(None, row)
me@368 292 padding = [None] * len(self.columns)
me@368 293 if whence == 'left':
me@368 294 return row + padding
me@368 295 if whence == 'right':
me@368 296 return padding + row
me@368 297 if whence == 'center':
me@368 298 pad_len = (len(self.columns) - len(row)) // 2
me@368 299 # vvv fix padding for case when length is odd: better have more
me@368 300 pad_len += len(self.columns) - 2 * pad_len
me@368 301 padding = [None] * pad_len
me@368 302 return padding + row + padding
me@368 303 assert True, "whence must be either 'left' or 'right' or 'center'"
me@368 304
me@368 305 def flush(self, whence='left'):
me@368 306 """Remove all gaps from alignment and flush results to one side.
me@368 307
me@368 308 `whence` must be one of 'left', 'right' or 'center'
me@368 309 """
me@368 310 for row in self.rows_as_lists():
me@368 311 sequence = row.sequence
me@368 312 row = self._flush_row(row, whence)
me@368 313 for monomer, column in zip(row, self.columns):
me@368 314 if monomer:
me@368 315 column[sequence] = monomer
me@368 316 elif sequence in column:
me@368 317 del column[sequence]
me@368 318
me@369 319 def remove_gap_columns(self):
me@369 320 """Remove all empty columns."""
me@369 321 for n, column in reversed(enumerate(self.columns)):
me@369 322 if column == {}:
me@369 323 self.columns[n:n+1] = []
me@369 324
me@371 325 def _wipe(self):
me@371 326 """Make all positions gaps (but keep sequences intact)."""
me@371 327 for column in self.columns:
dendik@412 328 for sequence in list(column):
me@371 329 del column[sequence]
me@371 330
me@372 331 def _merge(self, dst, new, merge):
me@373 332 """Replace contents of `dst` with those of `new`.
me@372 333
me@372 334 Replace contents of elements using function `merge(dst_el, new_le)`.
me@372 335 """
bnagaev@384 336 for el, new_el in zip(dst, new):
bnagaev@384 337 merge(el, new_el)
me@372 338 dst[len(dst):] = new[len(dst):]
me@372 339 del dst[len(new):]
me@371 340
me@373 341 def _replace_sequence_contents(self, new, copy_descriptions):
me@373 342 """Replace contents of sequences with those of `new` alignment."""
me@371 343 # XXX: we manually copy sequence contents here
me@372 344 # XXX: we only copy, overlapping parts and link to the rest
me@372 345 def merge_monomers(dst, new):
me@372 346 dst.__class__ = new.__class__
me@372 347 def merge_sequences(dst, new):
me@373 348 if copy_descriptions:
me@373 349 vars(dst).update(vars(new))
me@372 350 self._merge(dst, new, merge_monomers)
me@372 351 self._merge(self.sequences, new.sequences, merge_sequences)
me@371 352
me@371 353 def _replace_column_contents(self, new):
me@373 354 """Replace column contents with those of `new` alignment.
me@371 355
me@373 356 Synonym: copy gap patterns from `new` to `self`.
me@372 357
me@373 358 `self.sequences` and `new.sequences` should have the same contents.
me@371 359 """
me@371 360 self._wipe()
me@371 361 not_gap = lambda (a,b): a != None
me@371 362 for sequence, new_row in zip(self.sequences, new.rows_as_lists()):
me@371 363 assert len(sequence) == len(new_row.sequence)
dendik@388 364 non_gap_columns = [column
dendik@388 365 for column, monomer in zip(self.columns, new_row)
dendik@388 366 if monomer
dendik@388 367 ]
dendik@388 368 for monomer, column in zip(sequence, non_gap_columns):
dendik@388 369 column[sequence] = monomer
me@371 370
me@373 371 def _replace_contents(self, new, copy_descriptions, copy_contents):
me@371 372 """Replace alignment contents with those of other alignment."""
me@373 373 if copy_contents:
me@373 374 self._replace_sequence_contents(new, copy_descriptions)
bnagaev@378 375 self._replace_column_contents(new)
me@371 376
me@373 377 def process(self, function, copy_descriptions=True, copy_contents=True):
me@371 378 """Apply function to the alignment (or block); inject results back.
me@371 379
me@373 380 - `function(block)` must return block with same line order.
me@373 381 - if `copy_descriptions` is False, ignore new sequence names.
me@373 382 - if `copy_contents` is False, don't copy sequence contents too.
dendik@380 383
dendik@380 384 `function` (object) may have attributes `copy_descriptions` and
dendik@380 385 `copy_contents`, which override the same named arguments.
me@371 386 """
me@371 387 new = function(self)
dendik@380 388 if hasattr(function, 'copy_descriptions'):
dendik@380 389 copy_descriptions = function.copy_descriptions
dendik@380 390 if hasattr(function, 'copy_contents'):
dendik@380 391 copy_contents = function.copy_contents
me@373 392 self._replace_contents(new, copy_descriptions, copy_contents)
me@371 393
me@300 394 class Column(dict):
me@300 395 """Column of alignment.
me@300 396
me@300 397 Column is a dict of { sequence : monomer }.
me@300 398
me@300 399 For sequences that have gaps in current row, given key is not present in
me@300 400 the column.
me@300 401 """
me@325 402
dendik@382 403 types = base
dendik@382 404 """Mapping of related types. SHOULD be redefined in subclasses."""
dendik@382 405
me@325 406 def __hash__(self):
me@325 407 """Return hash by identity."""
me@325 408 return id(self)
me@300 409
me@317 410 class Block(Alignment):
me@307 411 """Block of alignment.
me@301 412
dendik@415 413 Block is an intersection of several rows & columns. (The collections of
dendik@415 414 rows and columns are represented as ordered lists, to retain display order
dendik@415 415 of Alignment or add ability to tweak it). Most of blocks look like
dendik@415 416 rectangular part of alignment if you shuffle alignment rows the right way.
me@261 417 """
me@270 418
me@307 419 alignment = None
me@307 420 """Alignment the block belongs to."""
me@270 421
me@307 422 sequences = ()
me@307 423 """List of sequences in block."""
me@307 424
me@307 425 columns = ()
me@307 426 """List of columns in block."""
me@307 427
me@317 428 @classmethod
me@317 429 def from_alignment(cls, alignment, sequences=None, columns=None):
me@307 430 """Build new block from alignment.
me@307 431
me@307 432 If sequences are not given, the block uses all sequences in alignment.
me@307 433
me@307 434 If columns are not given, the block uses all columns in alignment.
me@307 435
me@307 436 In both cases we use exactly the list used in alignment, thus, if new
me@307 437 sequences or columns are added to alignment, the block tracks this too.
me@261 438 """
me@307 439 if sequences is None:
me@307 440 sequences = alignment.sequences
me@318 441 if columns is None:
me@307 442 columns = alignment.columns
me@320 443 block = cls()
me@320 444 block.alignment = alignment
me@320 445 block.sequences = sequences
me@320 446 block.columns = columns
me@320 447 return block
me@270 448
me@260 449 # vim: set ts=4 sts=4 sw=4 et: