Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/15633bca9c90/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 21:58:38 2013
Кодировка:
allpy: 15633bca9c90 allpy/base.py

allpy

view allpy/base.py @ 748:15633bca9c90

extract_pfam: Added sequence case markup. (closes #76)
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 11 Jul 2011 14:36:57 +0400
parents 0a7917b4ca71
children fbf79117dd18
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
11 default_gaps = set((".", "-", "~"))
12 """Set of characters to recoginze as gaps when parsing alignment."""
14 class Monomer(object):
15 """Monomer object."""
17 type = None
18 """Either of 'dna', 'rna', 'protein'."""
20 types = base
21 """Mapping of related types. SHOULD be redefined in subclasses."""
23 by_code1 = {}
24 """A mapping from 1-letter code to Monomer subclass."""
26 by_code3 = {}
27 """A mapping from 3-letter code to Monomer subclass."""
29 by_name = {}
30 """A mapping from full monomer name to Monomer subclass."""
32 @classmethod
33 def _subclass(cls, name='', code1='', code3='', is_modified=False):
34 """Create new subclass of Monomer for given monomer type."""
35 class TheMonomer(cls):
36 pass
37 name = name.strip().capitalize()
38 code1 = code1.upper()
39 code3 = code3.upper()
40 module = vars(data.monomers)[cls.type]
41 TheMonomer.__name__ = re.sub(r"\W", "_", name)
42 TheMonomer.__module__ = module.__name__
43 TheMonomer.name = name
44 TheMonomer.code1 = code1
45 TheMonomer.code3 = code3
46 TheMonomer.is_modified = is_modified
47 # Save the class in data.monomers so that it can be pickled
48 # Some names are not unique, we append underscores to them
49 # in order to fix it.
50 while TheMonomer.__name__ in vars(module):
51 TheMonomer.__name__ += "_"
52 vars(module)[TheMonomer.__name__] = TheMonomer
53 if not is_modified:
54 cls.by_code1[code1] = TheMonomer
55 cls.by_code3[code3] = TheMonomer
56 cls.by_name[name] = TheMonomer
57 # We duplicate distinguished long names into Monomer itself, so that we
58 # can use Monomer.from_code3 to create the relevant type of monomer.
59 Monomer.by_code3[code3] = TheMonomer
60 Monomer.by_name[name] = TheMonomer
62 @classmethod
63 def _initialize(cls, codes=None):
64 """Create all relevant subclasses of Monomer."""
65 for code1, is_modified, code3, name in codes:
66 cls._subclass(name, code1, code3, is_modified)
68 @classmethod
69 def from_code1(cls, code1):
70 """Create new monomer from 1-letter code."""
71 monomer = cls.by_code1[code1.upper()]()
72 monomer.input_code1 = code1
73 return monomer
75 @classmethod
76 def from_code3(cls, code3):
77 """Create new monomer from 3-letter code."""
78 return cls.by_code3[code3.upper()]()
80 @classmethod
81 def from_name(cls, name):
82 """Create new monomer from full name."""
83 return cls.by_name[name.strip().capitalize()]()
85 def __repr__(self):
86 return "<Monomer %s>" % str(self.code1)
88 def __str__(self):
89 """Returns one-letter code"""
90 return self.code1
92 def __eq__(self, other):
93 """Monomers within same monomer type are compared by code1."""
94 if not other:
95 return False
96 assert self.type == other.type
97 return self.code1 == other.code1
99 def __ne__(self, other):
100 return not (self == other)
102 class Sequence(list):
103 """Sequence of Monomers.
105 This behaves like list of monomer objects. In addition to standard list
106 behaviour, Sequence has the following attributes:
108 * name -- str with the name of the sequence
109 * description -- str with description of the sequence
110 * source -- str denoting source of the sequence
112 Any of them may be empty (i.e. hold empty string)
113 """
115 types = base
116 """Mapping of related types. SHOULD be redefined in subclasses."""
118 name = ''
119 description = ''
120 source = ''
122 def __init__(self, *args):
123 self.markups = {}
124 list.__init__(self, *args)
126 @classmethod
127 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
128 """Create sequence from a list of monomer objecst."""
129 result = cls(monomers)
130 if name:
131 result.name = name
132 if description:
133 result.description = description
134 if source:
135 result.source = source
136 return result
138 @classmethod
139 def from_string(cls, string, name='', description='', source=''):
140 """Create sequences from string of one-letter codes."""
141 monomer = cls.types.Monomer.from_code1
142 monomers = [monomer(letter) for letter in string]
143 return cls.from_monomers(monomers, name, description, source)
145 def __repr__(self):
146 if self.name:
147 return '<Sequence %s>' % str(self.name)
148 else:
149 return '<Sequence %s>' % str(self)
152 def __str__(self):
153 """Returns sequence of one-letter codes."""
154 return ''.join(monomer.code1 for monomer in self)
156 def __hash__(self):
157 """Hash sequence by identity."""
158 return id(self)
161 class Alignment(object):
162 """Alignment. It is a list of Columns."""
164 types = base
165 """Mapping of related types. SHOULD be redefined in subclasses."""
167 sequences = None
168 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
170 def __init__(self):
171 """Initialize empty alignment."""
172 self.sequences = []
173 self.columns = []
174 self.markups = {}
176 # Alignment grow & IO methods
177 # ==============================
179 def append_sequence(self, sequence):
180 """Add sequence to alignment. Return self.
182 If sequence is too short, pad it with gaps on the right.
183 """
184 self.sequences.append(sequence)
185 self._pad_to_width(len(sequence))
186 for column, monomer in zip(self.columns, sequence):
187 column[sequence] = monomer
188 return self
190 def append_row_from_string(self, string,
191 name='', description='', source='', gaps=default_gaps):
192 """Add row from a string of one-letter codes and gaps. Return self."""
193 Sequence = self.types.Sequence
194 without_gaps = util.remove_each(string, gaps)
195 sequence = Sequence.from_string(without_gaps, name, description, source)
196 self._pad_to_width(len(string))
197 non_gap_columns = [column
198 for column, char in zip(self.columns, string)
199 if char not in gaps
201 for monomer, column in zip(sequence, non_gap_columns):
202 column[sequence] = monomer
203 self.sequences.append(sequence)
204 return self
206 def append_row_with_gaps(self, row, sequence):
207 """Add row from row_as_list representation and sequence. Return self."""
208 self.sequences.append(sequence)
209 self._pad_to_width(len(row))
210 for column, monomer in zip(self.columns, row):
211 if monomer:
212 column[sequence] = monomer
213 return self
215 def _pad_to_width(self, n):
216 """Pad alignment with empty columns on the right to width n."""
217 for i in range(len(self.columns), n):
218 self.columns.append(Column())
220 def append_file(self, file, format='fasta', gaps=default_gaps):
221 """Append sequences from file to alignment. Return self.
223 If sequences in file have gaps (detected as characters belonging to
224 `gaps` set), treat them accordingly.
225 """
226 fileio.File(file, format, gaps=gaps).read_alignment(self)
227 return self
229 def to_file(self, file, format='fasta', gap='-'):
230 """Write alignment in FASTA file as sequences with gaps."""
231 fileio.File(file, format, gaps=gap).write_alignment(self)
232 return self
234 # Data access methods for alignment
235 # =================================
237 def rows(self):
238 """Return list of rows (temporary objects) in alignment.
240 Each row is a dictionary of { column : monomer }.
242 For gap positions there is no key for the column in row.
244 Each row has attribute `sequence` pointing to the sequence the row is
245 describing.
247 Modifications of row have no effect on the alignment.
248 """
249 # For now, the function returns a list rather than iterator.
250 # It is yet to see, whether memory performance here becomes critical,
251 # or is random access useful.
252 rows = []
253 for sequence in self.sequences:
254 row = util.UserDict()
255 row.sequence = sequence
256 for column in self.columns:
257 if sequence in column:
258 row[column] = column[sequence]
259 rows.append(row)
260 return rows
262 def rows_as_lists(self):
263 """Return list of rows (temporary objects) in alignment.
265 Each row here is a list of either monomer or None (for gaps).
267 Each row has attribute `sequence` pointing to the sequence of row.
269 Modifications of row have no effect on the alignment.
270 """
271 rows = []
272 for sequence in self.sequences:
273 row = util.UserList()
274 row.sequence = sequence
275 for column in self.columns:
276 row.append(column.get(sequence))
277 rows.append(row)
278 return rows
280 def rows_as_strings(self, gap='-'):
281 """Return list of string representation of rows in alignment.
283 Each row has attribute `sequence` pointing to the sequence of row.
285 `gap` is the symbol to use for gap.
286 """
287 rows = []
288 for sequence in self.sequences:
289 string = ""
290 for column in self.columns:
291 if sequence in column:
292 string += column[sequence].code1
293 else:
294 string += gap
295 string = util.UserString(string)
296 string.sequence = sequence
297 rows.append(string)
298 return rows
300 def row_as_list(self, sequence):
301 """Return representaion of row as list with `Monomers` and `None`s."""
302 return [column.get(sequence) for column in self.columns]
304 def row_as_string(self, sequence, gap='-'):
305 """Return string representaion of row in alignment.
307 String will have gaps represented by `gap` symbol (defaults to '-').
308 """
309 def char(monomer):
310 if monomer:
311 return monomer.code1
312 return gap
313 row = self.row_as_list(sequence)
314 return "".join(map(char, row))
316 def columns_as_lists(self):
317 """Return list of columns (temorary objects) in alignment.
319 Each column here is a list of either monomer or None (for gaps).
321 Items of column are sorted in the same way as alignment.sequences.
323 Modifications of column have no effect on the alignment.
324 """
325 columns = []
326 for column in self.columns:
327 col = util.UserList()
328 col.column = column
329 for sequence in self.sequences:
330 col.append(column.get(sequence))
331 columns.append(col)
332 return columns
334 # Alignment / Block editing methods
335 # =================================
337 def flush(self, whence='left'):
338 """Remove all gaps from alignment and flush results to one side.
340 `whence` must be one of 'left', 'right' or 'center'
341 """
342 if whence == 'left':
343 from processors import Left as Flush
344 elif whence == 'right':
345 from processors import Right as Flush
346 elif whence == 'center':
347 from processors import Center as Flush
348 else:
349 raise AssertionError, "Whence must be left, right or center"
350 self.realign(Flush())
352 def remove_gap_columns(self):
353 """Remove all empty columns."""
354 for n, column in reversed(list(enumerate(self.columns))):
355 if column == {}:
356 self.columns[n:n+1] = []
358 def _wipe_row(self, sequence):
359 """Turn all row positions into gaps (but keep sequences intact)."""
360 for column in self.columns:
361 if sequence in column:
362 del column[sequence]
364 def _merge(self, dst, new, merge):
365 """Replace contents of `dst` with those of `new`.
367 Replace contents of elements using function `merge(dst_el, new_le)`.
368 """
369 for el, new_el in zip(dst, new):
370 merge(el, new_el)
371 dst[len(dst):] = new[len(dst):]
372 del dst[len(new):]
374 def _replace_sequence_contents(self, new, copy_descriptions):
375 """Replace contents of sequences with those of `new` alignment."""
376 # XXX: we manually copy sequence contents here
377 # XXX: we only copy, overlapping parts and link to the rest
378 def merge_monomers(dst, new):
379 dst.__class__ = new.__class__
380 def merge_sequences(dst, new):
381 if copy_descriptions:
382 vars(dst).update(vars(new))
383 self._merge(dst, new, merge_monomers)
384 self._merge(self.sequences, new.sequences, merge_sequences)
386 def _replace_column_contents(self, new):
387 """Replace column contents with those of `new` alignment.
389 In other words: copy gap patterns from `new` to `self`.
391 `self.sequences` and `new.sequences` should have the same contents.
392 """
393 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
394 sequence = row.sequence
395 monomers = filter(None, row)
396 assert len(monomers) == len(filter(None, new_row))
397 self._wipe_row(sequence)
398 non_gap_columns = [column
399 for column, monomer in zip(self.columns, new_row)
400 if monomer
402 for monomer, column in zip(monomers, non_gap_columns):
403 column[sequence] = monomer
405 def _replace_contents(self, new, copy_descriptions, copy_contents):
406 """Replace alignment contents with those of other alignment."""
407 if copy_contents:
408 self._replace_sequence_contents(new, copy_descriptions)
409 self._replace_column_contents(new)
411 def process(self, function, copy_descriptions=True, copy_contents=True):
412 """Apply function to the alignment (or block); inject results back.
414 - `function(block)` must return block with same line order.
415 - if `copy_descriptions` is False, ignore new sequence names.
416 - if `copy_contents` is False, don't copy sequence contents too.
418 `function` (object) may have attributes `copy_descriptions` and
419 `copy_contents`, which override the same named arguments.
420 """
421 new = function(self)
422 if hasattr(function, 'copy_descriptions'):
423 copy_descriptions = function.copy_descriptions
424 if hasattr(function, 'copy_contents'):
425 copy_contents = function.copy_contents
426 self._replace_contents(new, copy_descriptions, copy_contents)
428 def realign(self, function):
429 """Realign self.
431 I.e.: apply function to self to produce a new alignment, then update
432 self to have the same gap patterns as the new alignment.
434 This is the same as process(function, False, False)
435 """
436 new = function(self)
437 self._replace_column_contents(new)
439 class Column(dict):
440 """Column of alignment.
442 Column is a dict of { sequence : monomer }.
444 For sequences that have gaps in current row, given key is not present in
445 the column.
446 """
448 types = base
449 """Mapping of related types. SHOULD be redefined in subclasses."""
451 def __hash__(self):
452 """Return hash by identity."""
453 return id(self)
456 class Block(Alignment):
457 """Block of alignment.
459 Block is an intersection of several rows & columns. (The collections of
460 rows and columns are represented as ordered lists, to retain display order
461 of Alignment or add ability to tweak it). Most of blocks look like
462 rectangular part of alignment if you shuffle alignment rows the right way.
463 """
465 alignment = None
466 """Alignment the block belongs to."""
468 sequences = ()
469 """List of sequences in block."""
471 columns = ()
472 """List of columns in block."""
474 @classmethod
475 def from_alignment(cls, alignment, sequences=None, columns=None):
476 """Build new block from alignment.
478 If sequences are not given, the block uses all sequences in alignment.
480 If columns are not given, the block uses all columns in alignment.
482 In both cases we use exactly the list used in alignment, thus, if new
483 sequences or columns are added to alignment, the block tracks this too.
484 """
485 if sequences is None:
486 sequences = alignment.sequences
487 if columns is None:
488 columns = alignment.columns
489 block = cls()
490 block.alignment = alignment
491 block.sequences = sequences
492 block.columns = columns
493 return block
495 class Markup(object):
496 """Base class for sequence and alignment markups.
498 We shall call either sequence or alignment a container. And we shall call
499 either monomers or columns elements respectively.
501 Markup behaves like a dictionary of [element] -> value.
503 Every container has a dictionary of [name] -> markup. It is Markup's
504 responsibility to add itself to this dictionary and to avoid collisions
505 while doing it.
506 """
508 name = None
509 """Name of markup elements"""
511 def _register(self, container, name):
512 """Register self within container.
514 Assure the name is not taken before. If name is not given, look in the
515 class. Make sure we have some name at all.
516 """
517 if name:
518 self.name = name
519 assert self.name is not None
520 assert self.name not in container.markups
521 container.markups[self.name] = self
523 def refresh(self):
524 """Recalculate markup values (if they are generated automatically)."""
525 pass
527 @classmethod
528 def from_record(cls, container, record, name=None):
529 """Restore markup from `record`. (Used for loading from file).
531 `record` is a dict of all metadata and data related to one markup. All
532 keys and values in `record` are strings, markup must parse them itself.
534 Markup values should be stored in `record['markup']`, which is a list
535 of items separated with either `record['separator']` or a comma.
536 """
537 return cls(container, name)
539 def to_record(self):
540 """Save markup to `record`, for saving to file.
542 For description of `record` see docstring for `from_record` method.
543 """
544 return {}
546 def sorted_keys(self):
547 """Return list of elements in the container in proper order."""
548 raise NotImplementedError()
550 def sorted_values(self):
551 """Return list of markup values in container."""
552 raise NotImplementedError()
554 class SequenceMarkup(Markup):
555 """Markup for sequence.
557 Behaves like a dictionary of [monomer] -> value. Value may be anything
558 or something specific, depending on subclass.
560 Actual values are stored in monomers themselves as attributes.
561 """
563 def __init__(self, sequence, name=None):
564 self.sequence = sequence
565 self._register(sequence, name)
566 self.refresh()
568 def sorted_keys(self):
569 """Return list of monomers."""
570 return self.sequence
572 def sorted_values(self):
573 """Return list of markup values, if every monomer is marked up."""
574 return (self[monomer] for monomer in self.sequence)
576 def get(self, key, value=None):
577 """Part of Mapping collection interface."""
578 if key not in self:
579 return value
580 return self[key]
582 def __contains__(self, monomer):
583 """Part of Mapping collection interface."""
584 return hasattr(monomer, self.name)
586 def __getitem__(self, monomer):
587 """Part of Mapping collection interface."""
588 return getattr(monomer, self.name)
590 def __setitem__(self, monomer, value):
591 """Part of Mapping collection interface."""
592 return setattr(monomer, self.name, value)
594 class AlignmentMarkup(dict, Markup):
595 """Markupf for alignment.
597 Is a dictionary of [column] -> value. Value may be anything or something
598 specific, depending on subclass.
599 """
601 def __init__(self, alignment, name=None):
602 self.alignment = alignment
603 self._register(alignment, name)
604 self.refresh()
606 def sorted_keys(self):
607 """Return a list of columns."""
608 return self.alignment.columns
610 def sorted_values(self):
611 """Return a list of makrup values, if every column is marked up."""
612 return (self[column] for column in self.alignment.columns)
614 # vim: set ts=4 sts=4 sw=4 et: