Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/0192c5c09ce8/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:52 2013
Кодировка:
allpy: 0192c5c09ce8 allpy/base.py

allpy

view allpy/base.py @ 823:0192c5c09ce8

allpy.base: added parameter use_existing to *.add_markup() to allow same markup to be added multiple times explicitly [see #95]
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Fri, 15 Jul 2011 17:02:09 +0400
parents d87129162eb4
children 137981a4686f
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
default_gaps = set([".", "-", "~"])
"""Set of characters to recognize as gaps when parsing an alignment."""
class Monomer(object):
    """Monomer object.

    Concrete monomer classes are generated by `_subclass` and registered in
    the `by_code1`, `by_code3` and `by_name` lookup tables.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create and register a new subclass of `cls` for one monomer.

        - `name` is stripped and capitalized before use
        - `code1` and `code3` are upper-cased before use
        - `is_modified` is stored on the new class and decides which lookup
          tables the class is allowed to overwrite
        """
        class TheMonomer(cls):
            pass

        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        # Namespace stored in data.monomers under the monomer type name.
        module = vars(data.monomers)[cls.type]
        namespace = vars(module)

        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified

        # Save the class in data.monomers so that it can be pickled.
        # Some names are not unique; we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in namespace:
            TheMonomer.__name__ += "_"
        namespace[TheMonomer.__name__] = TheMonomer

        # Modified monomers never claim a 1-letter code.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        # A modified monomer only fills a 3-letter slot that is still free.
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
            cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that
        # we can use Monomer.from_code3 to create the relevant type of monomer.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
            Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (or None) nothing is created.
        """
        if codes is None:
            return
        for code1, is_modified, code3, name in codes:
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        The lookup is case-insensitive; the code exactly as given is kept
        in the `input_code1` attribute of the returned monomer.
        Raises KeyError for an unknown code.
        """
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError for an unknown code.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup.
        Raises KeyError for an unknown name.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Return the one-letter code."""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        A false `other` (e.g. None) is never equal to a monomer. Comparing
        monomers of different `type` trips an assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)
class MarkupContainerMixin(object):
    """Common functions for alignment and sequence for dealing with markups.

    The class using this mixin must call `_init` from its own `__init__` and
    must provide a `kind` attribute if markups are to be looked up by name.
    """

    def _init(self):
        """Hook to be called from __init__ of actual class."""
        self.markups = {}

    def add_markup(self, name, markup_class=None, use_existing=False, **kws):
        """Create a markup object, add to self. Return the markup.

        - `name` is name for markup in `self.markups` dictionary
        - optional `markup_class` is class for created markup
        - if optional `use_existing` is true, it is no error if a markup with
          the same name already exists; in this case nothing is changed and
          the existing markup is returned
        - optional keyword arguments are passed on to the markup constructor

        For user markups you have to specify `name` and `markup_class`,
        for the standard automatic markups just `name` is enough.

        An AssertionError is raised if the name is already taken and
        `use_existing` is false, or if the existing markup is of a
        different class than the one requested.
        """
        if markup_class is None:
            # We have to import markups here, and not in the module header,
            # so as not to create bad import loops.
            # `base` module is used extensively in `markups` for inheritance,
            # so breaking the loop here seems a lot easier.
            # The registry is only needed when no class was given explicitly.
            import markups
            kind = self.kind + "_" + "markup"
            markup_class = markups.by_name[kind, name]
        if use_existing and name in self.markups:
            existing = self.markups[name]
            assert existing.__class__ is markup_class
            return existing
        assert name not in self.markups
        markup = markup_class(self, name, caller='container', **kws)
        self.markups[name] = markup
        return markup

    def remove_markup(self, name):
        """Remove markup `name`: call its `remove()` hook, then forget it.

        Raises KeyError if there is no markup with this name.
        """
        self.markups[name].remove()
        del self.markups[name]
class Sequence(list, MarkupContainerMixin):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    description = ''
    source = ''

    def __init__(self, *args):
        """Initialize like a list and set up the empty markup registry."""
        list.__init__(self, *args)
        MarkupContainerMixin._init(self)

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None, source=None):
        """Create sequence from an iterable of monomer objects.

        `name`, `description` and `source` are only set on the result when
        they are non-empty; otherwise the class-level defaults stay in effect.
        """
        # An immutable default avoids sharing one list between calls.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from a string of one-letter codes.

        Every character is converted with `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Return the sequence as a string of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)
class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        The sequence is placed without gaps starting at the first column;
        columns past its end hold no entry for it (i.e. gaps on the right).
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; all other characters are
        turned into monomers of a new sequence.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` holds one item per column; false items (e.g. None) are gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        for i in range(len(self.columns), n):
            self.columns.append(Column())

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file as sequences with gaps. Return self.

        `format` selects the file format (FASTA by default); `gap` is the
        gap symbol handed over to the writer.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Collect the pieces and join once instead of growing a string.
            chars = []
            for column in self.columns:
                if sequence in column:
                    chars.append(column[sequence].code1)
                else:
                    chars.append(gap)
            string = util.UserString("".join(chars))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Each list has attribute `column` pointing to the original column.

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other value
        raises AssertionError.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk backwards so that deletions do not shift pending indices.
        # The list is edited in place, as it may be shared with other objects.
        for n, column in reversed(list(enumerate(self.columns))):
            if not column:
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.
        Elements of `new` beyond the length of `dst` are appended as they
        are; elements of `dst` beyond the length of `new` are dropped.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of the new
        sequences are copied over the old ones as well.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy overlapping parts and link to the rest
        def merge_monomers(dst, new):
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # List comprehensions instead of filter(): on Python 3 filter()
            # returns an iterator, which has no len().
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment."""
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False), except that
        attributes of `function` are not consulted.
        """
        new = function(self)
        self._replace_column_contents(new)
class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }.

    A sequence that has a gap at this position simply has no key in the
    column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by object identity."""
        identity = id(self)
        return identity
class Block(Alignment):
    """Block of alignment.

    A block is an intersection of several rows and columns. Both collections
    are kept as ordered lists, so that the display order of the alignment is
    retained (or can be tweaked). Most blocks look like a rectangular part of
    the alignment once its rows are shuffled the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        When `sequences` is None the block takes all sequences of the
        alignment; when `columns` is None it takes all columns.

        In both cases the very list object of the alignment is used, so
        sequences or columns added to the alignment later show up in the
        block too.
        """
        chosen_sequences = alignment.sequences if sequences is None else sequences
        chosen_columns = alignment.columns if columns is None else columns
        block = cls()
        block.alignment = alignment
        block.sequences = chosen_sequences
        block.columns = chosen_columns
        return block
class Markup(object):
    """Base class for sequence and alignment markups.

    Either a sequence or an alignment is called a container here, and
    monomers or columns respectively are called its elements.

    A markup behaves like a dictionary of [element] -> value.

    Every container has a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements"""

    def __init__(self, container, name, **kwargs):
        """Markup takes mandatory container and name and optional kwargs.

        Markups should never be created by the user. They are created by
        Sequence or Alignment, which pass `caller='container'`; any other
        call trips an assertion.
        """
        self.name = name
        caller = kwargs.get('caller')
        assert caller == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recalculate markup values (if they are generated automatically)."""
        return None

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!"""
        return None

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore markup from `record`. (Used for loading from file).

        `record` is a dict of all metadata and data related to one markup. All
        keys and values in `record` are strings, markup must parse them itself.

        Markup values should be stored in `record['markup']`, which is a list
        of items separated with either `record['separator']` or a comma.

        This base implementation ignores `record` and only asks the
        container to add a markup of this class under `name`.
        """
        markup = container.add_markup(name, markup_class=cls)
        return markup

    def to_record(self):
        """Save markup to `record`, for saving to file.

        For description of `record` see docstring for `from_record` method.
        This base implementation returns an empty dict.
        """
        return dict()

    def sorted_keys(self):
        """Return list of elements in the container in proper order."""
        raise NotImplementedError()

    def sorted_values(self):
        """Return list of markup values in container."""
        raise NotImplementedError()
class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes; the
    attribute name is the name of the markup.
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        """Remember the marked up sequence, then initialize as a Markup."""
        # The sequence must be set first: Markup.__init__ calls refresh().
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!

        Deletes the markup value from every monomer of the sequence that
        carries one.
        """
        # Iterate the sequence itself: no `monomers` attribute is ever set.
        for monomer in self.sequence:
            # Not every monomer is necessarily marked up.
            if monomer in self:
                del self[monomer]

    def sorted_keys(self):
        """Return list of monomers."""
        return self.sequence

    def sorted_values(self):
        """Return iterable of markup values, if every monomer is marked up."""
        return (self[monomer] for monomer in self.sequence)

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)
class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    It is a dictionary of [column] -> value. Value may be anything or
    something specific, depending on subclass.
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        """Remember the marked up alignment, then initialize as a Markup."""
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the list of columns of the alignment."""
        columns = self.alignment.columns
        return columns

    def sorted_values(self):
        """Return iterable of markup values, if every column is marked up."""
        columns = self.alignment.columns
        return (self[column] for column in columns)
673 # vim: set ts=4 sts=4 sw=4 et: