Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/fd531580b9af/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:39 2013
Кодировка:
allpy: fd531580b9af allpy/base.py

allpy

view allpy/base.py @ 724:fd531580b9af

Merge between Burkov and others.
author Boris Burkov <BurkovBA@gmail.com>
date Fri, 08 Jul 2011 16:20:20 +0400
parents b71f9c1d1509 83a40cd34923
children 0a7917b4ca71
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
11 default_gaps = set((".", "-", "~"))
12 """Set of characters to recoginze as gaps when parsing alignment."""
14 class Monomer(object):
15 """Monomer object."""
17 type = None
18 """Either of 'dna', 'rna', 'protein'."""
20 types = base
21 """Mapping of related types. SHOULD be redefined in subclasses."""
23 by_code1 = {}
24 """A mapping from 1-letter code to Monomer subclass."""
26 by_code3 = {}
27 """A mapping from 3-letter code to Monomer subclass."""
29 by_name = {}
30 """A mapping from full monomer name to Monomer subclass."""
32 @classmethod
33 def _subclass(cls, name='', code1='', code3='', is_modified=False):
34 """Create new subclass of Monomer for given monomer type."""
35 class TheMonomer(cls):
36 pass
37 name = name.strip().capitalize()
38 code1 = code1.upper()
39 code3 = code3.upper()
40 module = vars(data.monomers)[cls.type]
41 TheMonomer.__name__ = re.sub(r"\W", "_", name)
42 TheMonomer.__module__ = module.__name__
43 TheMonomer.name = name
44 TheMonomer.code1 = code1
45 TheMonomer.code3 = code3
46 TheMonomer.is_modified = is_modified
47 # Save the class in data.monomers so that it can be pickled
48 # Some names are not unique, we append underscores to them
49 # in order to fix it.
50 while TheMonomer.__name__ in vars(module):
51 TheMonomer.__name__ += "_"
52 vars(module)[TheMonomer.__name__] = TheMonomer
53 if not is_modified:
54 cls.by_code1[code1] = TheMonomer
55 cls.by_code3[code3] = TheMonomer
56 cls.by_name[name] = TheMonomer
57 # We duplicate distinguished long names into Monomer itself, so that we
58 # can use Monomer.from_code3 to create the relevant type of monomer.
59 Monomer.by_code3[code3] = TheMonomer
60 Monomer.by_name[name] = TheMonomer
62 @classmethod
63 def _initialize(cls, codes=None):
64 """Create all relevant subclasses of Monomer."""
65 for code1, is_modified, code3, name in codes:
66 cls._subclass(name, code1, code3, is_modified)
68 @classmethod
69 def from_code1(cls, code1):
70 """Create new monomer from 1-letter code."""
71 return cls.by_code1[code1.upper()]()
73 @classmethod
74 def from_code3(cls, code3):
75 """Create new monomer from 3-letter code."""
76 return cls.by_code3[code3.upper()]()
78 @classmethod
79 def from_name(cls, name):
80 """Create new monomer from full name."""
81 return cls.by_name[name.strip().capitalize()]()
83 def __repr__(self):
84 return "<Monomer %s>" % str(self.code1)
86 def __str__(self):
87 """Returns one-letter code"""
88 return self.code1
90 def __eq__(self, other):
91 """Monomers within same monomer type are compared by code1."""
92 if not other:
93 return False
94 assert self.type == other.type
95 return self.code1 == other.code1
97 def __ne__(self, other):
98 return not (self == other)
100 class Sequence(list):
101 """Sequence of Monomers.
103 This behaves like list of monomer objects. In addition to standard list
104 behaviour, Sequence has the following attributes:
106 * name -- str with the name of the sequence
107 * description -- str with description of the sequence
108 * source -- str denoting source of the sequence
110 Any of them may be empty (i.e. hold empty string)
111 """
113 types = base
114 """Mapping of related types. SHOULD be redefined in subclasses."""
116 name = ''
117 description = ''
118 source = ''
120 def __init__(self, *args):
121 self.markups = {}
122 list.__init__(self, *args)
124 @classmethod
125 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
126 """Create sequence from a list of monomer objecst."""
127 result = cls(monomers)
128 if name:
129 result.name = name
130 if description:
131 result.description = description
132 if source:
133 result.source = source
134 return result
136 @classmethod
137 def from_string(cls, string, name='', description='', source=''):
138 """Create sequences from string of one-letter codes."""
139 monomer = cls.types.Monomer.from_code1
140 monomers = [monomer(letter) for letter in string]
141 return cls.from_monomers(monomers, name, description, source)
143 def __repr__(self):
144 if self.name:
145 return '<Sequence %s>' % str(self.name)
146 else:
147 return '<Sequence %s>' % str(self)
150 def __str__(self):
151 """Returns sequence of one-letter codes."""
152 return ''.join(monomer.code1 for monomer in self)
154 def __hash__(self):
155 """Hash sequence by identity."""
156 return id(self)
159 class Alignment(object):
160 """Alignment. It is a list of Columns."""
162 types = base
163 """Mapping of related types. SHOULD be redefined in subclasses."""
165 sequences = None
166 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
168 def __init__(self):
169 """Initialize empty alignment."""
170 self.sequences = []
171 self.columns = []
172 self.markups = {}
174 # Alignment grow & IO methods
175 # ==============================
177 def append_sequence(self, sequence):
178 """Add sequence to alignment. Return self.
180 If sequence is too short, pad it with gaps on the right.
181 """
182 self.sequences.append(sequence)
183 self._pad_to_width(len(sequence))
184 for column, monomer in zip(self.columns, sequence):
185 column[sequence] = monomer
186 return self
188 def append_row_from_string(self, string,
189 name='', description='', source='', gaps=default_gaps):
190 """Add row from a string of one-letter codes and gaps. Return self."""
191 Sequence = self.types.Sequence
192 without_gaps = util.remove_each(string, gaps)
193 sequence = Sequence.from_string(without_gaps, name, description, source)
194 self._pad_to_width(len(string))
195 non_gap_columns = [column
196 for column, char in zip(self.columns, string)
197 if char not in gaps
199 for monomer, column in zip(sequence, non_gap_columns):
200 column[sequence] = monomer
201 self.sequences.append(sequence)
202 return self
204 def append_row_with_gaps(self, row, sequence):
205 """Add row from row_as_list representation and sequence. Return self."""
206 self.sequences.append(sequence)
207 self._pad_to_width(len(row))
208 for column, monomer in zip(self.columns, row):
209 if monomer:
210 column[sequence] = monomer
211 return self
213 def _pad_to_width(self, n):
214 """Pad alignment with empty columns on the right to width n."""
215 for i in range(len(self.columns), n):
216 self.columns.append(Column())
218 def append_file(self, file, format='fasta', gaps=default_gaps):
219 """Append sequences from file to alignment. Return self.
221 If sequences in file have gaps (detected as characters belonging to
222 `gaps` set), treat them accordingly.
223 """
224 fileio.File(file, format, gaps=gaps).read_alignment(self)
225 return self
227 def to_file(self, file, format='fasta', gap='-'):
228 """Write alignment in FASTA file as sequences with gaps."""
229 fileio.File(file, format, gaps=gap).write_alignment(self)
230 return self
232 # Data access methods for alignment
233 # =================================
235 def rows(self):
236 """Return list of rows (temporary objects) in alignment.
238 Each row is a dictionary of { column : monomer }.
240 For gap positions there is no key for the column in row.
242 Each row has attribute `sequence` pointing to the sequence the row is
243 describing.
245 Modifications of row have no effect on the alignment.
246 """
247 # For now, the function returns a list rather than iterator.
248 # It is yet to see, whether memory performance here becomes critical,
249 # or is random access useful.
250 rows = []
251 for sequence in self.sequences:
252 row = util.UserDict()
253 row.sequence = sequence
254 for column in self.columns:
255 if sequence in column:
256 row[column] = column[sequence]
257 rows.append(row)
258 return rows
260 def rows_as_lists(self):
261 """Return list of rows (temporary objects) in alignment.
263 Each row here is a list of either monomer or None (for gaps).
265 Each row has attribute `sequence` pointing to the sequence of row.
267 Modifications of row have no effect on the alignment.
268 """
269 rows = []
270 for sequence in self.sequences:
271 row = util.UserList()
272 row.sequence = sequence
273 for column in self.columns:
274 row.append(column.get(sequence))
275 rows.append(row)
276 return rows
278 def rows_as_strings(self, gap='-'):
279 """Return list of string representation of rows in alignment.
281 Each row has attribute `sequence` pointing to the sequence of row.
283 `gap` is the symbol to use for gap.
284 """
285 rows = []
286 for sequence in self.sequences:
287 string = ""
288 for column in self.columns:
289 if sequence in column:
290 string += column[sequence].code1
291 else:
292 string += gap
293 string = util.UserString(string)
294 string.sequence = sequence
295 rows.append(string)
296 return rows
298 def row_as_list(self, sequence):
299 """Return representaion of row as list with `Monomers` and `None`s."""
300 return [column.get(sequence) for column in self.columns]
302 def row_as_string(self, sequence, gap='-'):
303 """Return string representaion of row in alignment.
305 String will have gaps represented by `gap` symbol (defaults to '-').
306 """
307 def char(monomer):
308 if monomer:
309 return monomer.code1
310 return gap
311 row = self.row_as_list(sequence)
312 return "".join(map(char, row))
314 def columns_as_lists(self):
315 """Return list of columns (temorary objects) in alignment.
317 Each column here is a list of either monomer or None (for gaps).
319 Items of column are sorted in the same way as alignment.sequences.
321 Modifications of column have no effect on the alignment.
322 """
323 columns = []
324 for column in self.columns:
325 col = util.UserList()
326 col.column = column
327 for sequence in self.sequences:
328 col.append(column.get(sequence))
329 columns.append(col)
330 return columns
332 # Alignment / Block editing methods
333 # =================================
335 def flush(self, whence='left'):
336 """Remove all gaps from alignment and flush results to one side.
338 `whence` must be one of 'left', 'right' or 'center'
339 """
340 if whence == 'left':
341 from processors import Left as Flush
342 elif whence == 'right':
343 from processors import Right as Flush
344 elif whence == 'center':
345 from processors import Center as Flush
346 else:
347 raise AssertionError, "Whence must be left, right or center"
348 self.realign(Flush())
350 def remove_gap_columns(self):
351 """Remove all empty columns."""
352 for n, column in reversed(list(enumerate(self.columns))):
353 if column == {}:
354 self.columns[n:n+1] = []
356 def _wipe_row(self, sequence):
357 """Turn all row positions into gaps (but keep sequences intact)."""
358 for column in self.columns:
359 if sequence in column:
360 del column[sequence]
362 def _merge(self, dst, new, merge):
363 """Replace contents of `dst` with those of `new`.
365 Replace contents of elements using function `merge(dst_el, new_le)`.
366 """
367 for el, new_el in zip(dst, new):
368 merge(el, new_el)
369 dst[len(dst):] = new[len(dst):]
370 del dst[len(new):]
372 def _replace_sequence_contents(self, new, copy_descriptions):
373 """Replace contents of sequences with those of `new` alignment."""
374 # XXX: we manually copy sequence contents here
375 # XXX: we only copy, overlapping parts and link to the rest
376 def merge_monomers(dst, new):
377 dst.__class__ = new.__class__
378 def merge_sequences(dst, new):
379 if copy_descriptions:
380 vars(dst).update(vars(new))
381 self._merge(dst, new, merge_monomers)
382 self._merge(self.sequences, new.sequences, merge_sequences)
384 def _replace_column_contents(self, new):
385 """Replace column contents with those of `new` alignment.
387 In other words: copy gap patterns from `new` to `self`.
389 `self.sequences` and `new.sequences` should have the same contents.
390 """
391 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
392 sequence = row.sequence
393 monomers = filter(None, row)
394 assert len(monomers) == len(filter(None, new_row))
395 self._wipe_row(sequence)
396 non_gap_columns = [column
397 for column, monomer in zip(self.columns, new_row)
398 if monomer
400 for monomer, column in zip(monomers, non_gap_columns):
401 column[sequence] = monomer
403 def _replace_contents(self, new, copy_descriptions, copy_contents):
404 """Replace alignment contents with those of other alignment."""
405 if copy_contents:
406 self._replace_sequence_contents(new, copy_descriptions)
407 self._replace_column_contents(new)
409 def process(self, function, copy_descriptions=True, copy_contents=True):
410 """Apply function to the alignment (or block); inject results back.
412 - `function(block)` must return block with same line order.
413 - if `copy_descriptions` is False, ignore new sequence names.
414 - if `copy_contents` is False, don't copy sequence contents too.
416 `function` (object) may have attributes `copy_descriptions` and
417 `copy_contents`, which override the same named arguments.
418 """
419 new = function(self)
420 if hasattr(function, 'copy_descriptions'):
421 copy_descriptions = function.copy_descriptions
422 if hasattr(function, 'copy_contents'):
423 copy_contents = function.copy_contents
424 self._replace_contents(new, copy_descriptions, copy_contents)
426 def realign(self, function):
427 """Realign self.
429 I.e.: apply function to self to produce a new alignment, then update
430 self to have the same gap patterns as the new alignment.
432 This is the same as process(function, False, False)
433 """
434 new = function(self)
435 self._replace_column_contents(new)
437 class Column(dict):
438 """Column of alignment.
440 Column is a dict of { sequence : monomer }.
442 For sequences that have gaps in current row, given key is not present in
443 the column.
444 """
446 types = base
447 """Mapping of related types. SHOULD be redefined in subclasses."""
449 def __init__(self, alignment):
450 self.alignment = alignment
451 super(Column, self).__init__()
453 def __hash__(self):
454 """Return hash by identity."""
455 return id(self)
457 def MyIndex(self):
458 for index, column in enumerate(self.alignment.columns):
459 if column is self: return index
460 raise ValueException
462 def __repr__(self):
463 #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES
464 return "<Column %s>"%(str(self.MyIndex()))
469 class Block(Alignment):
470 """Block of alignment.
472 Block is an intersection of several rows & columns. (The collections of
473 rows and columns are represented as ordered lists, to retain display order
474 of Alignment or add ability to tweak it). Most of blocks look like
475 rectangular part of alignment if you shuffle alignment rows the right way.
476 """
478 alignment = None
479 """Alignment the block belongs to."""
481 sequences = ()
482 """List of sequences in block."""
484 columns = ()
485 """List of columns in block."""
487 @classmethod
488 def from_alignment(cls, alignment, sequences=None, columns=None):
489 """Build new block from alignment.
491 If sequences are not given, the block uses all sequences in alignment.
493 If columns are not given, the block uses all columns in alignment.
495 In both cases we use exactly the list used in alignment, thus, if new
496 sequences or columns are added to alignment, the block tracks this too.
497 """
498 if sequences is None:
499 sequences = alignment.sequences
500 if columns is None:
501 columns = alignment.columns
502 block = cls()
503 block.alignment = alignment
504 block.sequences = sequences
505 block.columns = columns
506 return block
508 class Markup(object):
509 """Base class for sequence and alignment markups.
511 We shall call either sequence or alignment a container. And we shall call
512 either monomers or columns elements respectively.
514 Markup behaves like a dictionary of [element] -> value.
516 Every container has a dictionary of [name] -> markup. It is Markup's
517 responsibility to add itself to this dictionary and to avoid collisions
518 while doing it.
519 """
521 name = None
522 """Name of markup elements"""
524 def _register(self, container, name):
525 """Register self within container.
527 Assure the name is not taken before. If name is not given, look in the
528 class. Make sure we have some name at all.
529 """
530 if name:
531 self.name = name
532 assert self.name is not None
533 assert self.name not in container.markups
534 container.markups[self.name] = self
536 def refresh(self):
537 """Recalculate markup values (if they are generated automatically)."""
538 pass
540 @classmethod
541 def from_record(cls, container, record, name=None):
542 """Restore markup from `record`. (Used for loading from file).
544 `record` is a dict of all metadata and data related to one markup. All
545 keys and values in `record` are strings, markup must parse them itself.
547 Markup values should be stored in `record['markup']`, which is a list
548 of items separated with either `record['separator']` or a comma.
549 """
550 return cls(container, name)
552 def to_record(self):
553 """Save markup to `record`, for saving to file.
555 For description of `record` see docstring for `from_record` method.
556 """
557 return {}
559 def sorted_keys(self):
560 """Return list of elements in the container in proper order."""
561 raise NotImplementedError()
563 def sorted_values(self):
564 """Return list of markup values in container."""
565 raise NotImplementedError()
567 class SequenceMarkup(Markup):
568 """Markup for sequence.
570 Behaves like a dictionary of [monomer] -> value. Value may be anything
571 or something specific, depending on subclass.
573 Actual values are stored in monomers themselves as attributes.
574 """
576 def __init__(self, sequence, name=None):
577 self.sequence = sequence
578 self._register(sequence, name)
579 self.refresh()
581 def sorted_keys(self):
582 """Return list of monomers."""
583 return self.sequence
585 def sorted_values(self):
586 """Return list of markup values, if every monomer is marked up."""
587 return (self[monomer] for monomer in self.sequence)
589 def get(self, key, value=None):
590 """Part of Mapping collection interface."""
591 if key not in self:
592 return value
593 return self[key]
595 def __contains__(self, monomer):
596 """Part of Mapping collection interface."""
597 return hasattr(monomer, self.name)
599 def __getitem__(self, monomer):
600 """Part of Mapping collection interface."""
601 return getattr(monomer, self.name)
603 def __setitem__(self, monomer, value):
604 """Part of Mapping collection interface."""
605 return setattr(monomer, self.name, value)
607 class AlignmentMarkup(dict, Markup):
608 """Markupf for alignment.
610 Is a dictionary of [column] -> value. Value may be anything or something
611 specific, depending on subclass.
612 """
614 def __init__(self, alignment, name=None):
615 self.alignment = alignment
616 self._register(alignment, name)
617 self.refresh()
619 def sorted_keys(self):
620 """Return a list of columns."""
621 return self.alignment.columns
623 def sorted_values(self):
624 """Return a list of makrup values, if every column is marked up."""
625 return (self[column] for column in self.alignment.columns)
627 # vim: set ts=4 sts=4 sw=4 et: