Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/1a4a04829ba6/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:35 2013
Кодировка:
allpy: 1a4a04829ba6 allpy/base.py

allpy

view allpy/base.py @ 670:1a4a04829ba6

This is a half-rollback commit with base.py from central repo and my blocks_finder.py
author Boris Burkov <BurkovBA@gmail.com>
date Fri, 01 Jul 2011 11:40:00 +0400
parents 2f016bfda114
children b0d3c9413cf9
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
11 default_gaps = set((".", "-", "~"))
12 """Set of characters to recoginze as gaps when parsing alignment."""
14 class Monomer(object):
15 """Monomer object."""
17 type = None
18 """Either of 'dna', 'rna', 'protein'."""
20 types = base
21 """Mapping of related types. SHOULD be redefined in subclasses."""
23 by_code1 = {}
24 """A mapping from 1-letter code to Monomer subclass."""
26 by_code3 = {}
27 """A mapping from 3-letter code to Monomer subclass."""
29 by_name = {}
30 """A mapping from full monomer name to Monomer subclass."""
32 @classmethod
33 def _subclass(cls, name='', code1='', code3='', is_modified=False):
34 """Create new subclass of Monomer for given monomer type."""
35 class TheMonomer(cls):
36 pass
37 name = name.strip().capitalize()
38 code1 = code1.upper()
39 code3 = code3.upper()
40 module = vars(data.monomers)[cls.type]
41 TheMonomer.__name__ = re.sub(r"\W", "_", name)
42 TheMonomer.__module__ = module.__name__
43 TheMonomer.name = name
44 TheMonomer.code1 = code1
45 TheMonomer.code3 = code3
46 TheMonomer.is_modified = is_modified
47 # Save the class in data.monomers so that it can be pickled
48 # Some names are not unique, we append underscores to them
49 # in order to fix it.
50 while TheMonomer.__name__ in vars(module):
51 TheMonomer.__name__ += "_"
52 vars(module)[TheMonomer.__name__] = TheMonomer
53 if not is_modified:
54 cls.by_code1[code1] = TheMonomer
55 cls.by_code3[code3] = TheMonomer
56 cls.by_name[name] = TheMonomer
57 # We duplicate distinguished long names into Monomer itself, so that we
58 # can use Monomer.from_code3 to create the relevant type of monomer.
59 Monomer.by_code3[code3] = TheMonomer
60 Monomer.by_name[name] = TheMonomer
62 @classmethod
63 def _initialize(cls, codes=None):
64 """Create all relevant subclasses of Monomer."""
65 for code1, is_modified, code3, name in codes:
66 cls._subclass(name, code1, code3, is_modified)
68 @classmethod
69 def from_code1(cls, code1):
70 """Create new monomer from 1-letter code."""
71 return cls.by_code1[code1.upper()]()
73 @classmethod
74 def from_code3(cls, code3):
75 """Create new monomer from 3-letter code."""
76 return cls.by_code3[code3.upper()]()
78 @classmethod
79 def from_name(cls, name):
80 """Create new monomer from full name."""
81 return cls.by_name[name.strip().capitalize()]()
83 def __repr__(self):
84 return "<Monomer %s>" % str(self.code1)
86 def __str__(self):
87 """Returns one-letter code"""
88 return self.code1
90 def __eq__(self, other):
91 """Monomers within same monomer type are compared by code1."""
92 if not other:
93 return False
94 assert self.type == other.type
95 return self.code1 == other.code1
97 def __ne__(self, other):
98 return not (self == other)
100 class Sequence(list):
101 """Sequence of Monomers.
103 This behaves like list of monomer objects. In addition to standard list
104 behaviour, Sequence has the following attributes:
106 * name -- str with the name of the sequence
107 * description -- str with description of the sequence
108 * source -- str denoting source of the sequence
110 Any of them may be empty (i.e. hold empty string)
111 """
113 types = base
114 """Mapping of related types. SHOULD be redefined in subclasses."""
116 name = ''
117 description = ''
118 source = ''
120 @classmethod
121 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
122 """Create sequence from a list of monomer objecst."""
123 result = cls(monomers)
124 if name:
125 result.name = name
126 if description:
127 result.description = description
128 if source:
129 result.source = source
130 return result
132 @classmethod
133 def from_string(cls, string, name='', description='', source=''):
134 """Create sequences from string of one-letter codes."""
135 monomer = cls.types.Monomer.from_code1
136 monomers = [monomer(letter) for letter in string]
137 return cls.from_monomers(monomers, name, description, source)
139 def __repr__(self):
140 if self.name:
141 return '<Sequence %s>' % str(self.name)
142 else:
143 return '<Sequence %s>' % str(self)
146 def __str__(self):
147 """Returns sequence of one-letter codes."""
148 return ''.join(monomer.code1 for monomer in self)
150 def __hash__(self):
151 """Hash sequence by identity."""
152 return id(self)
154 class Alignment(object):
155 """Alignment. It is a list of Columns."""
157 types = base
158 """Mapping of related types. SHOULD be redefined in subclasses."""
160 sequences = None
161 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
163 def __init__(self):
164 """Initialize empty alignment."""
165 self.sequences = []
166 self.columns = []
168 # Alignment grow & IO methods
169 # ==============================
171 def append_sequence(self, sequence):
172 """Add sequence to alignment. Return self.
174 If sequence is too short, pad it with gaps on the right.
175 """
176 self.sequences.append(sequence)
177 self._pad_to_width(len(sequence))
178 for column, monomer in zip(self.columns, sequence):
179 column[sequence] = monomer
180 return self
182 def append_row_from_string(self, string,
183 name='', description='', source='', gaps=default_gaps):
184 """Add row from a string of one-letter codes and gaps. Return self."""
185 Sequence = self.types.Sequence
186 without_gaps = util.remove_each(string, gaps)
187 sequence = Sequence.from_string(without_gaps, name, description, source)
188 self._pad_to_width(len(string))
189 non_gap_columns = [column
190 for column, char in zip(self.columns, string)
191 if char not in gaps
193 for monomer, column in zip(sequence, non_gap_columns):
194 column[sequence] = monomer
195 self.sequences.append(sequence)
196 return self
198 def append_row_with_gaps(self, row, sequence):
199 """Add row from row_as_list representation and sequence. Return self."""
200 self.sequences.append(sequence)
201 self._pad_to_width(len(row))
202 for column, monomer in zip(self.columns, row):
203 if monomer:
204 column[sequence] = monomer
205 return self
207 def _pad_to_width(self, n):
208 """Pad alignment with empty columns on the right to width n."""
209 for i in range(len(self.columns), n):
210 self.columns.append(Column())
212 def append_file(self, file, format='fasta', gaps=default_gaps):
213 """Append sequences from file to alignment. Return self.
215 If sequences in file have gaps (detected as characters belonging to
216 `gaps` set), treat them accordingly.
217 """
218 sequences = []
219 io = fileio.File(file, format)
220 for name, description, body in io.read_strings():
221 self.append_row_from_string(body, name, description, file.name, gaps)
222 return self
224 def to_file(self, file, format='fasta', gap='-'):
225 """Write alignment in FASTA file as sequences with gaps."""
226 strings = [(s, s.sequence.name, s.sequence.description)
227 for s in self.rows_as_strings()]
228 fileio.File(file, format).write_strings(strings)
230 # Data access methods for alignment
231 # =================================
233 def rows(self):
234 """Return list of rows (temporary objects) in alignment.
236 Each row is a dictionary of { column : monomer }.
238 For gap positions there is no key for the column in row.
240 Each row has attribute `sequence` pointing to the sequence the row is
241 describing.
243 Modifications of row have no effect on the alignment.
244 """
245 # For now, the function returns a list rather than iterator.
246 # It is yet to see, whether memory performance here becomes critical,
247 # or is random access useful.
248 rows = []
249 for sequence in self.sequences:
250 row = util.UserDict()
251 row.sequence = sequence
252 for column in self.columns:
253 if sequence in column:
254 row[column] = column[sequence]
255 rows.append(row)
256 return rows
258 def rows_as_lists(self):
259 """Return list of rows (temporary objects) in alignment.
261 Each row here is a list of either monomer or None (for gaps).
263 Each row has attribute `sequence` pointing to the sequence of row.
265 Modifications of row have no effect on the alignment.
266 """
267 rows = []
268 for sequence in self.sequences:
269 row = util.UserList()
270 row.sequence = sequence
271 for column in self.columns:
272 row.append(column.get(sequence))
273 rows.append(row)
274 return rows
276 def rows_as_strings(self, gap='-'):
277 """Return list of string representation of rows in alignment.
279 Each row has attribute `sequence` pointing to the sequence of row.
281 `gap` is the symbol to use for gap.
282 """
283 rows = []
284 for sequence in self.sequences:
285 string = ""
286 for column in self.columns:
287 if sequence in column:
288 string += column[sequence].code1
289 else:
290 string += gap
291 string = util.UserString(string)
292 string.sequence = sequence
293 rows.append(string)
294 return rows
296 def row_as_list(self, sequence):
297 """Return representaion of row as list with `Monomers` and `None`s."""
298 return [column.get(sequence) for column in self.columns]
300 def row_as_string(self, sequence, gap='-'):
301 """Return string representaion of row in alignment.
303 String will have gaps represented by `gap` symbol (defaults to '-').
304 """
305 def char(monomer):
306 if monomer:
307 return monomer.code1
308 return gap
309 row = self.row_as_list(sequence)
310 return "".join(map(char, row))
312 def columns_as_lists(self):
313 """Return list of columns (temorary objects) in alignment.
315 Each column here is a list of either monomer or None (for gaps).
317 Items of column are sorted in the same way as alignment.sequences.
319 Modifications of column have no effect on the alignment.
320 """
321 columns = []
322 for column in self.columns:
323 col = UserList()
324 col.column = column
325 for sequence in self.sequences:
326 col.append(column.get(sequence))
327 columns.append(col)
328 return columns
330 # Alignment / Block editing methods
331 # =================================
333 def flush(self, whence='left'):
334 """Remove all gaps from alignment and flush results to one side.
336 `whence` must be one of 'left', 'right' or 'center'
337 """
338 if whence == 'left':
339 from processors import Left as Flush
340 elif whence == 'right':
341 from processors import Right as Flush
342 elif whence == 'center':
343 from processors import Center as Flush
344 else:
345 raise AssertionError, "Whence must be left, right or center"
346 self.realign(Flush())
348 def remove_gap_columns(self):
349 """Remove all empty columns."""
350 for n, column in reversed(list(enumerate(self.columns))):
351 if column == {}:
352 self.columns[n:n+1] = []
354 def _wipe_row(self, sequence):
355 """Turn all row positions into gaps (but keep sequences intact)."""
356 for column in self.columns:
357 if sequence in column:
358 del column[sequence]
360 def _merge(self, dst, new, merge):
361 """Replace contents of `dst` with those of `new`.
363 Replace contents of elements using function `merge(dst_el, new_le)`.
364 """
365 for el, new_el in zip(dst, new):
366 merge(el, new_el)
367 dst[len(dst):] = new[len(dst):]
368 del dst[len(new):]
370 def _replace_sequence_contents(self, new, copy_descriptions):
371 """Replace contents of sequences with those of `new` alignment."""
372 # XXX: we manually copy sequence contents here
373 # XXX: we only copy, overlapping parts and link to the rest
374 def merge_monomers(dst, new):
375 dst.__class__ = new.__class__
376 def merge_sequences(dst, new):
377 if copy_descriptions:
378 vars(dst).update(vars(new))
379 self._merge(dst, new, merge_monomers)
380 self._merge(self.sequences, new.sequences, merge_sequences)
382 def _replace_column_contents(self, new):
383 """Replace column contents with those of `new` alignment.
385 In other words: copy gap patterns from `new` to `self`.
387 `self.sequences` and `new.sequences` should have the same contents.
388 """
389 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
390 sequence = row.sequence
391 monomers = filter(None, row)
392 assert len(monomers) == len(filter(None, new_row))
393 self._wipe_row(sequence)
394 non_gap_columns = [column
395 for column, monomer in zip(self.columns, new_row)
396 if monomer
398 for monomer, column in zip(monomers, non_gap_columns):
399 column[sequence] = monomer
401 def _replace_contents(self, new, copy_descriptions, copy_contents):
402 """Replace alignment contents with those of other alignment."""
403 if copy_contents:
404 self._replace_sequence_contents(new, copy_descriptions)
405 self._replace_column_contents(new)
407 def process(self, function, copy_descriptions=True, copy_contents=True):
408 """Apply function to the alignment (or block); inject results back.
410 - `function(block)` must return block with same line order.
411 - if `copy_descriptions` is False, ignore new sequence names.
412 - if `copy_contents` is False, don't copy sequence contents too.
414 `function` (object) may have attributes `copy_descriptions` and
415 `copy_contents`, which override the same named arguments.
416 """
417 new = function(self)
418 if hasattr(function, 'copy_descriptions'):
419 copy_descriptions = function.copy_descriptions
420 if hasattr(function, 'copy_contents'):
421 copy_contents = function.copy_contents
422 self._replace_contents(new, copy_descriptions, copy_contents)
424 def realign(self, function):
425 """Realign self.
427 I.e.: apply function to self to produce a new alignment, then update
428 self to have the same gap patterns as the new alignment.
430 This is the same as process(function, False, False)
431 """
432 new = function(self)
433 self._replace_column_contents(new)
435 class Column(dict):
436 """Column of alignment.
438 Column is a dict of { sequence : monomer }.
440 For sequences that have gaps in current row, given key is not present in
441 the column.
442 """
444 types = base
445 """Mapping of related types. SHOULD be redefined in subclasses."""
447 def __hash__(self):
448 """Return hash by identity."""
449 return id(self)
451 class Block(Alignment):
452 """Block of alignment.
454 Block is an intersection of several rows & columns. (The collections of
455 rows and columns are represented as ordered lists, to retain display order
456 of Alignment or add ability to tweak it). Most of blocks look like
457 rectangular part of alignment if you shuffle alignment rows the right way.
458 """
460 alignment = None
461 """Alignment the block belongs to."""
463 sequences = ()
464 """List of sequences in block."""
466 columns = ()
467 """List of columns in block."""
469 @classmethod
470 def from_alignment(cls, alignment, sequences=None, columns=None):
471 """Build new block from alignment.
473 If sequences are not given, the block uses all sequences in alignment.
475 If columns are not given, the block uses all columns in alignment.
477 In both cases we use exactly the list used in alignment, thus, if new
478 sequences or columns are added to alignment, the block tracks this too.
479 """
480 if sequences is None:
481 sequences = alignment.sequences
482 if columns is None:
483 columns = alignment.columns
484 block = cls()
485 block.alignment = alignment
486 block.sequences = sequences
487 block.columns = columns
488 return block
490 # vim: set ts=4 sts=4 sw=4 et: