Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/364232e42888/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:15 2013
Кодировка:
allpy: 364232e42888 allpy/base.py

allpy

view allpy/base.py @ 537:364232e42888

Reimplemented flush(direction) as processor
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 28 Feb 2011 21:17:41 +0300
parents 5dfb9b9761d5
children 737b52785e5e
line source
1 import sys
2 import re
4 import util
5 import fileio
7 # import this very module as means of having all related classes in one place
8 import base
10 default_gaps = set((".", "-", "~"))
11 """Set of characters to recoginze as gaps when parsing alignment."""
13 class Monomer(object):
14 """Monomer object."""
16 type = None
17 """Either of 'dna', 'rna', 'protein'."""
19 types = base
20 """Mapping of related types. SHOULD be redefined in subclasses."""
22 by_code1 = {}
23 """A mapping from 1-letter code to Monomer subclass."""
25 by_code3 = {}
26 """A mapping from 3-letter code to Monomer subclass."""
28 by_name = {}
29 """A mapping from full monomer name to Monomer subclass."""
31 @classmethod
32 def _subclass(cls, name='', code1='', code3='', is_modified=False):
33 """Create new subclass of Monomer for given monomer type."""
34 class TheMonomer(cls):
35 pass
36 name = name.strip().capitalize()
37 code1 = code1.upper()
38 code3 = code3.upper()
39 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
40 TheMonomer.name = name
41 TheMonomer.code1 = code1
42 TheMonomer.code3 = code3
43 TheMonomer.is_modified = is_modified
44 if not is_modified:
45 cls.by_code1[code1] = TheMonomer
46 cls.by_code3[code3] = TheMonomer
47 cls.by_name[name] = TheMonomer
48 # We duplicate distinguished long names into Monomer itself, so that we
49 # can use Monomer.from_code3 to create the relevant type of monomer.
50 Monomer.by_code3[code3] = TheMonomer
51 Monomer.by_name[name] = TheMonomer
53 @classmethod
54 def _initialize(cls, codes=None):
55 """Create all relevant subclasses of Monomer."""
56 for code1, is_modified, code3, name in codes:
57 cls._subclass(name, code1, code3, is_modified)
59 @classmethod
60 def from_code1(cls, code1):
61 """Create new monomer from 1-letter code."""
62 return cls.by_code1[code1.upper()]()
64 @classmethod
65 def from_code3(cls, code3):
66 """Create new monomer from 3-letter code."""
67 return cls.by_code3[code3.upper()]()
69 @classmethod
70 def from_name(cls, name):
71 """Create new monomer from full name."""
72 return cls.by_name[name.strip().capitalize()]()
74 def __repr__(self):
75 return '<Monomer %s>' % self.code3
77 def __str__(self):
78 """Returns one-letter code"""
79 return self.code1
81 def __eq__(self, other):
82 """Monomers within same monomer type are compared by code1."""
83 assert self.type == other.type
84 return self.code1 == other.code1
86 def __ne__(self, other):
87 return not (self == other)
89 class Sequence(list):
90 """Sequence of Monomers.
92 This behaves like list of monomer objects. In addition to standard list
93 behaviour, Sequence has the following attributes:
95 * name -- str with the name of the sequence
96 * description -- str with description of the sequence
97 * source -- str denoting source of the sequence
99 Any of them may be empty (i.e. hold empty string)
100 """
102 types = base
103 """Mapping of related types. SHOULD be redefined in subclasses."""
105 name = ''
106 description = ''
107 source = ''
109 @classmethod
110 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
111 """Create sequence from a list of monomer objecst."""
112 result = cls(monomers)
113 if name:
114 result.name = name
115 if description:
116 result.description = description
117 if source:
118 result.source = source
119 return result
121 @classmethod
122 def from_string(cls, string, name='', description='', source=''):
123 """Create sequences from string of one-letter codes."""
124 monomer = cls.types.Monomer.from_code1
125 monomers = [monomer(letter) for letter in string]
126 return cls.from_monomers(monomers, name, description, source)
128 def __repr__(self):
129 return '<Sequence %s>' % str(self)
131 def __str__(self):
132 """Returns sequence of one-letter codes."""
133 return ''.join(monomer.code1 for monomer in self)
135 def __hash__(self):
136 """Hash sequence by identity."""
137 return id(self)
139 class Alignment(object):
140 """Alignment. It is a list of Columns."""
142 types = base
143 """Mapping of related types. SHOULD be redefined in subclasses."""
145 sequences = None
146 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
148 def __init__(self):
149 """Initialize empty alignment."""
150 self.sequences = []
151 self.columns = []
153 # Alignment grow & IO methods
154 # ==============================
156 def append_sequence(self, sequence):
157 """Add sequence to alignment. Return self.
159 If sequence is too short, pad it with gaps on the right.
160 """
161 self.sequences.append(sequence)
162 self._pad_to_width(len(sequence))
163 for column, monomer in zip(self.columns, sequence):
164 column[sequence] = monomer
165 return self
167 def append_row_from_string(self, string,
168 name='', description='', source='', gaps=default_gaps):
169 """Add row from a string of one-letter codes and gaps. Return self."""
170 Sequence = self.types.Sequence
171 without_gaps = util.remove_each(string, gaps)
172 sequence = Sequence.from_string(without_gaps, name, description, source)
173 self._pad_to_width(len(string))
174 non_gap_columns = [column
175 for column, char in zip(self.columns, string)
176 if char not in gaps
178 for monomer, column in zip(sequence, non_gap_columns):
179 column[sequence] = monomer
180 self.sequences.append(sequence)
181 return self
183 def append_row_with_gaps(self, row, sequence):
184 """Add row from row_as_list representation and sequence. Return self."""
185 self.sequences.append(sequence)
186 self._pad_to_width(len(row))
187 for column, monomer in zip(self.columns, row):
188 if monomer:
189 column[sequence] = monomer
190 return self
192 def _pad_to_width(self, n):
193 """Pad alignment with empty columns on the right to width n."""
194 for i in range(len(self.columns), n):
195 self.columns.append(Column())
197 def append_file(self, file, format='fasta', gaps=default_gaps):
198 """Append sequences from file to alignment. Return self.
200 If sequences in file have gaps (detected as characters belonging to
201 `gaps` set), treat them accordingly.
202 """
203 sequences = []
204 if format == 'fasta':
205 sequences = fileio.FastaIo(file).get_all_strings()
206 elif format == 'msf':
207 sequences = fileio.MsfIo(file).get_all_strings()
208 else:
209 raise Exception("We don't support other formats yet")
210 for (name, description, body) in sequences:
211 self.append_row_from_string(body, name, description, file.name, gaps)
212 return self
214 def to_file(self, file, format='fasta', gap='-'):
215 """Write alignment in FASTA file as sequences with gaps."""
216 def char(monomer):
217 if monomer:
218 return monomer.code1
219 return gap
220 if format == 'fasta':
221 io = fileio.FastaIo(file)
222 elif format == 'msf':
223 io = fileio.MsfIo(file)
224 else:
225 raise Exception("We don't support other formats yet")
226 for row in self.rows_as_lists():
227 seq = row.sequence
228 line = "".join(map(char, row))
229 io.save_string(line, seq.name, seq.description)
231 # Data access methods for alignment
232 # =================================
234 def rows(self):
235 """Return list of rows (temporary objects) in alignment.
237 Each row is a dictionary of { column : monomer }.
239 For gap positions there is no key for the column in row.
241 Each row has attribute `sequence` pointing to the sequence the row is
242 describing.
244 Modifications of row have no effect on the alignment.
245 """
246 # For now, the function returns a list rather than iterator.
247 # It is yet to see, whether memory performance here becomes critical,
248 # or is random access useful.
249 rows = []
250 for sequence in self.sequences:
251 row = util.UserDict()
252 row.sequence = sequence
253 for column in self.columns:
254 if sequence in column:
255 row[column] = column[sequence]
256 rows.append(row)
257 return rows
259 def rows_as_lists(self):
260 """Return list of rows (temporary objects) in alignment.
262 Each row here is a list of either monomer or None (for gaps).
264 Each row has attribute `sequence` pointing to the sequence of row.
266 Modifications of row have no effect on the alignment.
267 """
268 rows = []
269 for sequence in self.sequences:
270 row = util.UserList()
271 row.sequence = sequence
272 for column in self.columns:
273 row.append(column.get(sequence))
274 rows.append(row)
275 return rows
277 def columns_as_lists(self):
278 """Return list of columns (temorary objects) in alignment.
280 Each column here is a list of either monomer or None (for gaps).
282 Items of column are sorted in the same way as alignment.sequences.
284 Modifications of column have no effect on the alignment.
285 """
286 columns = []
287 for column in self.columns:
288 col = []
289 for sequence in self.sequences:
290 col.append(column.get(sequence))
291 columns.append(col)
292 return columns
294 # Alignment / Block editing methods
295 # =================================
297 def flush(self, whence='left'):
298 """Remove all gaps from alignment and flush results to one side.
300 `whence` must be one of 'left', 'right' or 'center'
301 """
302 if whence == 'left':
303 from processors import Left as Flush
304 elif whence == 'right':
305 from processors import Right as Flush
306 elif whence == 'center':
307 from processors import Center as Flush
308 else:
309 raise AssertionError, "Whence must be left, right or center"
310 self.realign(Flush())
312 def remove_gap_columns(self):
313 """Remove all empty columns."""
314 for n, column in reversed(enumerate(self.columns)):
315 if column == {}:
316 self.columns[n:n+1] = []
318 def _wipe_row(self, sequence):
319 """Turn all row positions into gaps (but keep sequences intact)."""
320 for column in self.columns:
321 if sequence in column:
322 del column[sequence]
324 def _merge(self, dst, new, merge):
325 """Replace contents of `dst` with those of `new`.
327 Replace contents of elements using function `merge(dst_el, new_le)`.
328 """
329 for el, new_el in zip(dst, new):
330 merge(el, new_el)
331 dst[len(dst):] = new[len(dst):]
332 del dst[len(new):]
334 def _replace_sequence_contents(self, new, copy_descriptions):
335 """Replace contents of sequences with those of `new` alignment."""
336 # XXX: we manually copy sequence contents here
337 # XXX: we only copy, overlapping parts and link to the rest
338 def merge_monomers(dst, new):
339 dst.__class__ = new.__class__
340 def merge_sequences(dst, new):
341 if copy_descriptions:
342 vars(dst).update(vars(new))
343 self._merge(dst, new, merge_monomers)
344 self._merge(self.sequences, new.sequences, merge_sequences)
346 def _replace_column_contents(self, new):
347 """Replace column contents with those of `new` alignment.
349 In other words: copy gap patterns from `new` to `self`.
351 `self.sequences` and `new.sequences` should have the same contents.
352 """
353 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
354 sequence = row.sequence
355 monomers = filter(None, row)
356 assert len(monomers) == len(filter(None, new_row))
357 self._wipe_row(sequence)
358 non_gap_columns = [column
359 for column, monomer in zip(self.columns, new_row)
360 if monomer
362 for monomer, column in zip(monomers, non_gap_columns):
363 column[sequence] = monomer
365 def _replace_contents(self, new, copy_descriptions, copy_contents):
366 """Replace alignment contents with those of other alignment."""
367 if copy_contents:
368 self._replace_sequence_contents(new, copy_descriptions)
369 self._replace_column_contents(new)
371 def process(self, function, copy_descriptions=True, copy_contents=True):
372 """Apply function to the alignment (or block); inject results back.
374 - `function(block)` must return block with same line order.
375 - if `copy_descriptions` is False, ignore new sequence names.
376 - if `copy_contents` is False, don't copy sequence contents too.
378 `function` (object) may have attributes `copy_descriptions` and
379 `copy_contents`, which override the same named arguments.
380 """
381 new = function(self)
382 if hasattr(function, 'copy_descriptions'):
383 copy_descriptions = function.copy_descriptions
384 if hasattr(function, 'copy_contents'):
385 copy_contents = function.copy_contents
386 self._replace_contents(new, copy_descriptions, copy_contents)
388 def realign(self, function):
389 """Realign self.
391 I.e.: apply function to self to produce a new alignment, then update
392 self to have the same gap patterns as the new alignment.
394 This is the same as process(function, False, False)
395 """
396 new = function(self)
397 self._replace_column_contents(new)
399 class Column(dict):
400 """Column of alignment.
402 Column is a dict of { sequence : monomer }.
404 For sequences that have gaps in current row, given key is not present in
405 the column.
406 """
408 types = base
409 """Mapping of related types. SHOULD be redefined in subclasses."""
411 def __hash__(self):
412 """Return hash by identity."""
413 return id(self)
415 class Block(Alignment):
416 """Block of alignment.
418 Block is an intersection of several rows & columns. (The collections of
419 rows and columns are represented as ordered lists, to retain display order
420 of Alignment or add ability to tweak it). Most of blocks look like
421 rectangular part of alignment if you shuffle alignment rows the right way.
422 """
424 alignment = None
425 """Alignment the block belongs to."""
427 sequences = ()
428 """List of sequences in block."""
430 columns = ()
431 """List of columns in block."""
433 @classmethod
434 def from_alignment(cls, alignment, sequences=None, columns=None):
435 """Build new block from alignment.
437 If sequences are not given, the block uses all sequences in alignment.
439 If columns are not given, the block uses all columns in alignment.
441 In both cases we use exactly the list used in alignment, thus, if new
442 sequences or columns are added to alignment, the block tracks this too.
443 """
444 if sequences is None:
445 sequences = alignment.sequences
446 if columns is None:
447 columns = alignment.columns
448 block = cls()
449 block.alignment = alignment
450 block.sequences = sequences
451 block.columns = columns
452 return block
454 # vim: set ts=4 sts=4 sw=4 et: