Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/1a8ddc6e2eee/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 21:32:06 2013
Кодировка:
allpy: 1a8ddc6e2eee allpy/base.py

allpy

view allpy/base.py @ 536:1a8ddc6e2eee

mkcodes.py, codes.py: remove entries with 1-letter code longer than 1 letter
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 28 Feb 2011 19:24:28 +0300
parents 23af8cf15344
children 364232e42888
line source
1 import sys
2 import re
4 import util
5 import fileio
7 # import this very module as means of having all related classes in one place
8 import base
10 default_gaps = set((".", "-", "~"))
11 """Set of characters to recoginze as gaps when parsing alignment."""
13 class Monomer(object):
14 """Monomer object."""
16 type = None
17 """Either of 'dna', 'rna', 'protein'."""
19 types = base
20 """Mapping of related types. SHOULD be redefined in subclasses."""
22 by_code1 = {}
23 """A mapping from 1-letter code to Monomer subclass."""
25 by_code3 = {}
26 """A mapping from 3-letter code to Monomer subclass."""
28 by_name = {}
29 """A mapping from full monomer name to Monomer subclass."""
31 @classmethod
32 def _subclass(cls, name='', code1='', code3='', is_modified=False):
33 """Create new subclass of Monomer for given monomer type."""
34 class TheMonomer(cls):
35 pass
36 name = name.strip().capitalize()
37 code1 = code1.upper()
38 code3 = code3.upper()
39 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
40 TheMonomer.name = name
41 TheMonomer.code1 = code1
42 TheMonomer.code3 = code3
43 TheMonomer.is_modified = is_modified
44 if not is_modified:
45 cls.by_code1[code1] = TheMonomer
46 cls.by_code3[code3] = TheMonomer
47 cls.by_name[name] = TheMonomer
48 # We duplicate distinguished long names into Monomer itself, so that we
49 # can use Monomer.from_code3 to create the relevant type of monomer.
50 Monomer.by_code3[code3] = TheMonomer
51 Monomer.by_name[name] = TheMonomer
53 @classmethod
54 def _initialize(cls, codes=None):
55 """Create all relevant subclasses of Monomer."""
56 for code1, is_modified, code3, name in codes:
57 cls._subclass(name, code1, code3, is_modified)
59 @classmethod
60 def from_code1(cls, code1):
61 """Create new monomer from 1-letter code."""
62 return cls.by_code1[code1.upper()]()
64 @classmethod
65 def from_code3(cls, code3):
66 """Create new monomer from 3-letter code."""
67 return cls.by_code3[code3.upper()]()
69 @classmethod
70 def from_name(cls, name):
71 """Create new monomer from full name."""
72 return cls.by_name[name.strip().capitalize()]()
74 def __repr__(self):
75 return '<Monomer %s>' % self.code3
77 def __str__(self):
78 """Returns one-letter code"""
79 return self.code1
81 def __eq__(self, other):
82 """Monomers within same monomer type are compared by code1."""
83 assert self.type == other.type
84 return self.code1 == other.code1
86 def __ne__(self, other):
87 return not (self == other)
89 class Sequence(list):
90 """Sequence of Monomers.
92 This behaves like list of monomer objects. In addition to standard list
93 behaviour, Sequence has the following attributes:
95 * name -- str with the name of the sequence
96 * description -- str with description of the sequence
97 * source -- str denoting source of the sequence
99 Any of them may be empty (i.e. hold empty string)
100 """
102 types = base
103 """Mapping of related types. SHOULD be redefined in subclasses."""
105 name = ''
106 description = ''
107 source = ''
109 @classmethod
110 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
111 """Create sequence from a list of monomer objecst."""
112 result = cls(monomers)
113 if name:
114 result.name = name
115 if description:
116 result.description = description
117 if source:
118 result.source = source
119 return result
121 @classmethod
122 def from_string(cls, string, name='', description='', source=''):
123 """Create sequences from string of one-letter codes."""
124 monomer = cls.types.Monomer.from_code1
125 monomers = [monomer(letter) for letter in string]
126 return cls.from_monomers(monomers, name, description, source)
128 def __repr__(self):
129 return '<Sequence %s>' % str(self)
131 def __str__(self):
132 """Returns sequence of one-letter codes."""
133 return ''.join(monomer.code1 for monomer in self)
135 def __hash__(self):
136 """Hash sequence by identity."""
137 return id(self)
139 class Alignment(object):
140 """Alignment. It is a list of Columns."""
142 types = base
143 """Mapping of related types. SHOULD be redefined in subclasses."""
145 sequences = None
146 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
148 def __init__(self):
149 """Initialize empty alignment."""
150 self.sequences = []
151 self.columns = []
153 # Alignment grow & IO methods
154 # ==============================
156 def append_sequence(self, sequence):
157 """Add sequence to alignment. Return self.
159 If sequence is too short, pad it with gaps on the right.
160 """
161 self.sequences.append(sequence)
162 self._pad_to_width(len(sequence))
163 for column, monomer in zip(self.columns, sequence):
164 column[sequence] = monomer
165 return self
167 def append_row_from_string(self, string,
168 name='', description='', source='', gaps=default_gaps):
169 """Add row from a string of one-letter codes and gaps. Return self."""
170 Sequence = self.types.Sequence
171 without_gaps = util.remove_each(string, gaps)
172 sequence = Sequence.from_string(without_gaps, name, description, source)
173 self._pad_to_width(len(string))
174 non_gap_columns = [column
175 for column, char in zip(self.columns, string)
176 if char not in gaps
178 for monomer, column in zip(sequence, non_gap_columns):
179 column[sequence] = monomer
180 self.sequences.append(sequence)
181 return self
183 def _pad_to_width(self, n):
184 """Pad alignment with empty columns on the right to width n."""
185 for i in range(len(self.columns), n):
186 self.columns.append(Column())
188 def append_file(self, file, format='fasta', gaps=default_gaps):
189 """Append sequences from file to alignment. Return self.
191 If sequences in file have gaps (detected as characters belonging to
192 `gaps` set), treat them accordingly.
193 """
194 sequences = []
195 if format == 'fasta':
196 sequences = fileio.FastaIo(file).get_all_strings()
197 elif format == 'msf':
198 sequences = fileio.MsfIo(file).get_all_strings()
199 else:
200 raise Exception("We don't support other formats yet")
201 for (name, description, body) in sequences:
202 self.append_row_from_string(body, name, description, file.name, gaps)
203 return self
205 def to_file(self, file, format='fasta', gap='-'):
206 """Write alignment in FASTA file as sequences with gaps."""
207 def char(monomer):
208 if monomer:
209 return monomer.code1
210 return gap
211 if format == 'fasta':
212 io = fileio.FastaIo(file)
213 elif format == 'msf':
214 io = fileio.MsfIo(file)
215 else:
216 raise Exception("We don't support other formats yet")
217 for row in self.rows_as_lists():
218 seq = row.sequence
219 line = "".join(map(char, row))
220 io.save_string(line, seq.name, seq.description)
222 # Data access methods for alignment
223 # =================================
225 def rows(self):
226 """Return list of rows (temporary objects) in alignment.
228 Each row is a dictionary of { column : monomer }.
230 For gap positions there is no key for the column in row.
232 Each row has attribute `sequence` pointing to the sequence the row is
233 describing.
235 Modifications of row have no effect on the alignment.
236 """
237 # For now, the function returns a list rather than iterator.
238 # It is yet to see, whether memory performance here becomes critical,
239 # or is random access useful.
240 rows = []
241 for sequence in self.sequences:
242 row = util.UserDict()
243 row.sequence = sequence
244 for column in self.columns:
245 if sequence in column:
246 row[column] = column[sequence]
247 rows.append(row)
248 return rows
250 def rows_as_lists(self):
251 """Return list of rows (temporary objects) in alignment.
253 Each row here is a list of either monomer or None (for gaps).
255 Each row has attribute `sequence` pointing to the sequence of row.
257 Modifications of row have no effect on the alignment.
258 """
259 rows = []
260 for sequence in self.sequences:
261 row = util.UserList()
262 row.sequence = sequence
263 for column in self.columns:
264 row.append(column.get(sequence))
265 rows.append(row)
266 return rows
268 def columns_as_lists(self):
269 """Return list of columns (temorary objects) in alignment.
271 Each column here is a list of either monomer or None (for gaps).
273 Items of column are sorted in the same way as alignment.sequences.
275 Modifications of column have no effect on the alignment.
276 """
277 columns = []
278 for column in self.columns:
279 col = []
280 for sequence in self.sequences:
281 col.append(column.get(sequence))
282 columns.append(col)
283 return columns
285 # Alignment / Block editing methods
286 # =================================
288 def _flush_row(self, row, whence='left'):
289 """Helper for `flush`: flush to one side all monomers in one row."""
290 row = filter(None, row)
291 padding = [None] * len(self.columns)
292 if whence == 'left':
293 return row + padding
294 if whence == 'right':
295 return padding + row
296 if whence == 'center':
297 pad_len = (len(self.columns) - len(row)) // 2
298 # vvv fix padding for case when length is odd: better have more
299 pad_len += len(self.columns) - 2 * pad_len
300 padding = [None] * pad_len
301 return padding + row + padding
302 assert True, "whence must be either 'left' or 'right' or 'center'"
304 def flush(self, whence='left'):
305 """Remove all gaps from alignment and flush results to one side.
307 `whence` must be one of 'left', 'right' or 'center'
308 """
309 for row in self.rows_as_lists():
310 sequence = row.sequence
311 row = self._flush_row(row, whence)
312 for monomer, column in zip(row, self.columns):
313 if monomer:
314 column[sequence] = monomer
315 elif sequence in column:
316 del column[sequence]
318 def remove_gap_columns(self):
319 """Remove all empty columns."""
320 for n, column in reversed(enumerate(self.columns)):
321 if column == {}:
322 self.columns[n:n+1] = []
324 def _wipe_row(self, sequence):
325 """Turn all row positions into gaps (but keep sequences intact)."""
326 for column in self.columns:
327 if sequence in column:
328 del column[sequence]
330 def _merge(self, dst, new, merge):
331 """Replace contents of `dst` with those of `new`.
333 Replace contents of elements using function `merge(dst_el, new_le)`.
334 """
335 for el, new_el in zip(dst, new):
336 merge(el, new_el)
337 dst[len(dst):] = new[len(dst):]
338 del dst[len(new):]
340 def _replace_sequence_contents(self, new, copy_descriptions):
341 """Replace contents of sequences with those of `new` alignment."""
342 # XXX: we manually copy sequence contents here
343 # XXX: we only copy, overlapping parts and link to the rest
344 def merge_monomers(dst, new):
345 dst.__class__ = new.__class__
346 def merge_sequences(dst, new):
347 if copy_descriptions:
348 vars(dst).update(vars(new))
349 self._merge(dst, new, merge_monomers)
350 self._merge(self.sequences, new.sequences, merge_sequences)
352 def _replace_column_contents(self, new):
353 """Replace column contents with those of `new` alignment.
355 In other words: copy gap patterns from `new` to `self`.
357 `self.sequences` and `new.sequences` should have the same contents.
358 """
359 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
360 sequence = row.sequence
361 monomers = filter(None, row)
362 assert len(monomers) == len(filter(None, new_row))
363 self._wipe_row(sequence)
364 non_gap_columns = [column
365 for column, monomer in zip(self.columns, new_row)
366 if monomer
368 for monomer, column in zip(monomers, non_gap_columns):
369 column[sequence] = monomer
371 def _replace_contents(self, new, copy_descriptions, copy_contents):
372 """Replace alignment contents with those of other alignment."""
373 if copy_contents:
374 self._replace_sequence_contents(new, copy_descriptions)
375 self._replace_column_contents(new)
377 def process(self, function, copy_descriptions=True, copy_contents=True):
378 """Apply function to the alignment (or block); inject results back.
380 - `function(block)` must return block with same line order.
381 - if `copy_descriptions` is False, ignore new sequence names.
382 - if `copy_contents` is False, don't copy sequence contents too.
384 `function` (object) may have attributes `copy_descriptions` and
385 `copy_contents`, which override the same named arguments.
386 """
387 new = function(self)
388 if hasattr(function, 'copy_descriptions'):
389 copy_descriptions = function.copy_descriptions
390 if hasattr(function, 'copy_contents'):
391 copy_contents = function.copy_contents
392 self._replace_contents(new, copy_descriptions, copy_contents)
394 class Column(dict):
395 """Column of alignment.
397 Column is a dict of { sequence : monomer }.
399 For sequences that have gaps in current row, given key is not present in
400 the column.
401 """
403 types = base
404 """Mapping of related types. SHOULD be redefined in subclasses."""
406 def __hash__(self):
407 """Return hash by identity."""
408 return id(self)
410 class Block(Alignment):
411 """Block of alignment.
413 Block is an intersection of several rows & columns. (The collections of
414 rows and columns are represented as ordered lists, to retain display order
415 of Alignment or add ability to tweak it). Most of blocks look like
416 rectangular part of alignment if you shuffle alignment rows the right way.
417 """
419 alignment = None
420 """Alignment the block belongs to."""
422 sequences = ()
423 """List of sequences in block."""
425 columns = ()
426 """List of columns in block."""
428 @classmethod
429 def from_alignment(cls, alignment, sequences=None, columns=None):
430 """Build new block from alignment.
432 If sequences are not given, the block uses all sequences in alignment.
434 If columns are not given, the block uses all columns in alignment.
436 In both cases we use exactly the list used in alignment, thus, if new
437 sequences or columns are added to alignment, the block tracks this too.
438 """
439 if sequences is None:
440 sequences = alignment.sequences
441 if columns is None:
442 columns = alignment.columns
443 block = cls()
444 block.alignment = alignment
445 block.sequences = sequences
446 block.columns = columns
447 return block
449 # vim: set ts=4 sts=4 sw=4 et: