allpy
changeset 711:21cfc7897a8f
Implemented markup fileIO (closes #56)
This is done by adding the file format 'markup' or 'markup:formatname', where
'formatname' is any otherwise known alignment format.
The file format is described briefly in the fileio.MarkupFile docstring.
This commit also contains an example of defining a Markup saving mixin,
markups.IntMarkupMixin, and a test for it.
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Thu, 07 Jul 2011 22:32:21 +0400 |
parents | 4e0312f00a6c |
children | 9481d408ca93 |
files | allpy/base.py allpy/fileio.py allpy/markups.py test/test_markups.py |
diffstat | 4 files changed, 208 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- a/allpy/base.py Thu Jul 07 22:27:14 2011 +0400 1.2 +++ b/allpy/base.py Thu Jul 07 22:32:21 2011 +0400 1.3 @@ -504,6 +504,13 @@ 1.4 def refresh(self): 1.5 pass 1.6 1.7 + @classmethod 1.8 + def from_record(cls, alignment, record, name=None): 1.9 + return cls(alignment, name) 1.10 + 1.11 + def to_record(self): 1.12 + return {} 1.13 + 1.14 class SequenceMarkup(Markup): 1.15 1.16 def __init__(self, sequence, name=None):
2.1 --- a/allpy/fileio.py Thu Jul 07 22:27:14 2011 +0400 2.2 +++ b/allpy/fileio.py Thu Jul 07 22:32:21 2011 +0400 2.3 @@ -3,11 +3,21 @@ 2.4 from tempfile import NamedTemporaryFile 2.5 import util 2.6 2.7 +def get_markups_class(classname): 2.8 + """This ugly helper is to avoid bad untimely import loops.""" 2.9 + import markups 2.10 + return getattr(markups, classname) 2.11 + 2.12 class File(object): 2.13 """Automatical file IO.""" 2.14 def __new__(cls, file, format="fasta", **kw): 2.15 if format == "fasta": 2.16 return FastaFile(file, **kw) 2.17 + elif format == 'markup': 2.18 + return MarkupFile(file, **kw) 2.19 + elif format.startswith('markup:'): 2.20 + subformat = format.split(':',1)[1] 2.21 + return MarkupFile(file, format=subformat, **kw) 2.22 else: 2.23 return EmbossFile(file, format, **kw) 2.24 2.25 @@ -67,6 +77,138 @@ 2.26 body = util.remove_each(body, " \n\r\t\v") 2.27 yield (name, description, body) 2.28 2.29 +class MarkupFile(AlignmentFile): 2.30 + """Parser & writer for our own marked alignment file format. 2.31 + 2.32 + Marked alignment file consists of a list of records, separated with one or 2.33 + more empty lines. Each record consists of type name, header and optional 2.34 + contents. Type name is a line, containing just one word, describing the 2.35 + record type. Header is a sequence of lines, each in format `key: value`. 2.36 + Content, if present, is separated from header with an empty line. 2.37 + 2.38 + Type names and header key names are case-insensitive. 
2.39 + 2.40 + Known record types now are: 2.41 + 2.42 + - `alignment` -- this must be the last record in file for now 2.43 + - `sequence_markup` 2.44 + - `alignment_markup` 2.45 + 2.46 + Example:: 2.47 + 2.48 + sequence_markup 2.49 + sequence_name: cyb5_mouse 2.50 + sequence_description: 2.51 + name: pdb_residue_number 2.52 + type: SequencePDBResidueNumberMarkup 2.53 + markup: -,12,121,122,123,124,13,14,15,-,-,16 2.54 + 2.55 + alignment_markup 2.56 + name: geometrical_core 2.57 + type: AlignmentGeometricalCoreMarkup 2.58 + markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,- 2.59 + 2.60 + alignment 2.61 + format: fasta 2.62 + 2.63 + > cyb5_mouse 2.64 + seqvencemouse 2.65 + """ 2.66 + 2.67 + _empty_line = '' 2.68 + """Helper attribute for write_empty_line.""" 2.69 + 2.70 + def write_alignment(self, alignment): 2.71 + """Write alignment to file.""" 2.72 + self.write_markups(alignment.markups, 'alignment_markup') 2.73 + for sequence in alignment.sequences: 2.74 + record = { 2.75 + 'sequence_name': sequence.name, 2.76 + 'sequence_description': sequence.description, 2.77 + } 2.78 + self.write_markups(sequence.markups, 'sequence_markup', record) 2.79 + record = {'type': 'alignment', 'format': self.format} 2.80 + self.write_record(record) 2.81 + self.write_empty_line() 2.82 + alignment.to_file(self.file) 2.83 + 2.84 + def write_markups(self, markups, type, pre_record={}): 2.85 + """Write a dictionary of markups as series of records.""" 2.86 + for name, markup in markups.items(): 2.87 + record = markup.to_record() 2.88 + record.update(pre_record) 2.89 + record['type'] = type 2.90 + record['name'] = name 2.91 + record['class'] = markup.__class__.__name__ 2.92 + self.write_record(record) 2.93 + 2.94 + def write_record(self, record): 2.95 + """Write record to file. 
Add new line before every but first record.""" 2.96 + self.write_empty_line() 2.97 + self.file.write('%s\n' % record['type']) 2.98 + del record['type'] 2.99 + for key, value in record.items(): 2.100 + self.file.write('%s: %s\n' % (key, value)) 2.101 + 2.102 + def write_empty_line(self): 2.103 + """Add empty line every time except the first call.""" 2.104 + self.file.write(self._empty_line) 2.105 + self._empty_line = '\n' 2.106 + 2.107 + def read_alignment(self, alignment): 2.108 + """Read alignment from file.""" 2.109 + for record in list(self.read_records(alignment)): 2.110 + handler = getattr(self, 'add_%s' % record['type']) 2.111 + handler(alignment, record) 2.112 + 2.113 + def add_sequence_markup(self, alignment, record): 2.114 + """Found sequence markup record in file. Do something about it.""" 2.115 + for sequence in alignment.sequences: 2.116 + if sequence.name == record['sequence_name']: 2.117 + description = record.get('sequence_description') 2.118 + if description: 2.119 + assert sequence.description == description 2.120 + cls = get_markups_class(record['class']) 2.121 + cls.from_record(sequence, record, name=record.get('name')) 2.122 + return 2.123 + raise AssertionError("Could not find sequence in alignment") 2.124 + 2.125 + def add_alignment_markup(self, alignment, record): 2.126 + """Found alignment markup record in file. Do something about it.""" 2.127 + cls = get_markups_class(record['class']) 2.128 + cls.from_record(alignment, record, name=record.get('name')) 2.129 + 2.130 + def add_alignment(self, alignment, record): 2.131 + """Found alignment record. 
It has been handled in read_payload.""" 2.132 + pass 2.133 + 2.134 + def read_records(self, alignment): 2.135 + """Read records and return them as a list of dicts.""" 2.136 + for line in self.file: 2.137 + if line.strip() == "": 2.138 + continue 2.139 + yield self.read_record(alignment, line) 2.140 + 2.141 + def read_record(self, alignment, type): 2.142 + """Read record headers and record payload.""" 2.143 + type = type.strip().lower() 2.144 + record = {'type': type} 2.145 + for line in self.file: 2.146 + if line.strip() == "": 2.147 + self.read_payload(alignment, record, type) 2.148 + return record 2.149 + key, value = line.split(':', 1) 2.150 + key = key.strip().lower() 2.151 + value = value.strip() 2.152 + record[key] = value 2.153 + return record 2.154 + 2.155 + def read_payload(self, alignment, record, type): 2.156 + """Read record payload, if necessary.""" 2.157 + if type == 'alignment': 2.158 + io = File(self.file, record.get('format', 'fasta')) 2.159 + io.read_alignment(alignment) 2.160 + 2.161 class EmbossFile(AlignmentFile): 2.162 """Parser & writer for file formats supported by EMBOSS.""" 2.163
3.1 --- a/allpy/markups.py Thu Jul 07 22:27:14 2011 +0400 3.2 +++ b/allpy/markups.py Thu Jul 07 22:32:21 2011 +0400 3.3 @@ -1,5 +1,27 @@ 3.4 import base 3.5 3.6 +class IntMarkupMixin(base.Markup): 3.7 + 3.8 + @classmethod 3.9 + def from_record(cls, container, record, name=None): 3.10 + assert record['io-class'] == 'IntMarkup' 3.11 + result = cls(container, name=name) 3.12 + separator = record.get('separator', ',') 3.13 + values = record['markup'].split(separator) 3.14 + assert len(values) == len(result.sorted_keys()) 3.15 + for key, value in zip(result.sorted_keys(), values): 3.16 + if value: 3.17 + result[key] = int(value) 3.18 + return result 3.19 + 3.20 + def to_record(self): 3.21 + def fmt(value): 3.22 + if value is None: 3.23 + return "" 3.24 + return str(value) 3.25 + values = [fmt(self.get(key)) for key in self.sorted_keys()] 3.26 + return {'markup': ','.join(values), 'io-class': 'IntMarkup'} 3.27 + 3.28 class SequenceNumberMarkup(base.SequenceMarkup): 3.29 3.30 name = 'number'
4.1 --- a/test/test_markups.py Thu Jul 07 22:27:14 2011 +0400 4.2 +++ b/test/test_markups.py Thu Jul 07 22:32:21 2011 +0400 4.3 @@ -1,3 +1,4 @@ 4.4 +from StringIO import StringIO 4.5 from allpy import base 4.6 from allpy import protein 4.7 from allpy import markups 4.8 @@ -34,4 +35,40 @@ 4.9 seq[2].my_markup = 4 4.10 assert markup[seq[2]] == 4 4.11 4.12 +def test_io(): 4.13 + aln = (protein.Alignment(). 4.14 + append_row_from_string('aseq-vence', name='a'). 4.15 + append_row_from_string('ase-qret--', name='b') 4.16 + ) 4.17 + 4.18 + class MySequenceMarkup(base.SequenceMarkup, markups.IntMarkupMixin): 4.19 + pass 4.20 + class MyAlignmentMarkup(base.AlignmentMarkup, markups.IntMarkupMixin): 4.21 + pass 4.22 + markups.MySequenceMarkup = MySequenceMarkup 4.23 + markups.MyAlignmentMarkup = MyAlignmentMarkup 4.24 + 4.25 + s = aln.sequences[0] 4.26 + c = aln.columns 4.27 + m1 = MySequenceMarkup(s, name='m1') 4.28 + m1[s[5]] = 5 4.29 + m1[s[3]] = 3 4.30 + m2 = MyAlignmentMarkup(aln, name='m2') 4.31 + m2[c[5]] = 5 4.32 + m2[c[6]] = 6 4.33 + 4.34 + file = StringIO() 4.35 + aln.to_file(file, format='markup') 4.36 + import sys 4.37 + file.seek(0) 4.38 + 4.39 + out = protein.Alignment().append_file(file, format='markup') 4.40 + s = out.sequences[0] 4.41 + c = out.columns 4.42 + print out.markups 4.43 + print s.markups 4.44 + print out.sequences[1].markups 4.45 + assert s[5].m1 == 5 and s[3].m1 == 3 4.46 + assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6 4.47 + 4.48 # vim: set ts=4 sts=4 sw=4 et: