Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/rev/21cfc7897a8f
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 01:10:00 2012
Кодировка:
allpy: 21cfc7897a8f

allpy

changeset 711:21cfc7897a8f

Implemented markup fileIO (closes #56) This is done by adding file format 'markup' or 'markup:formatname', where 'formatname' is an otherwise known alignment format. The file format is described briefly in the fileio.MarkupFile docstring. This commit also contains an example of defining a markup-saving mixin, markups.IntMarkupMixin, and a test for it.
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Thu, 07 Jul 2011 22:32:21 +0400
parents 4e0312f00a6c
children 9481d408ca93
files allpy/base.py allpy/fileio.py allpy/markups.py test/test_markups.py
diffstat 4 files changed, 208 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- a/allpy/base.py	Thu Jul 07 22:27:14 2011 +0400
     1.2 +++ b/allpy/base.py	Thu Jul 07 22:32:21 2011 +0400
     1.3 @@ -504,6 +504,13 @@
     1.4      def refresh(self):
     1.5          pass
     1.6  
     1.7 +    @classmethod
     1.8 +    def from_record(cls, alignment, record, name=None):
     1.9 +        return cls(alignment, name)
    1.10 +
    1.11 +    def to_record(self):
    1.12 +        return {}
    1.13 +
    1.14  class SequenceMarkup(Markup):
    1.15  
    1.16      def __init__(self, sequence, name=None):
     2.1 --- a/allpy/fileio.py	Thu Jul 07 22:27:14 2011 +0400
     2.2 +++ b/allpy/fileio.py	Thu Jul 07 22:32:21 2011 +0400
     2.3 @@ -3,11 +3,21 @@
     2.4  from tempfile import NamedTemporaryFile
     2.5  import util
     2.6  
     2.7 +def get_markups_class(classname):
     2.8 +    """This ugly helper is to avoid bad untimely import loops."""
     2.9 +    import markups
    2.10 +    return getattr(markups, classname)
    2.11 +
    2.12  class File(object):
    2.13      """Automatical file IO."""
    2.14      def __new__(cls, file, format="fasta", **kw):
    2.15          if format == "fasta":
    2.16              return FastaFile(file, **kw)
    2.17 +        elif format == 'markup':
    2.18 +            return MarkupFile(file, **kw)
    2.19 +        elif format.startswith('markup:'):
    2.20 +            subformat = format.split(':',1)[1]
    2.21 +            return MarkupFile(file, format=subformat, **kw)
    2.22          else:
    2.23              return EmbossFile(file, format, **kw)
    2.24  
    2.25 @@ -67,6 +77,138 @@
    2.26              body = util.remove_each(body, " \n\r\t\v")
    2.27              yield (name, description, body)
    2.28  
    2.29 +class MarkupFile(AlignmentFile):
    2.30 +    """Parser & writer for our own marked alignment file format.
    2.31 +
    2.32 +    Marked alignment file consists of a list of records, separated with one or
    2.33 +    more empty lines. Each record consists of type name, header and optional
    2.34 +    contents. Type name is a line, containing just one word, describing the
    2.35 +    record type. Header is a sequence of lines, each in format `key: value`.
    2.36 +    Content, if present, is separated from header with an empty line.
    2.37 +
    2.38 +    Type names and header key names are case-insensitive.
    2.39 +
    2.40 +    Known record types now are:
    2.41 +
    2.42 +    - `alignment` -- this must be the last record in file for now
    2.43 +    - `sequence_markup`
    2.44 +    - `alignment_markup`
    2.45 +
    2.46 +    Example::
    2.47 +
    2.48 +        sequence_markup
    2.49 +        sequence_name: cyb5_mouse
    2.50 +        sequence_description:
    2.51 +        name: pdb_residue_number
    2.52 +        class: SequencePDBResidueNumberMarkup
    2.53 +        markup: -,12,121,122,123,124,13,14,15,-,-,16
    2.54 +
    2.55 +        alignment_markup
    2.56 +        name: geometrical_core
    2.57 +        class: AlignmentGeometricalCoreMarkup
    2.58 +        markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,-
    2.59 +
    2.60 +        alignment
    2.61 +        format: fasta
    2.62 +
    2.63 +        > cyb5_mouse
    2.64 +        seqvencemouse
    2.65 +    """
    2.66 +
    2.67 +    _empty_line = ''
    2.68 +    """Helper attribute for write_empty_line."""
    2.69 +
    2.70 +    def write_alignment(self, alignment):
    2.71 +        """Write alignment to file."""
    2.72 +        self.write_markups(alignment.markups, 'alignment_markup')
    2.73 +        for sequence in alignment.sequences:
    2.74 +            record = {
    2.75 +                'sequence_name': sequence.name,
    2.76 +                'sequence_description': sequence.description,
    2.77 +            }
    2.78 +            self.write_markups(sequence.markups, 'sequence_markup', record)
    2.79 +        record = {'type': 'alignment', 'format': self.format}
    2.80 +        self.write_record(record)
    2.81 +        self.write_empty_line()
    2.82 +        alignment.to_file(self.file)
    2.83 +
    2.84 +    def write_markups(self, markups, type, pre_record={}):
    2.85 +        """Write a dictionary of markups as series of records."""
    2.86 +        for name, markup in markups.items():
    2.87 +            record = markup.to_record()
    2.88 +            record.update(pre_record)
    2.89 +            record['type'] = type
    2.90 +            record['name'] = name
    2.91 +            record['class'] = markup.__class__.__name__
    2.92 +            self.write_record(record)
    2.93 +
    2.94 +    def write_record(self, record):
    2.95 +        """Write record to file. Put an empty line before every record but the first."""
    2.96 +        self.write_empty_line()
    2.97 +        self.file.write('%s\n' % record['type'])
    2.98 +        del record['type']
    2.99 +        for key, value in record.items():
   2.100 +            self.file.write('%s: %s\n' % (key, value))
   2.101 +
   2.102 +    def write_empty_line(self):
   2.103 +        """Add empty line every time except the first call."""
   2.104 +        self.file.write(self._empty_line)
   2.105 +        self._empty_line = '\n'
   2.106 +
   2.107 +    def read_alignment(self, alignment):
   2.108 +        """Read alignment from file."""
   2.109 +        for record in list(self.read_records(alignment)):
   2.110 +            handler = getattr(self, 'add_%s' % record['type'])
   2.111 +            handler(alignment, record)
   2.112 +
   2.113 +    def add_sequence_markup(self, alignment, record):
   2.114 +        """Found sequence markup record in file. Do something about it."""
   2.115 +        for sequence in alignment.sequences:
   2.116 +            if sequence.name == record['sequence_name']:
   2.117 +                description = record.get('sequence_description')
   2.118 +                if description:
   2.119 +                    assert sequence.description == description
   2.120 +                cls = get_markups_class(record['class'])
   2.121 +                cls.from_record(sequence, record, name=record.get('name'))
   2.122 +                return
   2.123 +        raise AssertionError("Could not find sequence in alignment")
   2.124 +
   2.125 +    def add_alignment_markup(self, alignment, record):
   2.126 +        """Found alignment markup record in file. Do something about it."""
   2.127 +        cls = get_markups_class(record['class'])
   2.128 +        cls.from_record(alignment, record, name=record.get('name'))
   2.129 +
   2.130 +    def add_alignment(self, alignment, record):
   2.131 +        """Found alignment record. It has been handled in read_payload."""
   2.132 +        pass
   2.133 +
   2.134 +    def read_records(self, alignment):
   2.135 +        """Read records and yield them one at a time as dicts."""
   2.136 +        for line in self.file:
   2.137 +            if line.strip() == "":
   2.138 +                continue
   2.139 +            yield self.read_record(alignment, line)
   2.140 +
   2.141 +    def read_record(self, alignment, type):
   2.142 +        """Read record headers and record payload."""
   2.143 +        type = type.strip().lower()
   2.144 +        record = {'type': type}
   2.145 +        for line in self.file:
   2.146 +            if line.strip() == "":
   2.147 +                self.read_payload(alignment, record, type)
   2.148 +                return record
   2.149 +            key, value = line.split(':', 1)
   2.150 +            key = key.strip().lower()
   2.151 +            value = value.strip()
   2.152 +            record[key] = value
   2.153 +        return record
   2.154 +
   2.155 +    def read_payload(self, alignment, record, type):
   2.156 +        """Read record payload, if necessary."""
   2.157 +        if type == 'alignment':
   2.158 +            io = File(self.file, record.get('format', 'fasta'))
   2.159 +            io.read_alignment(alignment)
   2.160 +
   2.161  class EmbossFile(AlignmentFile):
   2.162      """Parser & writer for file formats supported by EMBOSS."""
   2.163  
     3.1 --- a/allpy/markups.py	Thu Jul 07 22:27:14 2011 +0400
     3.2 +++ b/allpy/markups.py	Thu Jul 07 22:32:21 2011 +0400
     3.3 @@ -1,5 +1,27 @@
     3.4  import base
     3.5  
     3.6 +class IntMarkupMixin(base.Markup):
     3.7 +
     3.8 +    @classmethod
     3.9 +    def from_record(cls, container, record, name=None):
    3.10 +        assert record['io-class'] == 'IntMarkup'
    3.11 +        result = cls(container, name=name)
    3.12 +        separator = record.get('separator', ',')
    3.13 +        values = record['markup'].split(separator)
    3.14 +        assert len(values) == len(result.sorted_keys())
    3.15 +        for key, value in zip(result.sorted_keys(), values):
    3.16 +            if value:
    3.17 +                result[key] = int(value)
    3.18 +        return result
    3.19 +
    3.20 +    def to_record(self):
    3.21 +        def fmt(value):
    3.22 +            if value is None:
    3.23 +                return ""
    3.24 +            return str(value)
    3.25 +        values = [fmt(self.get(key)) for key in self.sorted_keys()]
    3.26 +        return {'markup': ','.join(values), 'io-class': 'IntMarkup'}
    3.27 +
    3.28  class SequenceNumberMarkup(base.SequenceMarkup):
    3.29  
    3.30      name = 'number'
     4.1 --- a/test/test_markups.py	Thu Jul 07 22:27:14 2011 +0400
     4.2 +++ b/test/test_markups.py	Thu Jul 07 22:32:21 2011 +0400
     4.3 @@ -1,3 +1,4 @@
     4.4 +from StringIO import StringIO
     4.5  from allpy import base
     4.6  from allpy import protein
     4.7  from allpy import markups
     4.8 @@ -34,4 +35,40 @@
     4.9      seq[2].my_markup = 4
    4.10      assert markup[seq[2]] == 4
    4.11  
    4.12 +def test_io():
    4.13 +    aln = (protein.Alignment().
    4.14 +        append_row_from_string('aseq-vence', name='a').
    4.15 +        append_row_from_string('ase-qret--', name='b')
    4.16 +    )
    4.17 +
    4.18 +    class MySequenceMarkup(base.SequenceMarkup, markups.IntMarkupMixin):
    4.19 +        pass
    4.20 +    class MyAlignmentMarkup(base.AlignmentMarkup, markups.IntMarkupMixin):
    4.21 +        pass
    4.22 +    markups.MySequenceMarkup = MySequenceMarkup
    4.23 +    markups.MyAlignmentMarkup = MyAlignmentMarkup
    4.24 +
    4.25 +    s = aln.sequences[0]
    4.26 +    c = aln.columns
    4.27 +    m1 = MySequenceMarkup(s, name='m1')
    4.28 +    m1[s[5]] = 5
    4.29 +    m1[s[3]] = 3
    4.30 +    m2 = MyAlignmentMarkup(aln, name='m2')
    4.31 +    m2[c[5]] = 5
    4.32 +    m2[c[6]] = 6
    4.33 +
    4.34 +    file = StringIO()
    4.35 +    aln.to_file(file, format='markup')
    4.36 +    import sys
    4.37 +    file.seek(0)
    4.38 +
    4.39 +    out = protein.Alignment().append_file(file, format='markup')
    4.40 +    s = out.sequences[0]
    4.41 +    c = out.columns
    4.42 +    print out.markups
    4.43 +    print s.markups
    4.44 +    print out.sequences[1].markups
    4.45 +    assert s[5].m1 == 5 and s[3].m1 == 3
    4.46 +    assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6
    4.47 +
    4.48  # vim: set ts=4 sts=4 sw=4 et: