Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/rev/cfcbd13f6761
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 01:03:46 2012
Кодировка:
allpy: cfcbd13f6761

allpy

changeset 864:cfcbd13f6761

Added fileio.BioPythonFile as a method to parse unknown file formats [closes #106]. Biopython can parse more formats than EMBOSS, but, surprisingly, it cannot handle msf. Also, there is currently no way to tell from the tests whether a test used Biopython or EMBOSS for a particular IO task. This will likely be fixed in the 1.5.0 release with the new fileio system. For now, Biopython takes precedence over EMBOSS, so an IO test of msf tests EMBOSS, and an IO test of Stockholm tests Biopython.
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 25 Jul 2011 14:40:41 +0400
parents ddf85d0a8924
children 3286970608fe
files allpy/fileio.py test/test_base.py
diffstat 2 files changed, 75 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- a/allpy/fileio.py	Thu Jul 21 18:34:27 2011 +0400
     1.2 +++ b/allpy/fileio.py	Mon Jul 25 14:40:41 2011 +0400
     1.3 @@ -3,6 +3,13 @@
     1.4  from tempfile import NamedTemporaryFile
     1.5  import util
     1.6  
     1.7 +bio_python = False
     1.8 +try:
     1.9 +    from Bio import Seq, SeqRecord, Align, SeqIO, AlignIO, Alphabet
    1.10 +    bio_python = True
    1.11 +except ImportError:
    1.12 +    pass
    1.13 +
    1.14  def get_markups_class(classname):
    1.15      """This ugly helper is to avoid bad untimely import loops."""
    1.16      import markups
    1.17 @@ -18,6 +25,8 @@
    1.18          elif format.startswith('markup:'):
    1.19              subformat = format.split(':',1)[1]
    1.20              return MarkupFile(file, format=subformat, **kw)
    1.21 +        elif bio_python and BioPythonFile.supports(format):
    1.22 +            return BioPythonFile(file, format, **kw)
    1.23          else:
    1.24              return EmbossFile(file, format, **kw)
    1.25  
    1.26 @@ -237,6 +246,34 @@
    1.27          if for_what == 'write':
    1.28              return string.strip().replace('_', '-').capitalize()
    1.29  
    1.30 +class BioPythonFile(AlignmentFile):
    1.31 +    """Parser & writer for file formats supported by Bio python."""
    1.32 +
    1.33 +    @staticmethod
    1.34 +    def supports(format):
    1.35 +        """Tell what formats this method supports."""
    1.36 +        return (
    1.37 +            format in AlignIO._FormatToWriter
    1.38 +            or format in SeqIO._FormatToWriter
    1.39 +        )
    1.40 +
    1.41 +    def write_strings(self, sequences):
    1.42 +        """Write sequences to file."""
    1.43 +        aln = Align.MultipleSeqAlignment([
    1.44 +            SeqRecord.SeqRecord(
    1.45 +                Seq.Seq(body, Alphabet.single_letter_alphabet),
    1.46 +                id=name,
    1.47 +                description=description
    1.48 +            )
    1.49 +            for body, name, description in sequences
    1.50 +        ])
    1.51 +        AlignIO.write(aln, self.file, self.format)
    1.52 +
    1.53 +    def read_strings(self):
    1.54 +        """Read sequences from file."""
    1.55 +        for seq in AlignIO.read(self.file, self.format):
    1.56 +            yield seq.id, seq.description, str(seq.seq)
    1.57 +
    1.58  class EmbossFile(AlignmentFile):
    1.59      """Parser & writer for file formats supported by EMBOSS."""
    1.60  
     2.1 --- a/test/test_base.py	Thu Jul 21 18:34:27 2011 +0400
     2.2 +++ b/test/test_base.py	Mon Jul 25 14:40:41 2011 +0400
     2.3 @@ -122,4 +122,42 @@
     2.4      assert b.rows_as_strings() == ["A-C-D", "TGCGA"]
     2.5      assert b.sequences[0].source == o.name
     2.6  
     2.7 +def test_bio_io():
     2.8 +    """Test Bio python IO.
     2.9 +
    2.10 +    BioPython does not support msf, so it is not covered in many other tests.
    2.11 +    """
    2.12 +    file = (
    2.13 +        "# STOCKHOLM 1.0\n"
    2.14 +        "#=GS seqA  AC seqA123\n"
    2.15 +        "#=GS seqA  DR PDB; 1abc ; 1-42;\n"
    2.16 +        "#=GS seqB  AC seqB345\n"
    2.17 +        "SeqA             SEQVENCEHELLO\n"
    2.18 +        "#=GR SeqA  SS    -HHHHH---CHHH\n"
    2.19 +        "SeqB             SI-VENCE--LLO\n"
    2.20 +        "#=GR SeqB  SS    X-HHHHH--HHHH\n"
    2.21 +        "#=GC SS_cons     X-HHHHH--HHHH\n"
    2.22 +        "#=GC seq_cons    Si.VENCE.eLLo\n"
    2.23 +        "//\n"
    2.24 +    )
    2.25 +
    2.26 +    o = StringIO()
    2.27 +    o.write(file)
    2.28 +    o.seek(0)
    2.29 +
    2.30 +    aln = p.Alignment().append_file(o, "stockholm")
    2.31 +    assert len(aln.sequences) == 2
    2.32 +    assert len(aln.columns) == 13
    2.33 +    assert aln.sequences[1] not in aln.columns[2]
    2.34 +    assert aln.sequences[0][2].code1 == 'Q'
    2.35 +
    2.36 +    o = StringIO()
    2.37 +    aln.to_file(o, format='stockholm')
    2.38 +    o.seek(0)
    2.39 +    for line in o:
    2.40 +        hd = line.strip().split()[0]
    2.41 +        assert hd in ('#', '#=GS', '#=GF', '#=GC', 'SeqA', 'SeqB', '//')
    2.42 +    o.seek(0)
    2.43 +    assert iter(o).next() == "# STOCKHOLM 1.0\n"
    2.44 +
    2.45  # vim: set et ts=4 sts=4 sw=4: