allpy
changeset 864:cfcbd13f6761
Added fileio.BioPythonFile as a method to parse unknown file formats [closes #106]
Biopython can parse more formats than EMBOSS, but surprisingly, it cannot do msf.
Also, there is no way to tell from the current tests whether a test used Biopython
or EMBOSS for a particular IO task. This will likely be fixed in the 1.5.0 release
with the new fileio system. For now, Biopython has precedence over EMBOSS, so
an IO test of msf (which Biopython cannot handle) tests EMBOSS, while an IO test of Stockholm tests Biopython.
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Mon, 25 Jul 2011 14:40:41 +0400 |
parents | ddf85d0a8924 |
children | 3286970608fe |
files | allpy/fileio.py test/test_base.py |
diffstat | 2 files changed, 75 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- a/allpy/fileio.py Thu Jul 21 18:34:27 2011 +0400 1.2 +++ b/allpy/fileio.py Mon Jul 25 14:40:41 2011 +0400 1.3 @@ -3,6 +3,13 @@ 1.4 from tempfile import NamedTemporaryFile 1.5 import util 1.6 1.7 +bio_python = False 1.8 +try: 1.9 + from Bio import Seq, SeqRecord, Align, SeqIO, AlignIO, Alphabet 1.10 + bio_python = True 1.11 +except ImportError: 1.12 + pass 1.13 + 1.14 def get_markups_class(classname): 1.15 """This ugly helper is to avoid bad untimely import loops.""" 1.16 import markups 1.17 @@ -18,6 +25,8 @@ 1.18 elif format.startswith('markup:'): 1.19 subformat = format.split(':',1)[1] 1.20 return MarkupFile(file, format=subformat, **kw) 1.21 + elif bio_python and BioPythonFile.supports(format): 1.22 + return BioPythonFile(file, format, **kw) 1.23 else: 1.24 return EmbossFile(file, format, **kw) 1.25 1.26 @@ -237,6 +246,34 @@ 1.27 if for_what == 'write': 1.28 return string.strip().replace('_', '-').capitalize() 1.29 1.30 +class BioPythonFile(AlignmentFile): 1.31 + """Parser & writer for file formats supporte by Bio python.""" 1.32 + 1.33 + @staticmethod 1.34 + def supports(format): 1.35 + """Tell what formats this method supports.""" 1.36 + return ( 1.37 + format in AlignIO._FormatToWriter 1.38 + or format in SeqIO._FormatToWriter 1.39 + ) 1.40 + 1.41 + def write_strings(self, sequences): 1.42 + """Write sequences to file.""" 1.43 + aln = Align.MultipleSeqAlignment([ 1.44 + SeqRecord.SeqRecord( 1.45 + Seq.Seq(body, Alphabet.single_letter_alphabet), 1.46 + id=name, 1.47 + description=description 1.48 + ) 1.49 + for body, name, description in sequences 1.50 + ]) 1.51 + AlignIO.write(aln, self.file, self.format) 1.52 + 1.53 + def read_strings(self): 1.54 + """Read sequences from file.""" 1.55 + for seq in AlignIO.read(self.file, self.format): 1.56 + yield seq.id, seq.description, str(seq.seq) 1.57 + 1.58 class EmbossFile(AlignmentFile): 1.59 """Parser & writer for file formats supported by EMBOSS.""" 1.60
2.1 --- a/test/test_base.py Thu Jul 21 18:34:27 2011 +0400 2.2 +++ b/test/test_base.py Mon Jul 25 14:40:41 2011 +0400 2.3 @@ -122,4 +122,42 @@ 2.4 assert b.rows_as_strings() == ["A-C-D", "TGCGA"] 2.5 assert b.sequences[0].source == o.name 2.6 2.7 +def test_bio_io(): 2.8 + """Test Bio python IO. 2.9 + 2.10 + BioPython does not support msf, so it is not covered in many other tests. 2.11 + """ 2.12 + file = ( 2.13 + "# STOCKHOLM 1.0\n" 2.14 + "#=GS seqA AC seqA123\n" 2.15 + "#=GS seqA DR PDB; 1abc ; 1-42;\n" 2.16 + "#=GS seqB AC seqB345\n" 2.17 + "SeqA SEQVENCEHELLO\n" 2.18 + "#=GR SeqA SS -HHHHH---CHHH\n" 2.19 + "SeqB SI-VENCE--LLO\n" 2.20 + "#=GR SeqB SS X-HHHHH--HHHH\n" 2.21 + "#=GC SS_cons X-HHHHH--HHHH\n" 2.22 + "#=GC seq_cons Si.VENCE.eLLo\n" 2.23 + "//\n" 2.24 + ) 2.25 + 2.26 + o = StringIO() 2.27 + o.write(file) 2.28 + o.seek(0) 2.29 + 2.30 + aln = p.Alignment().append_file(o, "stockholm") 2.31 + assert len(aln.sequences) == 2 2.32 + assert len(aln.columns) == 13 2.33 + assert aln.sequences[1] not in aln.columns[2] 2.34 + assert aln.sequences[0][2].code1 == 'Q' 2.35 + 2.36 + o = StringIO() 2.37 + aln.to_file(o, format='stockholm') 2.38 + o.seek(0) 2.39 + for line in o: 2.40 + hd = line.strip().split()[0] 2.41 + assert hd in ('#', '#=GS', '#=GF', '#=GC', 'SeqA', 'SeqB', '//') 2.42 + o.seek(0) 2.43 + assert iter(o).next() == "# STOCKHOLM 1.0\n" 2.44 + 2.45 # vim: set et ts=4 sts=4 sw=4: