allpy
changeset 1096:ca0b757452da
Added meta-processor `FixNamesAndOrder`; called from `Needle` and `Muscle` (closes #134)
Processors `Needle()` and `Muscle()` can now deal with all kinds of problematic
sequence names: names containing spaces or colons, empty names, and duplicate names.
For readers of #134: please note that this commit actually does exactly
__nothing__ with the `ExternalCommand` processor itself; rather, it adds a new
meta-processor that can be applied to it or to any other processor.
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Sun, 03 Jun 2012 12:09:41 +0400 |
parents | f05b08e13072 |
children | 8d5388980e57 |
files | allpy/processors.py test/test_realign.py |
diffstat | 2 files changed, 36 insertions(+), 30 deletions(-) [+] |
line diff
1.1 --- a/allpy/processors.py Sat Jun 02 22:33:50 2012 +0400 1.2 +++ b/allpy/processors.py Sun Jun 03 12:09:41 2012 +0400 1.3 @@ -4,6 +4,7 @@ 1.4 import os 1.5 from tempfile import NamedTemporaryFile 1.6 from util import Silence 1.7 +from copy import deepcopy 1.8 1.9 # 1.10 # External command processors: processors that pass their output to some 1.11 @@ -64,48 +65,36 @@ 1.12 otherwise, gaps are retained and muscle performs subalignment of 1.13 an existing alignment. 1.14 """ 1.15 - return FixOrdering(BypassEmpty(_Muscle())) 1.16 + return FixNamesAndOrder(BypassEmpty(_Muscle())) 1.17 1.18 class _Needle(ExternalCommand): 1.19 - """Realign block with needle.""" 1.20 + """Realign block with needle. 1.21 + 1.22 + WARNING! This MUST be wrapped in `FixNamesAndOrder()`. 1.23 + """ 1.24 1.25 def __init__(self, **kwargs): 1.26 if 'end' in ''.join(kwargs.keys()): 1.27 kwargs['endweight'] = 'Y' 1.28 - kwargs['asequence'] = '%%(infile)s:%(seq1)s' 1.29 - kwargs['bsequence'] = '%%(infile)s:%(seq2)s' 1.30 - kwargs['outfile'] = '%%(outfile)s' 1.31 + kwargs['asequence'] = '%(infile)s:1' 1.32 + kwargs['bsequence'] = '%(infile)s:2' 1.33 + kwargs['outfile'] = '%(outfile)s' 1.34 kwargs['aformat3'] = 'fasta' 1.35 args = ['-%s %s' % (key, value) for key, value in kwargs.items()] 1.36 cmdline = ' '.join(['needle', '-auto'] + args) 1.37 ExternalCommand.__init__(self, cmdline) 1.38 1.39 - def _check_sequence(self, sequence): 1.40 - assert sequence.name, "Needle does not allow empty sequence names" 1.41 - assert ":" not in sequence.name, \ 1.42 - "Needle does not allow : in sequence names" 1.43 - 1.44 def __call__(self, block): 1.45 assert len(block.sequences) == 2, "Needle needs exactly two sequences" 1.46 - self._check_sequence(block.sequences[0]) 1.47 - self._check_sequence(block.sequences[1]) 1.48 - try: 1.49 - orig_command = self.command 1.50 - self.command = self.command % { 1.51 - 'seq1': block.sequences[0].name, 1.52 - 'seq2': block.sequences[1].name, 1.53 - } 1.54 - with 
Silence(dup="stderr"): 1.55 - return ExternalCommand.__call__(self, block) 1.56 - finally: 1.57 - self.command = orig_command 1.58 + with Silence(dup="stderr"): 1.59 + return ExternalCommand.__call__(self, block) 1.60 1.61 def Needle(**kwargs): 1.62 """Realign block with Needle. 1.63 1.64 Arguments are the same as accepted by needle command line program. 1.65 """ 1.66 - return FixOrdering(_Needle(**kwargs)) 1.67 + return FixNamesAndOrder(_Needle(**kwargs)) 1.68 1.69 # 1.70 # Trivial processors (move all gaps to one side) 1.71 @@ -207,4 +196,24 @@ 1.72 def id_by_description(sequence): 1.73 return sequence.description 1.74 1.75 +class FixNamesAndOrder(object): 1.76 + """Metaprocessor: avoid problems with sequence names and sequence order. 1.77 + 1.78 + Temporarily replace sequence names with numbers while processor. Don't 1.79 + bother restoring sequence names/desceriptions, since the caller will only 1.80 + rely on sequence order. 1.81 + """ 1.82 + 1.83 + def __init__(self, processor): 1.84 + self.processor = processor 1.85 + 1.86 + def __call__(self, block): 1.87 + in_block = deepcopy(block) 1.88 + for n, sequence in enumerate(in_block.sequences): 1.89 + sequence.name = str(n) 1.90 + sequence.description = "" 1.91 + out_block = self.processor(in_block) 1.92 + out_block.sequences.sort(key=lambda s: int(s.name)) 1.93 + return out_block 1.94 + 1.95 # vim: set et ts=4 sts=4 sw=4:
2.1 --- a/test/test_realign.py Sat Jun 02 22:33:50 2012 +0400 2.2 +++ b/test/test_realign.py Sun Jun 03 12:09:41 2012 +0400 2.3 @@ -36,21 +36,18 @@ 2.4 append_row_from_string("rrrr", name="a"). 2.5 append_row_from_string("rrrr", name="a")) 2.6 2.7 -@raises(AssertionError) 2.8 def test_muscle_duplicate(): 2.9 - """`muscle` must be unable to deal with alignments with non-unique ids""" 2.10 + """`muscle` must be able to deal with alignments with non-unique ids""" 2.11 example2.realign(processors.Muscle()) 2.12 2.13 -@raises(AssertionError) 2.14 -def test_muscle_duplicate(): 2.15 - """`needle` must be unable to deal with alignments with non-unique ids""" 2.16 +def test_needle_duplicate(): 2.17 + """`needle` must be able to deal with alignments with non-unique ids""" 2.18 example2.realign(processors.Needle()) 2.19 2.20 example3 = (protein.Alignment(). 2.21 append_row_from_string("n", name="xxx"). 2.22 append_row_from_string("m", name="")) 2.23 2.24 -@raises(AssertionError) 2.25 def test_needle_empty(): 2.26 - """`needle` must be unable to deal with sequences with empty name""" 2.27 + """`needle` must be able to deal with sequences with empty name""" 2.28 example3.realign(processors.Needle())