allpy

changeset 249:117b03ee9e8f
allpy.base: add alignment (old version)
author: boris <bnagaev@gmail.com>
date: Sat, 04 Dec 2010 00:19:17 +0300
parents: 90a75f2fe7df
children: fc6418e32e3c
files: allpy/base.py
diffstat: 1 files changed, 213 insertions(+), 5 deletions(-) [+]
[-]

allpy/base.py 218 allpy/base.py 218
allpy/base.py 218
     1.1 --- a/allpy/base.py	Sat Dec 04 00:18:50 2010 +0300
     1.2 +++ b/allpy/base.py	Sat Dec 04 00:19:17 2010 +0300
     1.3 @@ -1,4 +1,5 @@
     1.4  import _monomer
     1.5 +from fasta import save_fasta
     1.6  
     1.7  class Sequence(list):
     1.8      """ Sequence of Monomers
     1.9 @@ -14,16 +15,223 @@
    1.10  MonomerType = _monomer.MonomerType
    1.11  Monomer = _monomer.Monomer
    1.12  
    1.13 -class Alignment(list):
    1.14 +class Alignment(dict):
    1.15 +    """ Alignment
    1.16 +    
    1.17 +        {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
    1.18 +        keys are the Sequence objects, values are the lists, which
    1.19 +        contain monomers of those sequences or None for gaps in the
    1.20 +        corresponding sequence of alignment
    1.21 +    """
    1.22 +    # _sequences -- list of Sequence objects. Sequences don't contain gaps
    1.23 +    #  - see sequence.py module
    1.24 +
    1.25 +    def __init__(self, *args):
    1.26 +        """overloaded constructor
    1.27 +
    1.28 +        Alignment() -> new empty Alignment
    1.29 +        Alignment(sequences, body) -> new Alignment with sequences and
    1.30 +            body initialized from arguments
    1.31 +        Alignment(fasta_file) -> new Alignment, read body and sequences
    1.32 +             from fasta file 
    1.33 +
    1.34 +        """
    1.35 +        if len(args)>1:#overloaded constructor
    1.36 +            self.sequences=args[0]
    1.37 +            self.body=args[1]
    1.38 +        elif len(args)==0:
    1.39 +            self.sequences=[]
    1.40 +            self.body={}
    1.41 +        else:
    1.42 +            self.sequences, self.body = Alignment.from_fasta(args[0])
    1.43 +
    1.44 +    def length(self):
    1.45 +        """ Returns width, ie length of each sequence with gaps """
    1.46 +        return max([len(line) for line in self.body.values()])
    1.47 +
    1.48 +    def height(self):
    1.49 +        """ The number of sequences in alignment (it's thickness). """
    1.50 +        return len(self.body)
    1.51 +
    1.52 +    def identity(self):
    1.53 +        """ Calculate the identity of alignment positions for colouring.
    1.54 +
    1.55 +        For every (row, column) in alignment the percentage of the exactly
    1.56 +        same residue in the same column in the alignment is calculated.
    1.57 +        The data structure is just like the Alignment.body, but istead of 
    1.58 +        monomers it contains float percentages.
    1.59 +        """
    1.60 +        # Oh, God, that's awful! Absolutely not understandable.
    1.61 +        # First, calculate percentages of amino acids in every column
    1.62 +        contribution = 1.0 / len(self.sequences)
    1.63 +        all_columns = []
    1.64 +        for position in range(len(self)):
    1.65 +            column_percentage = {}
    1.66 +            for seq in self.body:
    1.67 +                if self.body[seq][position] is not None:
    1.68 +                    aa = self.body[seq][position].code
    1.69 +                else:
    1.70 +                    aa = None
    1.71 +                if aa in allpy.data.amino_acids:
    1.72 +                    if aa in column_percentage.keys():
    1.73 +                        column_percentage[aa] += contribution
    1.74 +                    else:
    1.75 +                        column_percentage[aa] = contribution
    1.76 +            all_columns.append(column_percentage)
    1.77 +        # Second, map these percentages onto the alignment
    1.78 +        self.identity_percentages = {}
    1.79 +        for seq in self.sequences:
    1.80 +            self.identity_percentages[seq] = []
    1.81 +        for seq in self.identity_percentages:
    1.82 +            line = self.identity_percentages[seq]
    1.83 +            for position in range(len(self)):
    1.84 +                if self.body[seq][position] is not None:
    1.85 +                    aa = self.body[seq][position].code
    1.86 +                else:
    1.87 +                    aa = None
    1.88 +                line.append(all_columns[position].get(aa))
    1.89 +        return self.identity_percentages
    1.90 +
    1.91 +    @staticmethod
    1.92 +    def from_fasta(file, monomer_kind=AminoAcidType):
    1.93 +        """ Import data from fasta file
    1.94 +        
    1.95 +        monomer_kind is class, inherited from MonomerType
    1.96 +        
    1.97 +        >>> import alignment
    1.98 +        >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))       
    1.99 +        """
   1.100 +        import re
   1.101 +
   1.102 +        sequences = []
   1.103 +        body = {}
   1.104 +
   1.105 +        raw_sequences = file.read().split(">")
   1.106 +        if len(raw_sequences) <= 1:
   1.107 +            raise Exception("Wrong format of fasta-file %s" % file.name)
   1.108 +        
   1.109 +        raw_sequences = raw_sequences[1:] #ignore everything before the first >
   1.110 +        for raw in raw_sequences:
   1.111 +            parsed_raw_sequence = raw.split("\n")
   1.112 +            parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
   1.113 +            name_and_description = parsed_raw_sequence[0]
   1.114 +            name_and_description = name_and_description.split(" ",1)
   1.115 +            if len(name_and_description) == 2:
   1.116 +                name, description = name_and_description
   1.117 +            elif len(name_and_description) == 1: 
   1.118 +                #if there is description
   1.119 +                name = name_and_description[0]
   1.120 +                description = ''
   1.121 +            else:
   1.122 +                raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
   1.123 +                {'name': name, 'file': file.name})
   1.124 +            
   1.125 +            if len(parsed_raw_sequence) <= 1:
   1.126 +                raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
   1.127 +                {'name': name, 'file': file.name})
   1.128 +            string = ""
   1.129 +            for piece in parsed_raw_sequence[1:]:
   1.130 +                piece_without_whitespace_chars = re.sub("\s", "", piece)
   1.131 +                string += piece_without_whitespace_chars
   1.132 +            monomers = [] #convert into Monomer objects
   1.133 +            body_list = [] #create the respective list in body dict
   1.134 +            for current_monomer in string:
   1.135 +                if current_monomer not in ["-", ".", "~"]:
   1.136 +                    monomers.append(monomer_kind.from_code1(current_monomer).instance())
   1.137 +                    body_list.append(monomers[-1])
   1.138 +                else:
   1.139 +                    body_list.append(None)
   1.140 +            s = sequence.Sequence(monomers, name, description)
   1.141 +            sequences.append(s)
   1.142 +            body[s] = body_list
   1.143 +        return sequences, body
   1.144 +    
   1.145 +    @staticmethod
   1.146 +    def from_sequences(*sequences):
   1.147 +        """  Constructs new alignment from sequences
   1.148 +        
   1.149 +        Add None's to right end to make equal lengthes of alignment sequences 
   1.150 +        """
   1.151 +        alignment = Alignment()
   1.152 +        alignment.sequences = sequences
   1.153 +        max_length = max(len(sequence) for sequence in sequences)
   1.154 +        for sequence in sequences:
   1.155 +            gaps_count = max_length - len(sequence)
   1.156 +            alignment.body[sequence] = sequence.monomers + [None] * gaps_count
   1.157 +        return alignment
   1.158 +    
   1.159 +    def save_fasta(self, out_file, long_line=70, gap='-'):
   1.160 +        """ Saves alignment to given file
   1.161 +        
   1.162 +        Splits long lines to substrings of length=long_line
   1.163 +        To prevent this, set long_line=None 
   1.164 +        """
   1.165 +        block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
   1.166 +        
   1.167 +    def muscle_align(self):
   1.168 +        """ Simple align ths alignment using sequences (muscle)
   1.169 +        
   1.170 +        uses old Monomers and Sequences objects
   1.171 +        """
   1.172 +        tmp_file = NamedTemporaryFile(delete=False)
   1.173 +        self.save_fasta(tmp_file)
   1.174 +        tmp_file.close()
   1.175 +        os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
   1.176 +        sequences, body = Alignment.from_fasta(open(tmp_file.name))
   1.177 +        for sequence in self.sequences:
   1.178 +            try:
   1.179 +                new_sequence = [i for i in sequences if sequence==i][0]
   1.180 +            except:
   1.181 +                raise Exception("Align: Cann't find sequence %s in muscle output" % \
   1.182 +                sequence.name)
   1.183 +            old_monomers = iter(sequence.monomers)
   1.184 +            self.body[sequence] = []
   1.185 +            for monomer in body[new_sequence]:
   1.186 +                if not monomer:
   1.187 +                    self.body[sequence].append(monomer)
   1.188 +                else:
   1.189 +                    old_monomer = old_monomers.next()
   1.190 +                    if monomer != old_monomer:
   1.191 +                        raise Exception("Align: alignment errors")
   1.192 +                    self.body[sequence].append(old_monomer)
   1.193 +        os.unlink(tmp_file.name)
   1.194 +        
   1.195 +    def column(self, sequence=None, sequences=None, original=None):
   1.196 +        """ returns list of columns of alignment
   1.197 +        
   1.198 +        sequence or sequences:
   1.199 +            if sequence is given, then column is (original_monomer, monomer)
   1.200 +            if sequences is given, then column is (original_monomer, {sequence: monomer}) 
   1.201 +            if both of them are given, it is an error
   1.202 +        original (Sequence type):
   1.203 +            if given, this filters only columns represented by original sequence
   1.204 +        """
   1.205 +        if sequence and sequences:
   1.206 +            raise Exception("Wrong usage. read help")
   1.207 +        indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
   1.208 +        alignment = self.body.items()
   1.209 +        alignment.sort(key=lambda i: indexes[i[0]])
   1.210 +        alignment = [monomers for seq, monomers in alignment]
   1.211 +        for column in zip(*alignment):
   1.212 +            if not original or column[indexes[original]]:
   1.213 +                if sequence:
   1.214 +                    yield (column[indexes[original]], column[indexes[sequence]])
   1.215 +                else:
   1.216 +                    yield (column[indexes[original]], 
   1.217 +                    dict([(s, column[indexes[s]]) for s in sequences]))
   1.218 +    
   1.219 +    def secstr(self, sequence, pdb_chain, gap='-'):
   1.220 +        """ Returns string representing secondary structure """
   1.221 +        return ''.join([
   1.222 +        (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap) 
   1.223 +        for m in self.body[sequence]])
   1.224 +
   1.225 +class Block(object):
   1.226      """ """
   1.227      pass
   1.228  
   1.229 -class Block(list):
   1.230 -    """ """
   1.231 -    pass
   1.232  
   1.233  
   1.234  
   1.235  
   1.236  
   1.237 -