allpy

changeset 89:c1677c874e36
try to add utility to make alignments dirty maybe use needle or muscle?
author: boris <bnagaev@gmail.com>
date: Wed, 20 Oct 2010 00:56:42 +0400
parents: 847f54face57
children: e2590c5179f6
files: lib/align.py
diffstat: 1 files changed, 233 insertions(+), 0 deletions(-) [+]
[-]

lib/align.py 233 lib/align.py 233
lib/align.py 233
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/lib/align.py	Wed Oct 20 00:56:42 2010 +0400
     1.3 @@ -0,0 +1,233 @@
     1.4 +
     1.5 +"""
     1.6 +Usage:
     1.7 +
     1.8 +seq_in=[]
     1.9 +seq_in.append("SSNAKIDQLSSDAQTANAKADQASNDANAARSDAQAAKDDAARANQRLDNM")
    1.10 +seq_in.append("NAKADQASSDAQTANAKADQASNDANAARSDAQAAKDDAARANQRADNAA")
    1.11 +l=AlignmentSeq(seq_in)
    1.12 +for t in l.new_sequences:
    1.13 +    print t
    1.14 +
    1.15 +Why not to reimplement it?
    1.16 +Lava flow
    1.17 +http://www.insidecpp.ru/antipatterns/lava_flow/
    1.18 +"""
    1.19 +
    1.20 +from allpy_data.blossum62 import matrix, gaps
    1.21 +
    1.22 +class AlignmentSeq(object):
    1.23 +
    1.24 +    def __init__(self, sequences):
    1.25 +        """
    1.26 +        sequences are strings
    1.27 +        
    1.28 +        new_sequences -- aligned sequences
    1.29 +        connections -- list of dicts like { [new_atom_id] => old_atom_id }
    1.30 +        common -- list of [list_of_letters, list_of_chain_numbers]
    1.31 +        """
    1.32 +        self.old_sequences = []
    1.33 +        self.new_sequences = []
    1.34 +        self.connections = []
    1.35 +        for seq in sequences:
    1.36 +            self.old_sequences.append(seq.upper().replace('-','').replace(' ',''))
    1.37 +        self.common=[]
    1.38 +        for i in xrange(len(self.old_sequences)):
    1.39 +             self.unite(i)
    1.40 +        for i in xrange(len(self.old_sequences)):
    1.41 +             self.lining(i)          
    1.42 +    
    1.43 +    @staticmethod
    1.44 +    def cost(a1, a2):
    1.45 +        """
    1.46 +        returns cost of aligning of aminoacids a1 and a2
    1.47 +        """
    1.48 +        a1 = a1.upper()
    1.49 +        a2 = a2.upper()
    1.50 +        if a1 in matrix:
    1.51 +            if a2 in matrix[a1]:
    1.52 +                return matrix[a1][a2]
    1.53 +        return gaps[0]
    1.54 +   
    1.55 +    @staticmethod
    1.56 +    def gap_cost(self, gaps_count):
    1.57 +        """
    1.58 +        returns penalty of appending (to the right end) one more gap
    1.59 +        gaps_count -- real number of gaps
    1.60 +        """
    1.61 +        if gaps_count >= len(gaps):
    1.62 +            return gaps[(len(gaps)-1)]
    1.63 +        else:
    1.64 +            return gaps[gaps_count]
    1.65 +    
    1.66 +    # TO BE DONE
    1.67 +    
    1.68 +    #def unite(self, chainN):
    1.69 +        #"""
    1.70 +        #alignment list creation
    1.71 +        #chainN - chain number
    1.72 +        #"""
    1.73 +        #str1 = self.old_sequences[chainN]
    1.74 +        #len1 = len(str1)
    1.75 +        
    1.76 +        #if not self.common:
    1.77 +            #i = 0
    1.78 +            #while i < len1:
    1.79 +                #aminoacids = [str1[i]]
    1.80 +                #chains = [chainN]
    1.81 +                #self.common.append([aminoacids,chains])
    1.82 +                #i += 1
    1.83 +            #return
    1.84 +
    1.85 +        #len2 = len(self.common)
    1.86 +
    1.87 +        #d = []
    1.88 +        #tip_from = []
    1.89 +        #already_gaps = []
    1.90 +        
    1.91 +        #for i in xrange(len1 + 1):
    1.92 +            #d.append([])
    1.93 +            #already_gaps.append([])
    1.94 +            #tip_from.append([])
    1.95 +            #for j in xrange(len2 + 1):
    1.96 +                #d[i].append(0)
    1.97 +                #already_gaps[i].append([0, 0])
    1.98 +                #tip_from[i].append(0)
    1.99 +
   1.100 +        #for i in xrange(1, len1 + 1):
   1.101 +            #for j in xrange(1, len2 + 1):
   1.102 +
   1.103 +                #costs = []
   1.104 +                #for A in self.common[j - 1][0]:
   1.105 +                    #costs.append(self.cost(str1[i - 1], A))
   1.106 +                #cost = max(costs)
   1.107 +
   1.108 +                #insertion = d[i - 1][j]
   1.109 +                #if j != len2: # non-end gap
   1.110 +                    #insertion += self.gap_cost(already_gaps[i - 1][j][1])
   1.111 +                    
   1.112 +                #deletion  = d[i][j - 1]
   1.113 +                #if i != len1: # non-end gap
   1.114 +                    #deletion += self.gap_cost(already_gaps[i][j - 1][0])
   1.115 +
   1.116 +                #substitution = d[i - 1][j - 1] + cost
   1.117 +                #max_way = max(insertion, deletion, substitution)
   1.118 +                #d[i][j] = max_way
   1.119 +                
   1.120 +                #if max_way == substitution:
   1.121 +                    #tip = 3   
   1.122 +                #if max_way == deletion:
   1.123 +                    #tip = 2                
   1.124 +                #if max_way == insertion:
   1.125 +                    #tip = 1
   1.126 +
   1.127 +                #if tip == 1:                             # insertion
   1.128 +                    #already_gaps[i][j]=[0, (already_gaps[i-1][j][1]+1) ]
   1.129 +                #if tip == 2:                             # deletion
   1.130 +                    #already_gaps[i][j]=[ (already_gaps[i][j-1][0]+1), 0 ]
   1.131 +                #if tip == 3:                             # substitution
   1.132 +                    #already_gaps[i][j]=[ 0, 0 ]
   1.133 +                
   1.134 +                #tip_from[i][j] = tip
   1.135 +
   1.136 +        #i = len1
   1.137 +        #j = len2
   1.138 +
   1.139 +        #common = []
   1.140 +
   1.141 +        #while i > 0 or j > 0:
   1.142 +            #tip = tip_from[i][j]
   1.143 +            
   1.144 +            #if tip == 1 or j == 0 and i > 0:
   1.145 +
   1.146 +                #aminoacids = [(str1[i-1])]
   1.147 +                #chains = [chainN]
   1.148 +                
   1.149 +                #common.append([aminoacids, chains])
   1.150 +                
   1.151 +                #i -= 1
   1.152 +
   1.153 +
   1.154 +                
   1.155 +            #if tip==2 or (i==0 and j>0):
   1.156 +                
   1.157 +                #common.append(self.common[j-1])
   1.158 +                #j-=1
   1.159 +
   1.160 +                
   1.161 +            #if (tip==3):
   1.162 +                                
   1.163 +                #chains=self.common[j-1][1]
   1.164 +                #chains.append(chainN)
   1.165 +                
   1.166 +                #aminoacids=self.common[j-1][0]
   1.167 +                
   1.168 +                #if (not aminoacids.count(str1[i-1])):
   1.169 +                    #aminoacids.append(str1[i-1])
   1.170 +
   1.171 +                #common.append([aminoacids,chains])
   1.172 +                    
   1.173 +                #i-=1
   1.174 +                #j-=1
   1.175 +
   1.176 +          
   1.177 +            
   1.178 +        #common.reverse()
   1.179 +        
   1.180 +        #self.common=common
   1.181 +
   1.182 +
   1.183 +
   1.184 +
   1.185 +
   1.186 +
   1.187 +
   1.188 +
   1.189 +
   1.190 +
   1.191 +
   1.192 +
   1.193 +
   1.194 +
   1.195 +
   1.196 +
   1.197 +
   1.198 +
   1.199 +
   1.200 +
   1.201 +
   1.202 +
   1.203 +
   1.204 +    #def lining(self,chainN):
   1.205 +
   1.206 +
   1.207 +        #"""
   1.208 +        #метод создает новую выровненную последовательность
   1.209 +        #в self.new_sequences
   1.210 +
   1.211 +        #chainN - номер цепи    
   1.212 +        #"""
   1.213 +
   1.214 +        #str1=self.old_sequences[chainN]
   1.215 +        #len1=len(str1)
   1.216 +        
   1.217 +        #len2=len(self.common)
   1.218 +
   1.219 +
   1.220 +        #new_seq=''
   1.221 +        #position_in_old=0
   1.222 +
   1.223 +        #for common_1 in self.common:
   1.224 +            #if (common_1[1].count(chainN)):
   1.225 +                #new_seq = new_seq + str1[position_in_old]
   1.226 +                #position_in_old += 1
   1.227 +            #else:
   1.228 +                #new_seq = new_seq + '-'
   1.229 +
   1.230 +        #self.new_sequences.append(new_seq)
   1.231 +
   1.232 +
   1.233 +
   1.234 +
   1.235 +
   1.236 +