allpy
changeset 89:c1677c874e36
try to add utility to make alignments
dirty
maybe use needle or muscle?
author | boris <bnagaev@gmail.com> |
---|---|
date | Wed, 20 Oct 2010 00:56:42 +0400 |
parents | 847f54face57 |
children | e2590c5179f6 |
files | lib/align.py |
diffstat | 1 files changed, 233 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/lib/align.py Wed Oct 20 00:56:42 2010 +0400 1.3 @@ -0,0 +1,233 @@ 1.4 + 1.5 +""" 1.6 +Usage: 1.7 + 1.8 +seq_in=[] 1.9 +seq_in.append("SSNAKIDQLSSDAQTANAKADQASNDANAARSDAQAAKDDAARANQRLDNM") 1.10 +seq_in.append("NAKADQASSDAQTANAKADQASNDANAARSDAQAAKDDAARANQRADNAA") 1.11 +l=AlignmentSeq(seq_in) 1.12 +for t in l.new_sequences: 1.13 + print t 1.14 + 1.15 +Why not to reimplement it? 1.16 +Lava flow 1.17 +http://www.insidecpp.ru/antipatterns/lava_flow/ 1.18 +""" 1.19 + 1.20 +from allpy_data.blossum62 import matrix, gaps 1.21 + 1.22 +class AlignmentSeq(object): 1.23 + 1.24 + def __init__(self, sequences): 1.25 + """ 1.26 + sequences are strings 1.27 + 1.28 + new_sequences -- aligned sequences 1.29 + connections -- list of dicts like { [new_atom_id] => old_atom_id } 1.30 + common -- list of [list_of_letters, list_of_chain_numbers] 1.31 + """ 1.32 + self.old_sequences = [] 1.33 + self.new_sequences = [] 1.34 + self.connections = [] 1.35 + for seq in sequences: 1.36 + self.old_sequences.append(seq.upper().replace('-','').replace(' ','')) 1.37 + self.common=[] 1.38 + for i in xrange(len(self.old_sequences)): 1.39 + self.unite(i) 1.40 + for i in xrange(len(self.old_sequences)): 1.41 + self.lining(i) 1.42 + 1.43 + @staticmethod 1.44 + def cost(a1, a2): 1.45 + """ 1.46 + returns cost of aligning of aminoacids a1 and a2 1.47 + """ 1.48 + a1 = a1.upper() 1.49 + a2 = a2.upper() 1.50 + if a1 in matrix: 1.51 + if a2 in matrix[a1]: 1.52 + return matrix[a1][a2] 1.53 + return gaps[0] 1.54 + 1.55 + @staticmethod 1.56 + def gap_cost(self, gaps_count): 1.57 + """ 1.58 + returns penalty of appending (to the right end) one more gap 1.59 + gaps_count -- real number of gaps 1.60 + """ 1.61 + if gaps_count >= len(gaps): 1.62 + return gaps[(len(gaps)-1)] 1.63 + else: 1.64 + return gaps[gaps_count] 1.65 + 1.66 + # TO BE DONE 1.67 + 1.68 + #def unite(self, chainN): 1.69 + #""" 1.70 + #alignment list creation 1.71 + #chainN - chain number 1.72 + #""" 1.73 + #str1 = self.old_sequences[chainN] 1.74 + #len1 = len(str1) 1.75 + 1.76 + #if not self.common: 1.77 + #i = 0 1.78 + #while i < len1: 1.79 + #aminoacids = [str1[i]] 1.80 + #chains = [chainN] 1.81 + #self.common.append([aminoacids,chains]) 1.82 + #i += 1 1.83 + #return 1.84 + 1.85 + #len2 = len(self.common) 1.86 + 1.87 + #d = [] 1.88 + #tip_from = [] 1.89 + #already_gaps = [] 1.90 + 1.91 + #for i in xrange(len1 + 1): 1.92 + #d.append([]) 1.93 + #already_gaps.append([]) 1.94 + #tip_from.append([]) 1.95 + #for j in xrange(len2 + 1): 1.96 + #d[i].append(0) 1.97 + #already_gaps[i].append([0, 0]) 1.98 + #tip_from[i].append(0) 1.99 + 1.100 + #for i in xrange(1, len1 + 1): 1.101 + #for j in xrange(1, len2 + 1): 1.102 + 1.103 + #costs = [] 1.104 + #for A in self.common[j - 1][0]: 1.105 + #costs.append(self.cost(str1[i - 1], A)) 1.106 + #cost = max(costs) 1.107 + 1.108 + #insertion = d[i - 1][j] 1.109 + #if j != len2: # non-end gap 1.110 + #insertion += self.gap_cost(already_gaps[i - 1][j][1]) 1.111 + 1.112 + #deletion = d[i][j - 1] 1.113 + #if i != len1: # non-end gap 1.114 + #deletion += self.gap_cost(already_gaps[i][j - 1][0]) 1.115 + 1.116 + #substitution = d[i - 1][j - 1] + cost 1.117 + #max_way = max(insertion, deletion, substitution) 1.118 + #d[i][j] = max_way 1.119 + 1.120 + #if max_way == substitution: 1.121 + #tip = 3 1.122 + #if max_way == deletion: 1.123 + #tip = 2 1.124 + #if max_way == insertion: 1.125 + #tip = 1 1.126 + 1.127 + #if tip == 1: # insertion 1.128 + #already_gaps[i][j]=[0, (already_gaps[i-1][j][1]+1) ] 1.129 + #if tip == 2: # deletion 1.130 + #already_gaps[i][j]=[ (already_gaps[i][j-1][0]+1), 0 ] 1.131 + #if tip == 3: # substitution 1.132 + #already_gaps[i][j]=[ 0, 0 ] 1.133 + 1.134 + #tip_from[i][j] = tip 1.135 + 1.136 + #i = len1 1.137 + #j = len2 1.138 + 1.139 + #common = [] 1.140 + 1.141 + #while i > 0 or j > 0: 1.142 + #tip = tip_from[i][j] 1.143 + 1.144 + #if tip == 1 or j == 0 and i > 0: 1.145 + 1.146 + #aminoacids = [(str1[i-1])] 1.147 + #chains = [chainN] 1.148 + 1.149 + #common.append([aminoacids, chains]) 1.150 + 1.151 + #i -= 1 1.152 + 1.153 + 1.154 + 1.155 + #if tip==2 or (i==0 and j>0): 1.156 + 1.157 + #common.append(self.common[j-1]) 1.158 + #j-=1 1.159 + 1.160 + 1.161 + #if (tip==3): 1.162 + 1.163 + #chains=self.common[j-1][1] 1.164 + #chains.append(chainN) 1.165 + 1.166 + #aminoacids=self.common[j-1][0] 1.167 + 1.168 + #if (not aminoacids.count(str1[i-1])): 1.169 + #aminoacids.append(str1[i-1]) 1.170 + 1.171 + #common.append([aminoacids,chains]) 1.172 + 1.173 + #i-=1 1.174 + #j-=1 1.175 + 1.176 + 1.177 + 1.178 + #common.reverse() 1.179 + 1.180 + #self.common=common 1.181 + 1.182 + 1.183 + 1.184 + 1.185 + 1.186 + 1.187 + 1.188 + 1.189 + 1.190 + 1.191 + 1.192 + 1.193 + 1.194 + 1.195 + 1.196 + 1.197 + 1.198 + 1.199 + 1.200 + 1.201 + 1.202 + 1.203 + 1.204 + #def lining(self,chainN): 1.205 + 1.206 + 1.207 + #""" 1.208 + #метод создает новую выровненную последовательность 1.209 + #в self.new_sequences 1.210 + 1.211 + #chainN - номер цепи 1.212 + #""" 1.213 + 1.214 + #str1=self.old_sequences[chainN] 1.215 + #len1=len(str1) 1.216 + 1.217 + #len2=len(self.common) 1.218 + 1.219 + 1.220 + #new_seq='' 1.221 + #position_in_old=0 1.222 + 1.223 + #for common_1 in self.common: 1.224 + #if (common_1[1].count(chainN)): 1.225 + #new_seq = new_seq + str1[position_in_old] 1.226 + #position_in_old += 1 1.227 + #else: 1.228 + #new_seq = new_seq + '-' 1.229 + 1.230 + #self.new_sequences.append(new_seq) 1.231 + 1.232 + 1.233 + 1.234 + 1.235 + 1.236 +