allpy
changeset 249:117b03ee9e8f
allpy.base: add alignment (old version)
author | boris <bnagaev@gmail.com> |
---|---|
date | Sat, 04 Dec 2010 00:19:17 +0300 |
parents | 90a75f2fe7df |
children | fc6418e32e3c |
files | allpy/base.py |
diffstat | 1 files changed, 213 insertions(+), 5 deletions(-) [+] |
line diff
1.1 --- a/allpy/base.py Sat Dec 04 00:18:50 2010 +0300 1.2 +++ b/allpy/base.py Sat Dec 04 00:19:17 2010 +0300 1.3 @@ -1,4 +1,5 @@ 1.4 import _monomer 1.5 +from fasta import save_fasta 1.6 1.7 class Sequence(list): 1.8 """ Sequence of Monomers 1.9 @@ -14,16 +15,223 @@ 1.10 MonomerType = _monomer.MonomerType 1.11 Monomer = _monomer.Monomer 1.12 1.13 -class Alignment(list): 1.14 +class Alignment(dict): 1.15 + """ Alignment 1.16 + 1.17 + {<Sequence object>:[<Monomer object>,None,<Monomer object>]} 1.18 + keys are the Sequence objects, values are the lists, which 1.19 + contain monomers of those sequences or None for gaps in the 1.20 + corresponding sequence of alignment 1.21 + """ 1.22 + # _sequences -- list of Sequence objects. Sequences don't contain gaps 1.23 + # - see sequence.py module 1.24 + 1.25 + def __init__(self, *args): 1.26 + """overloaded constructor 1.27 + 1.28 + Alignment() -> new empty Alignment 1.29 + Alignment(sequences, body) -> new Alignment with sequences and 1.30 + body initialized from arguments 1.31 + Alignment(fasta_file) -> new Alignment, read body and sequences 1.32 + from fasta file 1.33 + 1.34 + """ 1.35 + if len(args)>1:#overloaded constructor 1.36 + self.sequences=args[0] 1.37 + self.body=args[1] 1.38 + elif len(args)==0: 1.39 + self.sequences=[] 1.40 + self.body={} 1.41 + else: 1.42 + self.sequences, self.body = Alignment.from_fasta(args[0]) 1.43 + 1.44 + def length(self): 1.45 + """ Returns width, ie length of each sequence with gaps """ 1.46 + return max([len(line) for line in self.body.values()]) 1.47 + 1.48 + def height(self): 1.49 + """ The number of sequences in alignment (it's thickness). """ 1.50 + return len(self.body) 1.51 + 1.52 + def identity(self): 1.53 + """ Calculate the identity of alignment positions for colouring. 1.54 + 1.55 + For every (row, column) in alignment the percentage of the exactly 1.56 + same residue in the same column in the alignment is calculated. 1.57 + The data structure is just like the Alignment.body, but istead of 1.58 + monomers it contains float percentages. 1.59 + """ 1.60 + # Oh, God, that's awful! Absolutely not understandable. 1.61 + # First, calculate percentages of amino acids in every column 1.62 + contribution = 1.0 / len(self.sequences) 1.63 + all_columns = [] 1.64 + for position in range(len(self)): 1.65 + column_percentage = {} 1.66 + for seq in self.body: 1.67 + if self.body[seq][position] is not None: 1.68 + aa = self.body[seq][position].code 1.69 + else: 1.70 + aa = None 1.71 + if aa in allpy.data.amino_acids: 1.72 + if aa in column_percentage.keys(): 1.73 + column_percentage[aa] += contribution 1.74 + else: 1.75 + column_percentage[aa] = contribution 1.76 + all_columns.append(column_percentage) 1.77 + # Second, map these percentages onto the alignment 1.78 + self.identity_percentages = {} 1.79 + for seq in self.sequences: 1.80 + self.identity_percentages[seq] = [] 1.81 + for seq in self.identity_percentages: 1.82 + line = self.identity_percentages[seq] 1.83 + for position in range(len(self)): 1.84 + if self.body[seq][position] is not None: 1.85 + aa = self.body[seq][position].code 1.86 + else: 1.87 + aa = None 1.88 + line.append(all_columns[position].get(aa)) 1.89 + return self.identity_percentages 1.90 + 1.91 + @staticmethod 1.92 + def from_fasta(file, monomer_kind=AminoAcidType): 1.93 + """ Import data from fasta file 1.94 + 1.95 + monomer_kind is class, inherited from MonomerType 1.96 + 1.97 + >>> import alignment 1.98 + >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta")) 1.99 + """ 1.100 + import re 1.101 + 1.102 + sequences = [] 1.103 + body = {} 1.104 + 1.105 + raw_sequences = file.read().split(">") 1.106 + if len(raw_sequences) <= 1: 1.107 + raise Exception("Wrong format of fasta-file %s" % file.name) 1.108 + 1.109 + raw_sequences = raw_sequences[1:] #ignore everything before the first > 1.110 + for raw in raw_sequences: 1.111 + parsed_raw_sequence = raw.split("\n") 1.112 + parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence] 1.113 + name_and_description = parsed_raw_sequence[0] 1.114 + name_and_description = name_and_description.split(" ",1) 1.115 + if len(name_and_description) == 2: 1.116 + name, description = name_and_description 1.117 + elif len(name_and_description) == 1: 1.118 + #if there is description 1.119 + name = name_and_description[0] 1.120 + description = '' 1.121 + else: 1.122 + raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \ 1.123 + {'name': name, 'file': file.name}) 1.124 + 1.125 + if len(parsed_raw_sequence) <= 1: 1.126 + raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \ 1.127 + {'name': name, 'file': file.name}) 1.128 + string = "" 1.129 + for piece in parsed_raw_sequence[1:]: 1.130 + piece_without_whitespace_chars = re.sub("\s", "", piece) 1.131 + string += piece_without_whitespace_chars 1.132 + monomers = [] #convert into Monomer objects 1.133 + body_list = [] #create the respective list in body dict 1.134 + for current_monomer in string: 1.135 + if current_monomer not in ["-", ".", "~"]: 1.136 + monomers.append(monomer_kind.from_code1(current_monomer).instance()) 1.137 + body_list.append(monomers[-1]) 1.138 + else: 1.139 + body_list.append(None) 1.140 + s = sequence.Sequence(monomers, name, description) 1.141 + sequences.append(s) 1.142 + body[s] = body_list 1.143 + return sequences, body 1.144 + 1.145 + @staticmethod 1.146 + def from_sequences(*sequences): 1.147 + """ Constructs new alignment from sequences 1.148 + 1.149 + Add None's to right end to make equal lengthes of alignment sequences 1.150 + """ 1.151 + alignment = Alignment() 1.152 + alignment.sequences = sequences 1.153 + max_length = max(len(sequence) for sequence in sequences) 1.154 + for sequence in sequences: 1.155 + gaps_count = max_length - len(sequence) 1.156 + alignment.body[sequence] = sequence.monomers + [None] * gaps_count 1.157 + return alignment 1.158 + 1.159 + def save_fasta(self, out_file, long_line=70, gap='-'): 1.160 + """ Saves alignment to given file 1.161 + 1.162 + Splits long lines to substrings of length=long_line 1.163 + To prevent this, set long_line=None 1.164 + """ 1.165 + block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap) 1.166 + 1.167 + def muscle_align(self): 1.168 + """ Simple align ths alignment using sequences (muscle) 1.169 + 1.170 + uses old Monomers and Sequences objects 1.171 + """ 1.172 + tmp_file = NamedTemporaryFile(delete=False) 1.173 + self.save_fasta(tmp_file) 1.174 + tmp_file.close() 1.175 + os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name}) 1.176 + sequences, body = Alignment.from_fasta(open(tmp_file.name)) 1.177 + for sequence in self.sequences: 1.178 + try: 1.179 + new_sequence = [i for i in sequences if sequence==i][0] 1.180 + except: 1.181 + raise Exception("Align: Cann't find sequence %s in muscle output" % \ 1.182 + sequence.name) 1.183 + old_monomers = iter(sequence.monomers) 1.184 + self.body[sequence] = [] 1.185 + for monomer in body[new_sequence]: 1.186 + if not monomer: 1.187 + self.body[sequence].append(monomer) 1.188 + else: 1.189 + old_monomer = old_monomers.next() 1.190 + if monomer != old_monomer: 1.191 + raise Exception("Align: alignment errors") 1.192 + self.body[sequence].append(old_monomer) 1.193 + os.unlink(tmp_file.name) 1.194 + 1.195 + def column(self, sequence=None, sequences=None, original=None): 1.196 + """ returns list of columns of alignment 1.197 + 1.198 + sequence or sequences: 1.199 + if sequence is given, then column is (original_monomer, monomer) 1.200 + if sequences is given, then column is (original_monomer, {sequence: monomer}) 1.201 + if both of them are given, it is an error 1.202 + original (Sequence type): 1.203 + if given, this filters only columns represented by original sequence 1.204 + """ 1.205 + if sequence and sequences: 1.206 + raise Exception("Wrong usage. read help") 1.207 + indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)]) 1.208 + alignment = self.body.items() 1.209 + alignment.sort(key=lambda i: indexes[i[0]]) 1.210 + alignment = [monomers for seq, monomers in alignment] 1.211 + for column in zip(*alignment): 1.212 + if not original or column[indexes[original]]: 1.213 + if sequence: 1.214 + yield (column[indexes[original]], column[indexes[sequence]]) 1.215 + else: 1.216 + yield (column[indexes[original]], 1.217 + dict([(s, column[indexes[s]]) for s in sequences])) 1.218 + 1.219 + def secstr(self, sequence, pdb_chain, gap='-'): 1.220 + """ Returns string representing secondary structure """ 1.221 + return ''.join([ 1.222 + (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap) 1.223 + for m in self.body[sequence]]) 1.224 + 1.225 +class Block(object): 1.226 """ """ 1.227 pass 1.228 1.229 -class Block(list): 1.230 - """ """ 1.231 - pass 1.232 1.233 1.234 1.235 1.236 1.237 -