Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/08494fb2c47a/lib/alignment.py
Дата изменения: Unknown
Дата индексирования: Mon Feb 4 04:23:25 2013
Кодировка:
allpy: 08494fb2c47a lib/alignment.py

allpy

view lib/alignment.py @ 190:08494fb2c47a

repeats: visitors for visiting tree items (almost) completed (tree from 10 Nov 2010 of https://docs1.google.com/document/d/1-7kCo7cdYTpHOtPXGMdSG9wxoXh6sIvrNyYyZuCKECs/edit?hl=en#)
author boris (netbook) <bnagaev@gmail.com>
date Thu, 18 Nov 2010 00:01:07 +0300
parents e4f3a2022b08
children
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 from monomer import AminoAcidType
12 import allpy_data
13 from tempfile import NamedTemporaryFile
14 import os
15 import block
16 from fasta import save_fasta
18 class Alignment(object):
19 """ Alignment
21 Mandatory data:
22 * body -- dict
23 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
24 keys are the Sequence objects, values are the lists, which
25 contain monomers of those sequences or None for gaps in the
26 corresponding sequence of
27 alignment
29 """
30 # _sequences -- list of Sequence objects. Sequences don't contain gaps
31 # - see sequence.py module
33 def __init__(self, *args):
34 """overloaded constructor
36 Alignment() -> new empty Alignment
37 Alignment(sequences, body) -> new Alignment with sequences and
38 body initialized from arguments
39 Alignment(fasta_file) -> new Alignment, read body and sequences
40 from fasta file
42 """
43 if len(args)>1:#overloaded constructor
44 self.sequences=args[0]
45 self.body=args[1]
46 elif len(args)==0:
47 self.sequences=[]
48 self.body={}
49 else:
50 self.sequences, self.body = Alignment.from_fasta(args[0])
52 def length(self):
53 """ Returns width, ie length of each sequence with gaps """
54 return max([len(line) for line in self.body.values()])
56 def height(self):
57 """ The number of sequences in alignment (it's thickness). """
58 return len(self.body)
60 def identity(self):
61 """ Calculate the identity of alignment positions for colouring.
63 For every (row, column) in alignment the percentage of the exactly
64 same residue in the same column in the alignment is calculated.
65 The data structure is just like the Alignment.body, but istead of
66 monomers it contains float percentages.
67 """
68 # Oh, God, that's awful! Absolutely not understandable.
69 # First, calculate percentages of amino acids in every column
70 contribution = 1.0 / len(self.sequences)
71 all_columns = []
72 for position in range(len(self)):
73 column_percentage = {}
74 for seq in self.body:
75 if self.body[seq][position] is not None:
76 aa = self.body[seq][position].code
77 else:
78 aa = None
79 if aa in allpy_data.amino_acids:
80 if aa in column_percentage.keys():
81 column_percentage[aa] += contribution
82 else:
83 column_percentage[aa] = contribution
84 all_columns.append(column_percentage)
85 # Second, map these percentages onto the alignment
86 self.identity_percentages = {}
87 for seq in self.sequences:
88 self.identity_percentages[seq] = []
89 for seq in self.identity_percentages:
90 line = self.identity_percentages[seq]
91 for position in range(len(self)):
92 if self.body[seq][position] is not None:
93 aa = self.body[seq][position].code
94 else:
95 aa = None
96 line.append(all_columns[position].get(aa))
97 return self.identity_percentages
99 @staticmethod
100 def from_fasta(file, monomer_kind=AminoAcidType):
101 """ Import data from fasta file
103 monomer_kind is class, inherited from MonomerType
105 >>> import alignment
106 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
107 """
108 import re
110 sequences = []
111 body = {}
113 raw_sequences = file.read().split(">")
114 if len(raw_sequences) <= 1:
115 raise Exception("Wrong format of fasta-file %s" % file.name)
117 raw_sequences = raw_sequences[1:] #ignore everything before the first >
118 for raw in raw_sequences:
119 parsed_raw_sequence = raw.split("\n")
120 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
121 name_and_description = parsed_raw_sequence[0]
122 name_and_description = name_and_description.split(" ",1)
123 if len(name_and_description) == 2:
124 name, description = name_and_description
125 elif len(name_and_description) == 1:
126 #if there is description
127 name = name_and_description[0]
128 description = ''
129 else:
130 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
131 {'name': name, 'file': file.name})
133 if len(parsed_raw_sequence) <= 1:
134 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
135 {'name': name, 'file': file.name})
136 string = ""
137 for piece in parsed_raw_sequence[1:]:
138 piece_without_whitespace_chars = re.sub("\s", "", piece)
139 string += piece_without_whitespace_chars
140 monomers = [] #convert into Monomer objects
141 body_list = [] #create the respective list in body dict
142 for current_monomer in string:
143 if current_monomer not in ["-", ".", "~"]:
144 monomers.append(monomer_kind.from_code1(current_monomer).instance())
145 body_list.append(monomers[-1])
146 else:
147 body_list.append(None)
148 s = sequence.Sequence(monomers, name, description)
149 sequences.append(s)
150 body[s] = body_list
151 return sequences, body
154 @staticmethod
155 def from_sequences(*sequences):
156 """ Constructs new alignment from sequences
158 Add None's to right end to make equal lengthes of alignment sequences
159 """
160 alignment = Alignment()
161 alignment.sequences = sequences
162 max_length = max(len(sequence) for sequence in sequences)
163 for sequence in sequences:
164 gaps_count = max_length - len(sequence)
165 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
166 return alignment
168 def save_fasta(self, out_file, long_line=70, gap='-'):
169 """ Saves alignment to given file
171 Splits long lines to substrings of length=long_line
172 To prevent this, set long_line=None
173 """
174 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
176 def muscle_align(self):
177 """ Simple align ths alignment using sequences (muscle)
179 uses old Monomers and Sequences objects
180 """
181 tmp_file = NamedTemporaryFile(delete=False)
182 self.save_fasta(tmp_file)
183 tmp_file.close()
184 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
185 sequences, body = Alignment.from_fasta(open(tmp_file.name))
186 for sequence in self.sequences:
187 try:
188 new_sequence = [i for i in sequences if sequence==i][0]
189 except:
190 raise Exception("Align: Cann't find sequence %s in muscle output" % \
191 sequence.name)
192 old_monomers = iter(sequence.monomers)
193 self.body[sequence] = []
194 for monomer in body[new_sequence]:
195 if not monomer:
196 self.body[sequence].append(monomer)
197 else:
198 old_monomer = old_monomers.next()
199 if monomer != old_monomer:
200 raise Exception("Align: alignment errors")
201 self.body[sequence].append(old_monomer)
202 os.unlink(tmp_file.name)
205 def column(self, sequence=None, sequences=None, original=None):
206 """ returns list of columns of alignment
208 sequence or sequences:
209 if sequence is given, then column is (original_monomer, monomer)
210 if sequences is given, then column is (original_monomer, {sequence: monomer})
211 if both of them are given, it is an error
212 original (Sequence type):
213 if given, this filters only columns represented by original sequence
214 """
215 if sequence and sequences:
216 raise Exception("Wrong usage. read help")
217 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
218 alignment = self.body.items()
219 alignment.sort(key=lambda i: indexes[i[0]])
220 alignment = [monomers for seq, monomers in alignment]
221 for column in zip(*alignment):
222 if not original or column[indexes[original]]:
223 if sequence:
224 yield (column[indexes[original]], column[indexes[sequence]])
225 else:
226 yield (column[indexes[original]],
227 dict([(s, column[indexes[s]]) for s in sequences]))
230 def secstr(self, sequence, pdb_chain, gap='-'):
231 """ Returns string representing secondary structure """
232 return ''.join([
233 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
234 for m in self.body[sequence]])
236 def save_secstr(self, out_file, sequence, pdb_chain,
237 name, description='', gap='-', long_line=70):
238 """ Save secondary structure and name in fasta format """
239 save_fasta(out_file, self.secstr(sequence, pdb_chain, gap), name, description, long_line)