rev |
line source |
bnagaev@249
|
1 from fasta import save_fasta |
me@260
|
2 import data.codes |
me@260
|
3 |
me@260
|
4 class MonomerType(object): |
me@260
|
5 """Class of monomer types. |
me@260
|
6 |
me@260
|
7 Each MonomerType object represents a known monomer type, e.g. Valine, |
me@260
|
8 and is referenced to by each instance of monomer in a given sequence. |
me@260
|
9 |
me@260
|
10 - `name`: full name of monomer type |
me@260
|
11 - `code1`: one-letter code |
me@260
|
12 - `code3`: three-letter code |
me@260
|
13 - `is_modified`: either of True or False |
me@260
|
14 |
me@260
|
15 class atributes: |
me@260
|
16 |
me@260
|
17 - `by_code1`: a mapping from one-letter code to MonomerType object |
me@260
|
18 - `by_code3`: a mapping from three-letter code to MonomerType object |
me@260
|
19 - `by_name`: a mapping from monomer name to MonomerType object |
me@260
|
20 - `instance_type`: class of Monomer objects to use when creating new |
me@260
|
21 objects; this must be redefined in descendent classes |
me@260
|
22 |
me@260
|
23 All of the class attributes MUST be redefined when subclassing. |
me@260
|
24 """ |
me@260
|
25 |
me@260
|
26 by_code1 = {} |
me@260
|
27 by_code3 = {} |
me@260
|
28 by_name = {} |
me@260
|
29 instance_type = None |
me@260
|
30 |
me@260
|
31 def __init__(self, name="", code1="", code3="", is_modified=False): |
me@260
|
32 self.name = name.capitalize() |
me@260
|
33 self.code1 = code1.upper() |
me@260
|
34 self.code3 = code3.upper() |
me@260
|
35 self.is_modified = bool(is_modified) |
me@260
|
36 if not is_modified: |
me@260
|
37 self.by_code1[self.code1] = self |
me@260
|
38 self.by_code3[code3] = self |
me@260
|
39 self.by_name[name] = self |
me@260
|
40 # We duplicate distinguished long names into MonomerType itself, |
me@260
|
41 # so that we can use MonomerType.from_code3 to create the relevant |
me@260
|
42 # type of monomer. |
me@260
|
43 MonomerType.by_code3[code3] = self |
me@260
|
44 MonomerType.by_name[name] = self |
me@260
|
45 |
me@260
|
46 @classmethod |
me@260
|
47 def _initialize(cls, type_letter, codes=data.codes.codes): |
me@260
|
48 """Create all relevant instances of MonomerType. |
me@260
|
49 |
me@260
|
50 `type_letter` is either of: |
me@260
|
51 |
me@260
|
52 - 'p' for protein |
me@260
|
53 - 'd' for DNA |
me@260
|
54 - 'r' for RNA |
me@260
|
55 |
me@260
|
56 `codes` is a table of monomer codes |
me@260
|
57 """ |
me@260
|
58 for type, code1, is_modified, code3, name in codes: |
me@260
|
59 if type == type_letter: |
me@260
|
60 cls(name, code1, code3, is_modified) |
me@260
|
61 |
me@260
|
62 @classmethod |
me@260
|
63 def from_code1(cls, code1): |
me@260
|
64 """Return monomer type by one-letter code.""" |
me@260
|
65 return cls.by_code1[code1.upper()] |
me@260
|
66 |
me@260
|
67 @classmethod |
me@260
|
68 def from_code3(cls, code3): |
me@260
|
69 """Return monomer type by three-letter code.""" |
me@260
|
70 return cls.by_code3[code3.upper()] |
me@260
|
71 |
me@260
|
72 @classmethod |
me@260
|
73 def from_name(cls, name): |
me@260
|
74 """Return monomer type by name.""" |
me@260
|
75 return cls.by_name[name.capitalize()] |
me@260
|
76 |
me@260
|
77 def instance(self): |
me@260
|
78 """Create a new monomer of given type.""" |
me@260
|
79 return self.instance_type(self) |
me@260
|
80 |
me@260
|
81 def __eq__(self, other): |
me@260
|
82 if hasattr(other, "type"): |
me@260
|
83 return self is other.type |
me@260
|
84 return self is other |
me@260
|
85 |
me@260
|
86 class Monomer(object): |
me@260
|
87 """Monomer object. |
me@260
|
88 |
me@260
|
89 attributes: |
me@260
|
90 |
me@260
|
91 - `type`: type of monomer (a MonomerType object) |
me@260
|
92 |
me@260
|
93 class attribute `monomer_type` is MonomerType or either of it's subclasses, |
me@260
|
94 it is used when creating new monomers. It MUST be redefined when subclassing Monomer. |
me@260
|
95 """ |
me@260
|
96 monomer_type = MonomerType |
me@260
|
97 |
me@260
|
98 def __init__(self, type): |
me@260
|
99 self.type = type |
me@260
|
100 |
me@260
|
101 @classmethod |
me@260
|
102 def from_code1(cls, code1): |
me@260
|
103 return cls(cls.monomer_type.by_code1[code1.upper()]) |
me@260
|
104 |
me@260
|
105 @classmethod |
me@260
|
106 def from_code3(cls, code3): |
me@260
|
107 return cls(cls.monomer_type.by_code3[code3.upper()]) |
me@260
|
108 |
me@260
|
109 @classmethod |
me@260
|
110 def from_name(cls, name): |
me@260
|
111 return cls(cls.monomer_type.by_name[name.capitalize()]) |
me@260
|
112 |
me@260
|
113 def __eq__(self, other): |
me@260
|
114 if hasattr(other, "type"): |
me@260
|
115 return self.type is other.type |
me@260
|
116 return self.type is other |
bnagaev@239
|
117 |
bnagaev@239
|
118 class Sequence(list): |
bnagaev@243
|
119 """ Sequence of Monomers |
bnagaev@243
|
120 |
bnagaev@243
|
121 list of monomer objects |
bnagaev@243
|
122 |
bnagaev@243
|
123 Mandatory data: |
bnagaev@243
|
124 * name -- str with the name of sequence |
bnagaev@243
|
125 * description -- str with description of the sequence |
bnagaev@243
|
126 """ |
bnagaev@243
|
127 pass |
bnagaev@243
|
128 |
bnagaev@249
|
129 class Alignment(dict): |
bnagaev@249
|
130 """ Alignment |
bnagaev@249
|
131 |
bnagaev@249
|
132 {<Sequence object>:[<Monomer object>,None,<Monomer object>]} |
bnagaev@249
|
133 keys are the Sequence objects, values are the lists, which |
bnagaev@249
|
134 contain monomers of those sequences or None for gaps in the |
bnagaev@249
|
135 corresponding sequence of alignment |
bnagaev@249
|
136 """ |
bnagaev@249
|
137 # _sequences -- list of Sequence objects. Sequences don't contain gaps |
bnagaev@249
|
138 # - see sequence.py module |
bnagaev@249
|
139 |
bnagaev@249
|
140 def __init__(self, *args): |
bnagaev@249
|
141 """overloaded constructor |
bnagaev@249
|
142 |
bnagaev@249
|
143 Alignment() -> new empty Alignment |
bnagaev@249
|
144 Alignment(sequences, body) -> new Alignment with sequences and |
bnagaev@249
|
145 body initialized from arguments |
bnagaev@249
|
146 Alignment(fasta_file) -> new Alignment, read body and sequences |
bnagaev@249
|
147 from fasta file |
bnagaev@249
|
148 |
bnagaev@249
|
149 """ |
bnagaev@249
|
150 if len(args)>1:#overloaded constructor |
bnagaev@249
|
151 self.sequences=args[0] |
bnagaev@249
|
152 self.body=args[1] |
bnagaev@249
|
153 elif len(args)==0: |
bnagaev@249
|
154 self.sequences=[] |
bnagaev@249
|
155 self.body={} |
bnagaev@249
|
156 else: |
bnagaev@249
|
157 self.sequences, self.body = Alignment.from_fasta(args[0]) |
bnagaev@249
|
158 |
bnagaev@249
|
159 def length(self): |
bnagaev@249
|
160 """ Returns width, ie length of each sequence with gaps """ |
bnagaev@249
|
161 return max([len(line) for line in self.body.values()]) |
bnagaev@249
|
162 |
bnagaev@249
|
163 def height(self): |
bnagaev@249
|
164 """ The number of sequences in alignment (it's thickness). """ |
bnagaev@249
|
165 return len(self.body) |
bnagaev@249
|
166 |
bnagaev@249
|
167 def identity(self): |
bnagaev@249
|
168 """ Calculate the identity of alignment positions for colouring. |
bnagaev@249
|
169 |
bnagaev@249
|
170 For every (row, column) in alignment the percentage of the exactly |
bnagaev@249
|
171 same residue in the same column in the alignment is calculated. |
bnagaev@249
|
172 The data structure is just like the Alignment.body, but istead of |
bnagaev@249
|
173 monomers it contains float percentages. |
bnagaev@249
|
174 """ |
bnagaev@249
|
175 # Oh, God, that's awful! Absolutely not understandable. |
bnagaev@249
|
176 # First, calculate percentages of amino acids in every column |
bnagaev@249
|
177 contribution = 1.0 / len(self.sequences) |
bnagaev@249
|
178 all_columns = [] |
bnagaev@249
|
179 for position in range(len(self)): |
bnagaev@249
|
180 column_percentage = {} |
bnagaev@249
|
181 for seq in self.body: |
bnagaev@249
|
182 if self.body[seq][position] is not None: |
bnagaev@249
|
183 aa = self.body[seq][position].code |
bnagaev@249
|
184 else: |
bnagaev@249
|
185 aa = None |
bnagaev@249
|
186 if aa in allpy.data.amino_acids: |
bnagaev@249
|
187 if aa in column_percentage.keys(): |
bnagaev@249
|
188 column_percentage[aa] += contribution |
bnagaev@249
|
189 else: |
bnagaev@249
|
190 column_percentage[aa] = contribution |
bnagaev@249
|
191 all_columns.append(column_percentage) |
bnagaev@249
|
192 # Second, map these percentages onto the alignment |
bnagaev@249
|
193 self.identity_percentages = {} |
bnagaev@249
|
194 for seq in self.sequences: |
bnagaev@249
|
195 self.identity_percentages[seq] = [] |
bnagaev@249
|
196 for seq in self.identity_percentages: |
bnagaev@249
|
197 line = self.identity_percentages[seq] |
bnagaev@249
|
198 for position in range(len(self)): |
bnagaev@249
|
199 if self.body[seq][position] is not None: |
bnagaev@249
|
200 aa = self.body[seq][position].code |
bnagaev@249
|
201 else: |
bnagaev@249
|
202 aa = None |
bnagaev@249
|
203 line.append(all_columns[position].get(aa)) |
bnagaev@249
|
204 return self.identity_percentages |
bnagaev@249
|
205 |
bnagaev@249
|
206 @staticmethod |
bnagaev@249
|
207 def from_fasta(file, monomer_kind=AminoAcidType): |
bnagaev@249
|
208 """ Import data from fasta file |
bnagaev@249
|
209 |
bnagaev@249
|
210 monomer_kind is class, inherited from MonomerType |
bnagaev@249
|
211 |
bnagaev@249
|
212 >>> import alignment |
bnagaev@249
|
213 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta")) |
bnagaev@249
|
214 """ |
bnagaev@249
|
215 import re |
bnagaev@249
|
216 |
bnagaev@249
|
217 sequences = [] |
bnagaev@249
|
218 body = {} |
bnagaev@249
|
219 |
bnagaev@249
|
220 raw_sequences = file.read().split(">") |
bnagaev@249
|
221 if len(raw_sequences) <= 1: |
bnagaev@249
|
222 raise Exception("Wrong format of fasta-file %s" % file.name) |
bnagaev@249
|
223 |
bnagaev@249
|
224 raw_sequences = raw_sequences[1:] #ignore everything before the first > |
bnagaev@249
|
225 for raw in raw_sequences: |
bnagaev@249
|
226 parsed_raw_sequence = raw.split("\n") |
bnagaev@249
|
227 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence] |
bnagaev@249
|
228 name_and_description = parsed_raw_sequence[0] |
bnagaev@249
|
229 name_and_description = name_and_description.split(" ",1) |
bnagaev@249
|
230 if len(name_and_description) == 2: |
bnagaev@249
|
231 name, description = name_and_description |
bnagaev@249
|
232 elif len(name_and_description) == 1: |
bnagaev@249
|
233 #if there is description |
bnagaev@249
|
234 name = name_and_description[0] |
bnagaev@249
|
235 description = '' |
bnagaev@249
|
236 else: |
bnagaev@249
|
237 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \ |
bnagaev@249
|
238 {'name': name, 'file': file.name}) |
bnagaev@249
|
239 |
bnagaev@249
|
240 if len(parsed_raw_sequence) <= 1: |
bnagaev@249
|
241 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \ |
bnagaev@249
|
242 {'name': name, 'file': file.name}) |
bnagaev@249
|
243 string = "" |
bnagaev@249
|
244 for piece in parsed_raw_sequence[1:]: |
bnagaev@249
|
245 piece_without_whitespace_chars = re.sub("\s", "", piece) |
bnagaev@249
|
246 string += piece_without_whitespace_chars |
bnagaev@249
|
247 monomers = [] #convert into Monomer objects |
bnagaev@249
|
248 body_list = [] #create the respective list in body dict |
bnagaev@249
|
249 for current_monomer in string: |
bnagaev@249
|
250 if current_monomer not in ["-", ".", "~"]: |
bnagaev@249
|
251 monomers.append(monomer_kind.from_code1(current_monomer).instance()) |
bnagaev@249
|
252 body_list.append(monomers[-1]) |
bnagaev@249
|
253 else: |
bnagaev@249
|
254 body_list.append(None) |
bnagaev@249
|
255 s = sequence.Sequence(monomers, name, description) |
bnagaev@249
|
256 sequences.append(s) |
bnagaev@249
|
257 body[s] = body_list |
bnagaev@249
|
258 return sequences, body |
bnagaev@249
|
259 |
bnagaev@249
|
260 @staticmethod |
bnagaev@249
|
261 def from_sequences(*sequences): |
bnagaev@249
|
262 """ Constructs new alignment from sequences |
bnagaev@249
|
263 |
bnagaev@249
|
264 Add None's to right end to make equal lengthes of alignment sequences |
bnagaev@249
|
265 """ |
bnagaev@249
|
266 alignment = Alignment() |
bnagaev@249
|
267 alignment.sequences = sequences |
bnagaev@249
|
268 max_length = max(len(sequence) for sequence in sequences) |
bnagaev@249
|
269 for sequence in sequences: |
bnagaev@249
|
270 gaps_count = max_length - len(sequence) |
bnagaev@249
|
271 alignment.body[sequence] = sequence.monomers + [None] * gaps_count |
bnagaev@249
|
272 return alignment |
bnagaev@249
|
273 |
bnagaev@249
|
274 def save_fasta(self, out_file, long_line=70, gap='-'): |
bnagaev@249
|
275 """ Saves alignment to given file |
bnagaev@249
|
276 |
bnagaev@249
|
277 Splits long lines to substrings of length=long_line |
bnagaev@249
|
278 To prevent this, set long_line=None |
bnagaev@249
|
279 """ |
bnagaev@249
|
280 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap) |
bnagaev@249
|
281 |
bnagaev@249
|
282 def muscle_align(self): |
bnagaev@249
|
283 """ Simple align ths alignment using sequences (muscle) |
bnagaev@249
|
284 |
bnagaev@249
|
285 uses old Monomers and Sequences objects |
bnagaev@249
|
286 """ |
bnagaev@249
|
287 tmp_file = NamedTemporaryFile(delete=False) |
bnagaev@249
|
288 self.save_fasta(tmp_file) |
bnagaev@249
|
289 tmp_file.close() |
bnagaev@249
|
290 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name}) |
bnagaev@249
|
291 sequences, body = Alignment.from_fasta(open(tmp_file.name)) |
bnagaev@249
|
292 for sequence in self.sequences: |
bnagaev@249
|
293 try: |
bnagaev@249
|
294 new_sequence = [i for i in sequences if sequence==i][0] |
bnagaev@249
|
295 except: |
bnagaev@249
|
296 raise Exception("Align: Cann't find sequence %s in muscle output" % \ |
bnagaev@249
|
297 sequence.name) |
bnagaev@249
|
298 old_monomers = iter(sequence.monomers) |
bnagaev@249
|
299 self.body[sequence] = [] |
bnagaev@249
|
300 for monomer in body[new_sequence]: |
bnagaev@249
|
301 if not monomer: |
bnagaev@249
|
302 self.body[sequence].append(monomer) |
bnagaev@249
|
303 else: |
bnagaev@249
|
304 old_monomer = old_monomers.next() |
bnagaev@249
|
305 if monomer != old_monomer: |
bnagaev@249
|
306 raise Exception("Align: alignment errors") |
bnagaev@249
|
307 self.body[sequence].append(old_monomer) |
bnagaev@249
|
308 os.unlink(tmp_file.name) |
bnagaev@249
|
309 |
bnagaev@249
|
310 def column(self, sequence=None, sequences=None, original=None): |
bnagaev@249
|
311 """ returns list of columns of alignment |
bnagaev@249
|
312 |
bnagaev@249
|
313 sequence or sequences: |
bnagaev@249
|
314 if sequence is given, then column is (original_monomer, monomer) |
bnagaev@249
|
315 if sequences is given, then column is (original_monomer, {sequence: monomer}) |
bnagaev@249
|
316 if both of them are given, it is an error |
bnagaev@249
|
317 original (Sequence type): |
bnagaev@249
|
318 if given, this filters only columns represented by original sequence |
bnagaev@249
|
319 """ |
bnagaev@249
|
320 if sequence and sequences: |
bnagaev@249
|
321 raise Exception("Wrong usage. read help") |
bnagaev@249
|
322 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)]) |
bnagaev@249
|
323 alignment = self.body.items() |
bnagaev@249
|
324 alignment.sort(key=lambda i: indexes[i[0]]) |
bnagaev@249
|
325 alignment = [monomers for seq, monomers in alignment] |
bnagaev@249
|
326 for column in zip(*alignment): |
bnagaev@249
|
327 if not original or column[indexes[original]]: |
bnagaev@249
|
328 if sequence: |
bnagaev@249
|
329 yield (column[indexes[original]], column[indexes[sequence]]) |
bnagaev@249
|
330 else: |
bnagaev@249
|
331 yield (column[indexes[original]], |
bnagaev@249
|
332 dict([(s, column[indexes[s]]) for s in sequences])) |
bnagaev@249
|
333 |
bnagaev@249
|
334 def secstr(self, sequence, pdb_chain, gap='-'): |
bnagaev@249
|
335 """ Returns string representing secondary structure """ |
bnagaev@249
|
336 return ''.join([ |
bnagaev@249
|
337 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap) |
bnagaev@249
|
338 for m in self.body[sequence]]) |
bnagaev@249
|
339 |
bnagaev@249
|
340 class Block(object): |
bnagaev@239
|
341 """ """ |
bnagaev@239
|
342 pass |
bnagaev@239
|
343 |
me@260
|
344 # vim: set ts=4 sts=4 sw=4 et: |