Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/tip/allpy/homology.py
Дата изменения: Unknown
Дата индексирования: Tue Apr 12 07:43:56 2016
Кодировка:
allpy: b556c96c6719 allpy/homology.py

allpy

view allpy/homology.py @ 1168:b556c96c6719

blocks3d/www Makefile: never check certificates of github, they are too confusing for wget
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 26 May 2014 17:20:29 +0400
parents 922712705968
children 83167a09a19d
line source
1 from allpy import protein
2 from allpy import markups
7 class MonomerHomology(object):
8 """ Essentially, alignment is a set of monomer homology classes.
9 Object of type MonomerHomology desribes an alignment in terms of monomer homology.
10 Each monomer of any sequence from an alignment must be contained in a homology class
12 Homology of nonequal monomers of the same sequence is allowed (formally) in attributes,
13 but there is no implemented methods to deal with
15 DATA:
16 monomer_ids = {monomer_id:class_id}
17 monomer_id = (sequence_name, monomer_number)
18 monomer_number is a number of the monomer in the sequence
19 class_id = string
20 classes = {class_id:[monomer_id1, monomer_id2,...], ...}
21 columns = {class_id:column_number}
22 blocks_data = [block_data, ...]
23 block_data = (sequence_names,class_ids)
24 sequence_names = set( [sequence_1_name,...] )
25 class_ids = = set( [class_1_id, ...] )
27 block_data contain identifiers of objects only, alignment object is not needed.
28 block_data may be converted to bloks of an alignment by appropriate method
30 METHODS:
31 + .read(file_name, columns = True)
32 Reads file of monomer homology format into MonomerHomology object.
33 + .write_monomer(file,monomer_id,monomer_homology_class_id, column_number = column_number) STATIC
34 + .write_class(file,list_of_monomer_ids,monomer_homology_class_id,column_number = column_number) STATIC
35 + .write_block(file, alignment, block)
36 + .add_monomer(monomer_id,class_id,column_number)
37 + .add_class(monomer_ids,class_id, column_number)
38 + .add_block(alignment, block)
39 + .compare_with(class)
40 + .two_vs_one(classes_one, classes_two) STATIC
41 + .highest_blocks()
42 + .alignment_blocks(alignment)
43 + .case_homology(in_file_name,out_file_name, case) STATIC
44 """
45 def __init__(self, next_class_id = 1):
46 self.classes = {}
47 self.monomer_ids = {}
48 self.columns = {}
49 self.blocks_data = []
50 self.next_class_id = next_class_id
52 ################################################################
54 def read(self, file_name, columns = False):
55 """
56 Monomer_homology file format is as follows
57 One line corresponds to one monomer.
58 Thus, number of lines with information is equal to the sum of all sequences lengths
59 Three obligatory fields are in each line:
60 - homology_class_id (string) arbitrary
61 - sequence_id (string) sequence.name is used as sequence id
62 - monomer_number (integer) number in the sequence, 1, 2, ...
63 If columns:
64 - column_number
65 Everything from "#" to the line end is ignored
66 Fields are delimited by white spaces
67 Empty lines (only white speces in the line) are skipped
69 RETURN total number of monomers
70 """
71 try:
72 f = open(file_name)
73 except:
74 raise Exception("Failed to open file %s" % file_name)
75 exit()
77 monomers_count = 0
79 for line in f:
80 line = line.strip()
81 if len(line) == 0:
82 continue
83 line = line.split('#')[0]
84 if len(line) == 0:
85 continue
86 line_split = line.split()
88 try:
89 class_id = line_split[0]
90 sequence_id = line_split[1]
91 monomer_number = int(line_split[2])
92 if columns:
93 column_number = int(line_split[3])
94 except:
95 raise Exception("Wrong format in line %s of file %s! Line skipped" % (line, file_name))
96 continue
98 self.monomer_ids[(sequence_id,monomer_number)] = class_id
100 if class_id in self.classes:
101 self.classes[class_id].append( (sequence_id,monomer_number) )
102 else:
103 self.classes[class_id] = [(sequence_id,monomer_number)]
105 if columns:
106 if not (class_id in self.columns):
107 self.columns[class_id] = column_number
109 monomers_count += 1
110 f.close()
111 return monomers_count
113 ################################################################
115 @staticmethod
116 def two_vs_one(classes_one, classes_two):
117 """ Computes weighted average number of mistakes of classes_two with respect to classes_one
118 May be used as external function
119 See compare_with help for more details
120 Actually, sizes of intersections are computed and store in class_one_splits but not used
121 """
122 classes_one_size = len(classes_one.monomer_ids)
123 classes_two_size = len(classes_two.monomer_ids)
126 if classes_one_size != classes_two_size:
127 raise Exception("Comparing sets of monomers are not equal! Check input data.")
128 exit()
130 weighted_mistakes = 0.0
133 for class_one in classes_one.classes:
135 class_one_size = len(classes_one.classes[class_one])
136 class_one_splits = {}
138 class_one_monomer_ids = classes_one.classes[class_one]
141 for monomer_id in class_one_monomer_ids:
144 try:
145 class_two_id = classes_two.monomer_ids[monomer_id]
146 except:
147 raise Exception("Comparing sets of monomers are not equal!! Check input data.")
148 exit()
150 if class_two_id in class_one_splits:
151 class_one_splits[class_two_id] += 1
152 else:
153 class_one_splits[class_two_id] = 1
154 splitting_number = len(class_one_splits)
155 weighted_mistakes += float(splitting_number-1)* float(class_one_size)/float(classes_one_size)
157 return weighted_mistakes
160 def compare_with(self,other):
161 """Compares two monomer homology classes made on the same set of sequences: self and other
162 Both sets of classes MUST contain isomorfic sets of monomer_ids
164 RETURN two weighted averege number of mistakes: other vs self and self vs other
165 Number of mistakes other vs self in a homology class of self is
166 number of classes of other intersecting with the class of self minus one
167 In other words, one mistake of other is splitting a class of self
168 (believed to be true monomer homology class) into two classes
169 Wieght of a class of self is size of the class divided by total number of monomeres.
171 Different mesures and outputs may be implemented later.
172 """
174 mistakes_two_vs_one = MonomerHomology.two_vs_one(self, other) # mistakes are splitting self by intersections with other
175 mistakes_one_vs_two = MonomerHomology.two_vs_one(other, self) # mistakes are splitting other by intersections with self
177 return (mistakes_two_vs_one, mistakes_one_vs_two)
179 ################################################################
181 @staticmethod
182 def write_class(file,monomer_ids,class_id, column_number = False):
183 """ Writes list of monomer_ids forming one homology_class
184 """
185 for monomer_id in monomer_ids:
186 MonomerHomology.write_monomer(file,monomer_id,class_id, column_number = column_number)
189 def add_class(self,monomer_ids,class_id, column_number):
190 """ Adds list of monomer_ids forming one homology_class
191 """
192 for monomer_id in monomer_ids:
193 self.add_monomer(monomer_id,class_id, column_number)
198 #################################################################
200 def add_monomer(self,monomer_id,class_id,column_number):
201 """ Add one monomer to Homology
202 """
203 if len(monomer_id) != 2:
204 raise Exception("wrong parameters given for Monomer_homology.write_monomer: len(monomer_id) is not 2!")
205 exit()
206 self.monomer_ids[monomer_id] = class_id
208 if class_id in self.classes:
209 self.classes[class_id].append(monomer_id)
210 else:
211 self.classes[class_id] = [monomer_id]
213 self.columns[class_id] = column_number
215 @staticmethod
216 def write_monomer(file, monomer_id, class_id, column_number = False):
217 """ Write a line "class_id sequence_id monome number \n" into file
218 """
219 if len(monomer_id) != 2:
220 raise Exception("wrong parameters given for Monomer_homology.write_monomer: len(monomer_id) is not 2!")
221 exit()
222 try:
223 if column_number:
224 file.write("%s\t%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1], column_number))
225 else:
226 file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) )
228 except:
229 raise Exception("Failed to write monomer into file!")
230 exit()
233 #################################################################
235 def add_block(self,alignment, block):
236 """ Add classes of plus-block to Homology
237 """
238 for column in block.columns:
239 if len(column) == 0:
240 continue
242 try:
243 column_number = alignment.markups['number'][column]
244 except:
245 raise Exception("Alignment column has no number! Alignment must be marked up by numbers before .add_block")
246 exit()
248 for sequence in block.sequences:
249 if sequence in column:
251 try:
252 monomer_id = (sequence.name, column[sequence].number)
253 except:
254 raise Exception("Monomer has no number! Sequences must be marked up by numbers before .writ_block")
255 exit()
257 self.add_monomer(monomer_id,self.next_class_id, column_number)
259 self.next_class_id +=1
265 def write_block(self,file,alignment, block, markedup = True):
266 """ Writes each block column into as one homology class
267 WARRNINGS:
268 (1) method works in object of class MonomerHomology
269 (2) all sequences MUST BE MARKED UP with resudue NUMBERS!
270 (3) automatic class ids (actually, numbers) are assigned to homology classes
271 (4) if markedup, then block MUST be marked up by column numbers
273 Block may contain "gaps"
274 """
275 for column in block.columns:
277 if len(column) == 0:
278 continue
280 if markedup:
281 column_number = alignment.markups['number'][column]
282 else:
283 column_number = False
285 for sequence in block.sequences:
287 if sequence in column:
289 try:
290 monomer_id = (sequence.name, column[sequence].number)
291 except:
292 raise Exception("Monomer has no number! Sequences must be marked up by numbers before .writ_block")
293 exit()
294 MonomerHomology.write_monomer(file,monomer_id,self.next_class_id, column_number = column_number)
296 self.next_class_id +=1
298 #################################################################
300 def _sequences_of_class(self,monomer_ids):
301 """ RETURN set of sequences from a list of monomer_ids
302 """
303 set_sequences = set()
304 for x in monomer_ids:
305 set_sequences.add(x[0])
306 return set_sequences
309 def highest_blocks(self):
310 """ Makes blocks of maximal possible heights from monomer homology classes
311 block = set of homology classes with the same set of sequences
312 block MUST BE marked up by column numbers
314 blocks_data = [block_data, ...]
315 block_data = (sequence_names,class_ids)
316 sequence_names = set( [sequence_1_name,...] )
317 class_ids = = set( [class_1_id, ...] )
320 RETURN: self.blocks_data
322 """
323 class_ids = list(self.classes)
324 class_ids.sort()
325 self.blocks_data = []
326 block_number = 0
328 for i in range(len(class_ids)):
329 class_id_1 = class_ids[i]
331 if type(class_id_1) == type(tuple()):
332 continue
334 block_number += 1
335 try:
336 column_number = self.columns[class_id_1]
337 except:
338 raise Exception("Fail to get column_number! Homology classes must be marked to creats highest blocks")
340 # NEW BLOCK
342 set_sequences_1 = self._sequences_of_class(self.classes[class_id_1])
344 self.blocks_data.append( (set_sequences_1, set([class_id_1]) ) )
345 class_ids[i] = (class_id_1,i)
347 # ADD classes to BLOCK
348 for j in range(i+1,len(class_ids)):
349 class_id_2 = class_ids[j]
351 if type(class_id_2) == type(tuple()):
352 continue
354 set_sequences_2 = self._sequences_of_class(self.classes[class_id_2])
356 if set_sequences_2 == set_sequences_1:
357 try:
358 column_number = self.columns[class_id_2]
359 except:
360 raise Exception("Fail to get column_number! Homology classes must be marked to creats hiest blocks")
361 class_ids[j] = (class_id_2, j)
362 self.blocks_data[len(self.blocks_data)-1][1].add( class_id_2)
364 return self.blocks_data
366 ################################################################
368 def alignment_blocks(self, alignment):
369 """ From homology.blocks creates a list of alignment blocks (accessible for visualization)
371 blocks_data = [block_data, ...]
372 block_data = (sequence_names,class_ids)
373 sequence_names = set( [sequence_1_name,...] )
374 class_ids = = set( [class_1_id, ...] )
375 RETURN blocks, list of blocks in alignment
377 """
378 blocks = []
379 dict_sequences = {}
380 for sequence in alignment.sequences:
381 dict_sequences[sequence.name]=sequence
383 dict_columns = {}
384 for column in alignment.columns:
385 try:
386 dict_columns[alignment.markups['number'][column]] = column
387 except:
388 raise Exception("alignment columns are not marked up by numbers! Can't creat block from blocks data")
389 exit()
391 for block_data in self.blocks_data:
392 set_sequence_names = block_data[0]
393 set_class_ids = block_data[1]
395 block_sequences = []
397 for sequence_name in set_sequence_names:
399 try:
400 s = dict_sequences[sequence_name]
401 block_sequences.append(s)
402 except:
403 raise Exception("Sequence name %s from block data is not in alignment sequences!" % sequence_name)
404 exit()
406 block_columns = []
407 for class_id in set_class_ids:
408 column_number = self.columns[class_id]
409 try:
410 block_columns.append(dict_columns[column_number])
411 except:
412 raise Exception("Column number %s from block data is not in alignment columns numbers!" % column_number)
413 exit()
415 block = protein.Block.from_alignment(alignment, sequences = block_sequences, columns = block_columns)
416 blocks.append(block)
417 return blocks
419 #################################################################
420 @staticmethod
421 def case_homology(in_file_name,out_file_name, case = False, format = 'fasta'):
422 """ Makes homology file from input alignment
423 case = True => Upper letters in a column are in one class, each lower letter in separate class
424 case = False => all letter in a column are in one class
426 RETURN number of classes
427 """
428 try:
429 f = open(in_file_name)
430 except:
431 raise Exception("Failed to open file %s" % in_file_name)
432 exit()
433 try:
434 alignment = protein.Alignment().append_file(f, format=format)
435 f.close()
436 except:
437 raise Exception("Failed to create alignment from file %s, format %s!" % (in_file_name,format) )
438 exit()
439 try:
440 g = open(out_file_name, 'w')
441 except:
442 raise Exception("Failed to open output file %s!" % out_file_name)
443 exit()
445 # MARKUPING
446 alignment.add_markup('number')
448 for sequence in alignment.sequences:
449 sequence.add_markup('number')
450 if case and format != "markup":
451 sequence.add_markup('case')
454 #letters = ''.join(v[0] for v in seq.markups['case'].as_list())
456 # WRITING CLASSES
457 next_class_id = 1
459 for column in alignment.columns:
460 column_number = alignment.markups['number'][column]
461 column_class = []
463 for sequence in column:
464 monomer = column[sequence]
465 monomer_id = (sequence.name, monomer.number)
466 if case:
467 if monomer.case == "lower":
468 MonomerHomology.write_monomer(g,monomer_id,next_class_id,column_number)
469 next_class_id += 1
470 continue
471 column_class.append(monomer_id)
473 if len(column_class) > 0:
474 MonomerHomology.write_class(g,column_class,next_class_id, column_number)
475 next_class_id += 1
477 g.close()
478 return next_class_id - 1
480 ################################################################