allpy
changeset 694:0e41dc9c60bc
homology endowed with column_number in all methods and passed through tests
author | Andrei <aba@belozersky.msu.ru> |
---|---|
date | Tue, 05 Jul 2011 21:54:10 +0400 |
parents | 2ac0045bf2d4 |
children | fd3481013cd5 |
files | allpy/homology.py |
diffstat | 1 files changed, 61 insertions(+), 18 deletions(-) [+] |
line diff
1.1 --- a/allpy/homology.py Mon Jul 04 22:38:59 2011 +0400 1.2 +++ b/allpy/homology.py Tue Jul 05 21:54:10 2011 +0400 1.3 @@ -12,6 +12,10 @@ 1.4 monomer_ids = {monomer_id:class_id} 1.5 classes = {class_id:monomer_id) 1.6 monomer_id = (sequence_name, monomer_number) 1.7 + class_id = string 1.8 + columns = {class_id:column_number} optional 1.9 + blocks = [ (block_number, class_id, column_number), ...] 1.10 + 1.11 1.12 METHODS: 1.13 + .read(file_name) 1.14 @@ -23,10 +27,12 @@ 1.15 def __init__(self, next_class_id = 1): 1.16 self.classes = {} 1.17 self.monomer_ids = {} 1.18 + self.columns = {} 1.19 + self.blocks = [] 1.20 self.next_class_id = next_class_id 1.21 1.22 1.23 - def read(self, file_name): 1.24 + def read(self, file_name, columns = False): 1.25 """ 1.26 Residue_homology file format is as follows 1.27 One line corresponds to one monomer. 1.28 @@ -35,6 +41,8 @@ 1.29 - homology_class_id (string) arbitrary 1.30 - sequence_id (string) sequence.name is used as sequence id 1.31 - monomer_number (integer) number in the sequence, 1, 2, ... 1.32 + If columns: 1.33 + - column_number 1.34 Everything from "#" to the line end is ignored 1.35 Fields are delimited by white spaces 1.36 Empty lines (only white speces in the line) are skipped 1.37 @@ -62,16 +70,23 @@ 1.38 class_id = line_split[0] 1.39 sequence_id = line_split[1] 1.40 monomer_number = int(line_split[2]) 1.41 + if columns: 1.42 + column_number = int(line_split[3]) 1.43 except: 1.44 raise Exception("Wrong format in line %s of file %s! Line skipped" % (line, file_name)) 1.45 continue 1.46 1.47 self.monomer_ids[(sequence_id,monomer_number)] = class_id 1.48 1.49 - if self.classes.has_key(class_id): 1.50 + if class_id in self.classes: 1.51 self.classes[class_id].append( (sequence_id,monomer_number) ) 1.52 else: 1.53 self.classes[class_id] = [(sequence_id,monomer_number)] 1.54 + 1.55 + if columns: 1.56 + if not (class_id in self.columns): 1.57 + self.columns[class_id] = column_number 1.58 + 1.59 monomers_count += 1 1.60 f.close() 1.61 return monomers_count 1.62 @@ -141,33 +156,39 @@ 1.63 return (mistakes_two_vs_one, mistakes_one_vs_two) 1.64 1.65 @staticmethod 1.66 - def write_monomer(file,monomer_id,class_id): 1.67 + def write_monomer(file,monomer_id,class_id, column_number = False): 1.68 """ Write a line "class_id sequence_id monome number \n" into file 1.69 """ 1.70 if len(monomer_id) != 2: 1.71 raise Exception("wrong parameters given for Residue_homology.write_monomer: len(monomer_id) is not 2!") 1.72 exit() 1.73 try: 1.74 - file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) ) 1.75 + if column_number: 1.76 + file.write("%s\t%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1], column_number)) 1.77 + else: 1.78 + file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) ) 1.79 + 1.80 except: 1.81 raise Exception("Failed to write monomer into file!") 1.82 exit() 1.83 return None 1.84 1.85 @staticmethod 1.86 - def write_class(file,monomer_ids,class_id): 1.87 + def write_class(file,monomer_ids,class_id, column_number = False): 1.88 """ Writes list of monomer_ids forming one homology_class 1.89 """ 1.90 for monomer_id in monomer_ids: 1.91 - ResidueHomology.write_monomer(file,monomer_id,class_id) 1.92 + ResidueHomology.write_monomer(file,monomer_id,class_id, column_number = column_number) 1.93 1.94 #######################3# 1.95 - def write_block(self,file,block): 1.96 + def write_block(self,file,block,markedup = False): 1.97 """ Writes each block column into as one homology class 1.98 WARRNINGS: 1.99 (1) method works in object of class ResidueHomology 1.100 (2) all sequences MUST BE MARKED UP with resudue NUMBERS! 1.101 (3) automatic class ids (actually, numbers) are assigned to homology classes 1.102 + (4) if markedup, then block MUST be marked up by column numbers 1.103 + 1.104 Block may contain "gaps" 1.105 """ 1.106 for column in block.columns: 1.107 @@ -175,6 +196,11 @@ 1.108 if len(column) == 0: 1.109 continue 1.110 1.111 + if markedup: 1.112 + column_number = block.markups['number'][column] 1.113 + else: 1.114 + column_number = False 1.115 + 1.116 for sequence in block.sequences: 1.117 1.118 if sequence in column: 1.119 @@ -184,43 +210,56 @@ 1.120 except: 1.121 raise Exception("Monomer has no number! Sequences must be marked up by numbers before .writ_block") 1.122 exit() 1.123 - ResidueHomology.write_monomer(file,monomer_id,self.next_class_id) 1.124 + ResidueHomology.write_monomer(file,monomer_id,self.next_class_id, column_number = column_number) 1.125 1.126 self.next_class_id +=1 1.127 1.128 1.129 #######################3# 1.130 def _sequences_of_class(self,monomer_ids): 1.131 + """ RETURN set of sequences from a list of monomer_ids 1.132 + """ 1.133 set_sequences = set() 1.134 for x in monomer_ids: 1.135 set_sequences.add(x[0]) 1.136 return set_sequences 1.137 1.138 + def _add_to_block(self, i, block_number, class_id, column_number): 1.139 + """ Add an entry to a class_ids 1.140 + """ 1.141 + self.blocks[i] = (block_number, class_id, column_number) 1.142 1.143 1.144 def highest_blocks(self): 1.145 """ Makes blocks of maximal possible heights from residue homology classes 1.146 block = set of homology classes with the same set of sequences 1.147 - RETURN: [(block_number,class_id), ...] 1.148 + block MUST BE marked up by column numbers 1.149 + 1.150 + RETURN: self.blocks 1.151 1.152 """ 1.153 - class_ids = list(self.classes) 1.154 - class_ids.sort() 1.155 + self.blocks = list(self.classes) 1.156 + self.blocks.sort() 1.157 block_number = 0 1.158 1.159 - for i in range(len(class_ids)): 1.160 - class_id_1 = class_ids[i] 1.161 + for i in range(len(self.blocks)): 1.162 + class_id_1 = self.blocks[i] 1.163 1.164 if type(class_id_1) == type(tuple()): 1.165 continue 1.166 1.167 block_number += 1 1.168 - class_ids[i] = (block_number, class_id_1) 1.169 + try: 1.170 + column_number = self.columns[class_id_1] 1.171 + except: 1.172 + raise Exception("Fail to get column_number! Homology classes must be marked to creats hiest blocks") 1.173 + 1.174 + self._add_to_block(i, block_number, class_id_1, column_number) 1.175 1.176 set_sequences_1 = self._sequences_of_class(self.classes[class_id_1]) 1.177 1.178 - for j in range(i+1,len(class_ids)): 1.179 - class_id_2 = class_ids[j] 1.180 + for j in range(i+1,len(self.blocks)): 1.181 + class_id_2 = self.blocks[j] 1.182 1.183 if type(class_id_2) == type(tuple()): 1.184 continue 1.185 @@ -229,8 +268,12 @@ 1.186 1.187 1.188 if set_sequences_2 == set_sequences_1: 1.189 - class_ids[j] = (block_number,class_id_2) 1.190 + try: 1.191 + column_number = self.columns[class_id_2] 1.192 + except: 1.193 + raise Exception("Fail to get column_number! Homology classes must be marked to creats hiest blocks") 1.194 + self._add_to_block(j, block_number, class_id_2, column_number) 1.195 1.196 - return class_ids 1.197 + return self.blocks 1.198 1.199