allpy
changeset 677:9792df9d3d68
Residue homology classes added (module allpy/homology)
author | Andrei <aba@belozersky.msu.ru> |
---|---|
date | Fri, 01 Jul 2011 19:48:54 +0400 |
parents | 3bdcee61356d |
children | 4cf91416f524 |
files | allpy/homology.py |
diffstat | 1 files changed, 156 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/allpy/homology.py Fri Jul 01 19:48:54 2011 +0400 1.3 @@ -0,0 +1,156 @@ 1.4 +class ResidueHomology(object): 1.5 + """ Essentially, alignment is a set of residue homology classes 1.6 + Object of type ResidueHomology desribes an alignment in terms of residue homology 1.7 + Currently, identifies of monomers and residue homology classes are used 1.8 + 1.9 + Each monomer of any sequence from an alignment must be contained in a homology class 1.10 + 1.11 + Homology of nonequal residues of the same sequence is allowed (formally), 1.12 + but there is no implemented methods to deal with 1.13 + 1.14 + DATA: 1.15 + monomer_ids = {monomer_id:class_id} 1.16 + classes = {class_id:monomer_id) 1.17 + monomer_id = (sequence_name, monomer_number) 1.18 + 1.19 + METHODS: 1.20 + + .read(file_name) 1.21 + + .wright_monomer(file,monomer_id,residue_homology_class_id) 1.22 + + .write_class(file,list_of_monomer_ids,residue_homology_class_id) 1.23 + + .write_block(file, block) 1.24 + + .compare_with(class) 1.25 + """ 1.26 + def __init__(self): 1.27 + self.classes = {} 1.28 + self.monomer_ids = {} 1.29 + self.next_class_id = 1 1.30 + 1.31 + 1.32 + def read(self, file_name): 1.33 + """ 1.34 + Residue_homology file format is as follows 1.35 + One line corresponds to one monomer. 1.36 + Thus, number of lines with information is equal to the sum of all sequences lengths 1.37 + Three obligatory fields are in each line: 1.38 + - homology_class_id (string) arbitrary 1.39 + - sequence_id (string) sequence.name is used as sequence id 1.40 + - monomer_number (integer) number in the sequence, 1, 2, ... 1.41 + Everything from "#" to the line end is ignored 1.42 + Fields are delimited by white spaces 1.43 + Empty lines (only white speces in the line) are skipped 1.44 + 1.45 + RETURN total number of monomers 1.46 + """ 1.47 + try: 1.48 + f = open(file_name) 1.49 + except: 1.50 + raise Exception("Failed to open file %s" % file_name) 1.51 + exit() 1.52 + 1.53 + monomers_count = 0 1.54 + 1.55 + for line in file: 1.56 + line = line.strip() 1.57 + if len(line) == 0: 1.58 + continue 1.59 + line = line.split('#')[0] 1.60 + if len(line) == 0: 1.61 + continue 1.62 + line_split = line.split() 1.63 + try: 1.64 + class_id = line[0] 1.65 + sequence_id = line[1] 1.66 + monomer_number = int(line[2]) 1.67 + except: 1.68 + raise Exception("Wrong format in line %s of file %s! Line skipped" % (line, file_name)) 1.69 + 1.70 + self.monomer_ids[(sequence_id,monomer_number)] = class_id 1.71 + 1.72 + if self.classes.has_key(class_id): 1.73 + self.classes[class_id] = self.classes[class_id].append( (sequence_id,monomer_number) ) 1.74 + else: 1.75 + self.classes[class_id] = [(sequence_id,monomer_number)] 1.76 + monomers_count += 1 1.77 + f.close() 1.78 + return monomers_count 1.79 + 1.80 + @staticmethod 1.81 + def two_vs_one(classes_one, classes_two): 1.82 + """ Computes weighted average number of mistakes of classes_two with respect to classes_one 1.83 + May be used as external function 1.84 + See compare_with help for more details 1.85 + Actually, sizes of intersections are computed and store in class_one_splits but not used 1.86 + """ 1.87 + classes_one_size = len(classes_one.monomer_ids) 1.88 + classes_two_size = len(classes_two.monomer_ids) 1.89 + 1.90 + if classes_one_size != classes_two_size: 1.91 + raise Exception("Comparing sets of monomers are not equal! Check input data.") 1.92 + exit() 1.93 + 1.94 + weighted_mistakes = 0 1.95 + 1.96 + for class_one in classes_one.classes: 1.97 + class_one_size = len(classes_one.classes[class_one]) 1.98 + class_one_splits = {} 1.99 + 1.100 + for monomer_id in class_one: 1.101 + try: 1.102 + class_two_id = classes_two.monomer_ids[monomer_id] 1.103 + except: 1.104 + raise Exception("Comparing sets of monomers are not equal! Check input data.") 1.105 + exit() 1.106 + if class_one_split.has_key[class_two_id]: 1.107 + class_one_split[class_two_id] = class_one_split[class_two_id] + 1 1.108 + else: 1.109 + class_one_split[class_two_id] = 1 1.110 + splitting_number = len(class_one_splits) 1.111 + weighted_mistakes += (splitting_number-1)* (class_one_size/classes_one_size) 1.112 + 1.113 + return weighted_mistakes 1.114 + 1.115 + 1.116 + def compare_with(other): 1.117 + """Compares two residue homology classes made on the same set of sequences: self and other 1.118 + Both sets of classes MUST contain isomorfic sets of monomer_ids 1.119 + 1.120 + RETURN two weighted averege number of mistakes: other vs self and self vs other 1.121 + Number of mistakes other vs self in a homology class of self is 1.122 + number of classes of other intersecting with the class of self minus one 1.123 + In other words, one mistake of other is splitting a class of self 1.124 + (believed to be true residue homology class) into two classes 1.125 + Wieght of a class of self is size of the class divided by total number of monomeres. 1.126 + 1.127 + Different mesures and outputs may be implemented later. 1.128 + """ 1.129 + 1.130 + mistakes_two_vs_one = two_vs_one(self, other) # mistakes are splitting self by intersections with other 1.131 + mistakes_one_vs_two = two_vs_one(other, self) # mistakes are splitting other by intersections with self 1.132 + 1.133 + return (mistakes_two_vs_one, mistakes_one_vs_two) 1.134 + 1.135 + @staticmethod 1.136 + def write_monomer(file,monomer_id,class_id): 1.137 + """ Write a line "class_id sequence_id monome number \n" into file 1.138 + """ 1.139 + if len(monomer_id) != 2: 1.140 + raise Exception("wrong parameters given for Residue_homology.write_monomer: len(monomer_id) is not 2!") 1.141 + exit() 1.142 + try: 1.143 + file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) ) 1.144 + except: 1.145 + raise Exception("Failed to write monomer into file!") 1.146 + exit() 1.147 + return None 1.148 + 1.149 + @staticmethod 1.150 + def write_monomers(file,monomer_ids,class_id): 1.151 + """ Writes list of monomer_ids forming one homology_class 1.152 + """ 1.153 + for monomer_id in list_of_monomer_ids: 1.154 + self.write(file,monomer_id,residue_homology_class_id) 1.155 + 1.156 + # def write_block(file,block): 1.157 + 1.158 + 1.159 +