Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/9792df9d3d68
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 08:09:41 2012
Кодировка:

# HG changeset patch
# User Andrei
# Date 1309535334 -14400
# Node ID 9792df9d3d68f0ce47a3dec07cb8da141fd5551f
# Parent 3bdcee61356d4a9ef3e7676bdb29ae6de5228e50
Residue homology classes added (module allpy/homology)

diff -r 3bdcee61356d -r 9792df9d3d68 allpy/homology.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/allpy/homology.py Fri Jul 01 19:48:54 2011 +0400
@@ -0,0 +1,156 @@
+class ResidueHomology(object):
+ """ Essentially, alignment is a set of residue homology classes
+ Object of type ResidueHomology desribes an alignment in terms of residue homology
+ Currently, identifies of monomers and residue homology classes are used
+
+ Each monomer of any sequence from an alignment must be contained in a homology class
+
+ Homology of nonequal residues of the same sequence is allowed (formally),
+ but there is no implemented methods to deal with
+
+ DATA:
+ monomer_ids = {monomer_id:class_id}
+ classes = {class_id:monomer_id)
+ monomer_id = (sequence_name, monomer_number)
+
+ METHODS:
+ + .read(file_name)
+ + .wright_monomer(file,monomer_id,residue_homology_class_id)
+ + .write_class(file,list_of_monomer_ids,residue_homology_class_id)
+ + .write_block(file, block)
+ + .compare_with(class)
+ """
+ def __init__(self):
+ self.classes = {}
+ self.monomer_ids = {}
+ self.next_class_id = 1
+
+
+ def read(self, file_name):
+ """
+ Residue_homology file format is as follows
+ One line corresponds to one monomer.
+ Thus, number of lines with information is equal to the sum of all sequences lengths
+ Three obligatory fields are in each line:
+ - homology_class_id (string) arbitrary
+ - sequence_id (string) sequence.name is used as sequence id
+ - monomer_number (integer) number in the sequence, 1, 2, ...
+ Everything from "#" to the line end is ignored
+ Fields are delimited by white spaces
+ Empty lines (only white speces in the line) are skipped
+
+ RETURN total number of monomers
+ """
+ try:
+ f = open(file_name)
+ except:
+ raise Exception("Failed to open file %s" % file_name)
+ exit()
+
+ monomers_count = 0
+
+ for line in file:
+ line = line.strip()
+ if len(line) == 0:
+ continue
+ line = line.split('#')[0]
+ if len(line) == 0:
+ continue
+ line_split = line.split()
+ try:
+ class_id = line[0]
+ sequence_id = line[1]
+ monomer_number = int(line[2])
+ except:
+ raise Exception("Wrong format in line %s of file %s! Line skipped" % (line, file_name))
+
+ self.monomer_ids[(sequence_id,monomer_number)] = class_id
+
+ if self.classes.has_key(class_id):
+ self.classes[class_id] = self.classes[class_id].append( (sequence_id,monomer_number) )
+ else:
+ self.classes[class_id] = [(sequence_id,monomer_number)]
+ monomers_count += 1
+ f.close()
+ return monomers_count
+
+ @staticmethod
+ def two_vs_one(classes_one, classes_two):
+ """ Computes weighted average number of mistakes of classes_two with respect to classes_one
+ May be used as external function
+ See compare_with help for more details
+ Actually, sizes of intersections are computed and store in class_one_splits but not used
+ """
+ classes_one_size = len(classes_one.monomer_ids)
+ classes_two_size = len(classes_two.monomer_ids)
+
+ if classes_one_size != classes_two_size:
+ raise Exception("Comparing sets of monomers are not equal! Check input data.")
+ exit()
+
+ weighted_mistakes = 0
+
+ for class_one in classes_one.classes:
+ class_one_size = len(classes_one.classes[class_one])
+ class_one_splits = {}
+
+ for monomer_id in class_one:
+ try:
+ class_two_id = classes_two.monomer_ids[monomer_id]
+ except:
+ raise Exception("Comparing sets of monomers are not equal! Check input data.")
+ exit()
+ if class_one_split.has_key[class_two_id]:
+ class_one_split[class_two_id] = class_one_split[class_two_id] + 1
+ else:
+ class_one_split[class_two_id] = 1
+ splitting_number = len(class_one_splits)
+ weighted_mistakes += (splitting_number-1)* (class_one_size/classes_one_size)
+
+ return weighted_mistakes
+
+
+ def compare_with(other):
+ """Compares two residue homology classes made on the same set of sequences: self and other
+ Both sets of classes MUST contain isomorfic sets of monomer_ids
+
+ RETURN two weighted averege number of mistakes: other vs self and self vs other
+ Number of mistakes other vs self in a homology class of self is
+ number of classes of other intersecting with the class of self minus one
+ In other words, one mistake of other is splitting a class of self
+ (believed to be true residue homology class) into two classes
+ Wieght of a class of self is size of the class divided by total number of monomeres.
+
+ Different mesures and outputs may be implemented later.
+ """
+
+ mistakes_two_vs_one = two_vs_one(self, other) # mistakes are splitting self by intersections with other
+ mistakes_one_vs_two = two_vs_one(other, self) # mistakes are splitting other by intersections with self
+
+ return (mistakes_two_vs_one, mistakes_one_vs_two)
+
+ @staticmethod
+ def write_monomer(file,monomer_id,class_id):
+ """ Write a line "class_id sequence_id monome number \n" into file
+ """
+ if len(monomer_id) != 2:
+ raise Exception("wrong parameters given for Residue_homology.write_monomer: len(monomer_id) is not 2!")
+ exit()
+ try:
+ file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) )
+ except:
+ raise Exception("Failed to write monomer into file!")
+ exit()
+ return None
+
+ @staticmethod
+ def write_monomers(file,monomer_ids,class_id):
+ """ Writes list of monomer_ids forming one homology_class
+ """
+ for monomer_id in list_of_monomer_ids:
+ self.write(file,monomer_id,residue_homology_class_id)
+
+ # def write_block(file,block):
+
+
+