allpy

changeset 677:9792df9d3d68
Residue homology classes added (module allpy/homology)
author: Andrei <aba@belozersky.msu.ru>
date: Fri, 01 Jul 2011 19:48:54 +0400
parents: 3bdcee61356d
children: 4cf91416f524
files: allpy/homology.py
diffstat: 1 files changed, 156 insertions(+), 0 deletions(-) [+]
[-]

allpy/homology.py 156 allpy/homology.py 156
allpy/homology.py 156
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/allpy/homology.py	Fri Jul 01 19:48:54 2011 +0400
     1.3 @@ -0,0 +1,156 @@
     1.4 +class  ResidueHomology(object):
     1.5 +    """ Essentially, alignment is a set of residue homology classes
     1.6 +    Object of type ResidueHomology desribes an alignment  in terms of residue homology 
     1.7 +    Currently, identifies of monomers and residue homology classes are used 
     1.8 +
     1.9 +    Each monomer of any sequence from an alignment must be contained in a homology class
    1.10 +
    1.11 +    Homology of nonequal residues of the same sequence is allowed (formally), 
    1.12 +        but there is no implemented methods to deal with 
    1.13 +
    1.14 +    DATA:
    1.15 +        monomer_ids = {monomer_id:class_id}
    1.16 +        classes     = {class_id:monomer_id)
    1.17 +            monomer_id = (sequence_name, monomer_number) 
    1.18 +
    1.19 +    METHODS:
    1.20 +       + .read(file_name) 
    1.21 +       + .wright_monomer(file,monomer_id,residue_homology_class_id)
    1.22 +       + .write_class(file,list_of_monomer_ids,residue_homology_class_id)
    1.23 +       + .write_block(file, block)
    1.24 +       + .compare_with(class)
    1.25 +    """
    1.26 +    def __init__(self):
    1.27 +         self.classes = {}
    1.28 +         self.monomer_ids = {}
    1.29 +         self.next_class_id = 1
    1.30 +
    1.31 +
    1.32 +    def read(self, file_name):
    1.33 +        """
    1.34 +        Residue_homology file format is as follows
    1.35 +           One line corresponds to one monomer. 
    1.36 +               Thus, number of lines with information is equal to the sum of all sequences lengths
    1.37 +           Three obligatory fields are in each line:
    1.38 +               - homology_class_id   (string)  arbitrary
    1.39 +               - sequence_id         (string)  sequence.name is used as sequence id    
    1.40 +               - monomer_number      (integer) number in the sequence, 1, 2, ...
    1.41 +        Everything from "#" to the line end is ignored
    1.42 +        Fields are delimited by white spaces
    1.43 +        Empty lines (only white speces in the line) are skipped
    1.44 +
    1.45 +        RETURN total number of monomers
    1.46 +        """ 
    1.47 +        try:
    1.48 +            f = open(file_name)
    1.49 +        except:
    1.50 +            raise Exception("Failed to open file %s" % file_name) 
    1.51 +            exit()
    1.52 +
    1.53 +        monomers_count = 0
    1.54 +
    1.55 +        for line in file:
    1.56 +            line = line.strip()
    1.57 +            if len(line) == 0:
    1.58 +                continue
    1.59 +            line = line.split('#')[0]
    1.60 +            if len(line) == 0:
    1.61 +                continue
    1.62 +            line_split = line.split()
    1.63 +            try:
    1.64 +                class_id = line[0]
    1.65 +                sequence_id = line[1]
    1.66 +                monomer_number = int(line[2])
    1.67 +            except:
    1.68 +                raise Exception("Wrong format in line %s of file %s! Line skipped" % (line, file_name))
    1.69 +
    1.70 +            self.monomer_ids[(sequence_id,monomer_number)] = class_id 
    1.71 +
    1.72 +            if self.classes.has_key(class_id):
    1.73 +                self.classes[class_id] = self.classes[class_id].append( (sequence_id,monomer_number) )
    1.74 +            else:
    1.75 +                self.classes[class_id] = [(sequence_id,monomer_number)]
    1.76 +           monomers_count += 1
    1.77 +        f.close()
    1.78 +        return monomers_count 
    1.79 +
    1.80 +    @staticmethod
    1.81 +    def  two_vs_one(classes_one, classes_two):
    1.82 +        """ Computes weighted average number of mistakes of classes_two with respect to classes_one 
    1.83 +        May be used as external function
    1.84 +        See compare_with help for more details
    1.85 +        Actually, sizes of intersections are computed and store in class_one_splits but not used
    1.86 +        """
    1.87 +        classes_one_size = len(classes_one.monomer_ids)
    1.88 +        classes_two_size = len(classes_two.monomer_ids)
    1.89 +
    1.90 +        if classes_one_size !=  classes_two_size:
    1.91 +            raise Exception("Comparing sets of monomers are not equal! Check input data.")
    1.92 +            exit()           
    1.93 +
    1.94 +        weighted_mistakes = 0 
    1.95 +
    1.96 +        for class_one in classes_one.classes:
    1.97 +             class_one_size = len(classes_one.classes[class_one])
    1.98 +             class_one_splits = {}
    1.99 +
   1.100 +             for monomer_id in class_one:
   1.101 +                  try:
   1.102 +                      class_two_id = classes_two.monomer_ids[monomer_id]
   1.103 +                  except:
   1.104 +                      raise Exception("Comparing sets of monomers are not equal! Check input data.")
   1.105 +                      exit()
   1.106 +                  if class_one_split.has_key[class_two_id]:
   1.107 +                      class_one_split[class_two_id] = class_one_split[class_two_id] + 1
   1.108 +                  else:
   1.109 +                      class_one_split[class_two_id] = 1
   1.110 +             splitting_number = len(class_one_splits)
   1.111 +             weighted_mistakes += (splitting_number-1)* (class_one_size/classes_one_size)
   1.112 +
   1.113 +        return weighted_mistakes
   1.114 +
   1.115 +    
   1.116 +    def compare_with(other): 
   1.117 +        """Compares two residue homology classes made on the same set of sequences: self and other 
   1.118 +        Both sets of classes MUST contain isomorfic sets of monomer_ids 
   1.119 +       
   1.120 +        RETURN two weighted averege number of mistakes: other vs self and self vs other
   1.121 +            Number of mistakes other vs self in a homology class of self is
   1.122 +                number of classes of other intersecting with the class of self minus one
   1.123 +                In other words, one mistake of other is splitting a class of self 
   1.124 +                    (believed to be true residue homology class) into two classes
   1.125 +           Wieght of a class of self is size of the class divided by total number of monomeres.
   1.126 +       
   1.127 +        Different mesures and outputs may be implemented later.
   1.128 +        """
   1.129 +
   1.130 +        mistakes_two_vs_one = two_vs_one(self, other) # mistakes are splitting self by intersections with other
   1.131 +        mistakes_one_vs_two = two_vs_one(other, self) # mistakes are splitting other by intersections with self
   1.132 +
   1.133 +        return (mistakes_two_vs_one, mistakes_one_vs_two)
   1.134 +
   1.135 +    @staticmethod
   1.136 +    def write_monomer(file,monomer_id,class_id):
   1.137 +        """ Write a line "class_id sequence_id  monome number \n"  into file
   1.138 +        """
   1.139 +        if len(monomer_id) != 2:
   1.140 +            raise Exception("wrong parameters given for Residue_homology.write_monomer: len(monomer_id) is not 2!") 
   1.141 +            exit()
   1.142 +        try:
   1.143 +            file.write("%s\t%s\t%s\n" % (class_id,monomer_id[0], monomer_id[1]) ) 
   1.144 +        except:
   1.145 +            raise Exception("Failed to write monomer into file!") 
   1.146 +            exit()
   1.147 +        return None
   1.148 +
   1.149 +    @staticmethod
   1.150 +    def write_monomers(file,monomer_ids,class_id):
   1.151 +    """ Writes list of monomer_ids forming one homology_class
   1.152 +    """
   1.153 +        for monomer_id in list_of_monomer_ids:
   1.154 +            self.write(file,monomer_id,residue_homology_class_id)
   1.155 +     
   1.156 +    # def write_block(file,block):    
   1.157 +
   1.158 +
   1.159 +