allpy

changeset 669:2f016bfda114
Massive changes to my program
author: Boris Burkov <BurkovBA@gmail.com>
date: Fri, 01 Jul 2011 09:52:49 +0400
parents: c32e728b98d7
children: 1a4a04829ba6
files: allpy/base.py sequence_based_blocks_search/blocks_finder.py
diffstat: 2 files changed, 99 insertions(+), 36 deletions(-) [+]
[-]

allpy/base.py 15

sequence_based_blocks_search/blocks_finder.py 120 allpy/base.py 15 sequence_based_blocks_search/blocks_finder.py 120
allpy/base.py 15
sequence_based_blocks_search/blocks_finder.py 120
     1.1 --- a/allpy/base.py	Tue Jun 14 13:31:18 2011 +0400
     1.2 +++ b/allpy/base.py	Fri Jul 01 09:52:49 2011 +0400
     1.3 @@ -89,6 +89,10 @@
     1.4  
     1.5      def __eq__(self, other):
     1.6          """Monomers within same monomer type are compared by code1."""
     1.7 +        #import traceback
     1.8 +        #for entry in traceback.extract_stack():
     1.9 +        #    if "remove_contained_preblocks" in entry[2]:
    1.10 +        #        print traceback.extract_stack()
    1.11          if not other:
    1.12              return False
    1.13          assert self.type == other.type
    1.14 @@ -151,6 +155,7 @@
    1.15          """Hash sequence by identity."""
    1.16          return id(self)
    1.17  
    1.18 +
    1.19  class Alignment(object):
    1.20      """Alignment. It is a list of Columns."""
    1.21  
    1.22 @@ -451,8 +456,16 @@
    1.23          """Return hash by identity."""
    1.24          return id(self)
    1.25  
    1.26 +    def MyIndex(self):
    1.27 +        for index, column in enumerate(self.alignment.columns):
    1.28 +            if column is self: return index
    1.29 +        raise ValueException
    1.30 +
    1.31      def __repr__(self):
    1.32 -        return "<Column %s>"%(str(self.alignment.columns.index(self)))
    1.33 +    #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES
    1.34 +        return "<Column %s>"%(str(self.MyIndex()))
    1.35 +
    1.36 +
    1.37  
    1.38  
    1.39  class Block(Alignment):

     2.1 --- a/sequence_based_blocks_search/blocks_finder.py	Tue Jun 14 13:31:18 2011 +0400
     2.2 +++ b/sequence_based_blocks_search/blocks_finder.py	Fri Jul 01 09:52:49 2011 +0400
     2.3 @@ -101,18 +101,24 @@
     2.4          for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1)
     2.5          candidate_preblocks+=addition_to_candidate_preblocks
     2.6          #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have lower scores
     2.7 -        print index_of_column, len(candidate_preblocks),
     2.8 +        #print index_of_column, len(candidate_preblocks),
     2.9 +        print index_of_column, len(candidate_preblocks)        
    2.10          if len(candidate_preblocks)>1000:
    2.11              import cProfile
    2.12              cProfile.runctx('remove_contained_preblocks(candidate_preblocks)',globals(),locals())
    2.13          else:
    2.14              remove_contained_preblocks(candidate_preblocks)
    2.15 -        print len(candidate_preblocks)
    2.16 -        remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10)
    2.17 +        #print len(candidate_preblocks)
    2.18 +        #remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10)
    2.19          #for those preblocks ending with a conserved position and having the score above threshold, add new links
    2.20 +        new_candidate_preblocks=[]
    2.21 +        overlapping_preblocks=[]
    2.22          for preblock in candidate_preblocks:
    2.23              if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
    2.24                  create_links(preblock, alignment) #links are not cliques, just connected components 
    2.25 +                overlapping_preblocks+=accept_and_return_overlapping_preblocks_before(preblock, alignment, candidate_preblocks)
    2.26 +            if preblock not in overlapping_preblocks: new_candidate_preblocks.append(preblock)
    2.27 +        candidate_preblocks = new_candidate_preblocks
    2.28  
    2.29  
    2.30  def split_preblocks_by_presense_of_gaps(preblocks, column):
    2.31 @@ -146,11 +152,21 @@
    2.32      output=[]
    2.33      counter = len(preblocks)-1
    2.34      while counter!=-1:
    2.35 -        for preblock in preblocks:
    2.36 -            try:
    2.37 -                if preblock is not preblocks[counter]:
    2.38 +        for index_of_preblock, preblock in enumerate(preblocks):
    2.39 +            if index_of_preblock != counter:
    2.40 +                try:
    2.41                      for sequence in preblocks[counter].sequences:
    2.42 -                        if sequence not in preblock.sequences: raise BreakCycleException
    2.43 +                        sequence_present=False
    2.44 +                        for another_sequence in preblock.sequences:
    2.45 +                            if sequence is another_sequence:
    2.46 +                                sequence_present=True
    2.47 +                                break
    2.48 +                        if not sequence_present: raise BreakCycleException
    2.49 +#                    for sequence in preblocks[counter].sequences:
    2.50 +#                        if sequence not in preblock.sequences:
    2.51 +#                            raise BreakCycleException
    2.52 +
    2.53 +                        #if sequence not in preblock.sequences: raise BreakCycleException
    2.54                      #if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted.
    2.55                      if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\
    2.56                          and preblocks[counter].last_column_index<=preblock.last_column_index:
    2.57 @@ -163,8 +179,8 @@
    2.58                          and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score():
    2.59                          preblocks.remove(preblocks[counter])
    2.60                          break
    2.61 -            except BreakCycleException:
    2.62 -                pass
    2.63 +                except BreakCycleException:
    2.64 +                    pass
    2.65          counter-=1
    2.66  
    2.67  
    2.68 @@ -207,26 +223,60 @@
    2.69  
    2.70  
    2.71  def create_links(preblock, alignment):
    2.72 -    """links are lists of sequences"""
    2.73 +    """links are lists of sequences, we create N links rather than N^2 - no cliques!"""
    2.74      for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]:
    2.75 -        if preblock.sequences[0] not in column: continue #ignore gap columns
    2.76 -        for sequence in preblock.sequences:
    2.77 -            if "links" not in column[sequence].__dict__:
    2.78 -                column[sequence].links=[]
    2.79 -            remaining_sequences = copy.copy(preblock.sequences)
    2.80 -            remaining_sequences.remove(sequence)
    2.81 -            for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
    2.82 -                if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
    2.83 -                if "links" not in column[iterated_sequence].__dict__:
    2.84 -                    column[iterated_sequence].links=[sequence]
    2.85 +        #any_sequence = preblock.sequences.pop()
    2.86 +        #preblock.sequences.add(any_sequence)
    2.87 +        any_sequence = preblock.sequences[0]
    2.88 +        if any_sequence not in column: continue #ignore gap columns
    2.89 +#        for sequence in preblock.sequences:
    2.90 +#            if "links" not in column[sequence].__dict__:
    2.91 +#                column[sequence].links=[]
    2.92 +#            remaining_sequences = copy.copy(preblock.sequences)
    2.93 +#            remaining_sequences.remove(sequence)
    2.94 +#            for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
    2.95 +#                if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
    2.96 +#                if "links" not in column[iterated_sequence].__dict__:
    2.97 +#                    column[iterated_sequence].links=[sequence]
    2.98 +#                else:
    2.99 +#                    if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
   2.100 +       
   2.101 +        if "links" not in column[any_sequence].__dict__:
   2.102 +            column[any_sequence].links=[]
   2.103 +        remaining_sequences = copy.copy(preblock.sequences)
   2.104 +        remaining_sequences.remove(any_sequence)
   2.105 +        for sequence in preblock.sequences:#links are bidirectional
   2.106 +            if sequence not in column[any_sequence].links: 
   2.107 +                column[any_sequence].links.append(sequence)
   2.108 +                if "links" not in column[sequence].__dict__:
   2.109 +                    column[sequence].links=[any_sequence]
   2.110                  else:
   2.111 -                    if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
   2.112 +                    if any_sequence not in column[sequence].links:
   2.113 +                        column[sequence].links.append(any_sequence)
   2.114 +
   2.115 +
   2.116 +
   2.117 +def accept_and_return_overlapping_preblocks_before(preblock, alignment, preblocks):
   2.118 +    def check_sublist(greater_list, smaller_list):
   2.119 +        for element in smaller_list:
   2.120 +            if element not in greater_list: return False
   2.121 +        return True
   2.122 +
   2.123 +    output=[]
   2.124 +    for current_preblock in preblocks:
   2.125 +        if current_preblock.last_column_index >= preblock.first_column_index and\
   2.126 +            current_preblock.last_column_index <= preblock.last_column_index and\
   2.127 +            check_sublist(preblock.sequences, current_preblock.sequences) and\
   2.128 +            current_preblock is not preblock:
   2.129 +            create_links(current_preblock, alignment)
   2.130 +            output.append(current_preblock)
   2.131 +    return output
   2.132  
   2.133  
   2.134  def mark_blocks(alignment):
   2.135      output=[]
   2.136      blocks_in_previous_column = []
   2.137 -    for column in alignment.columns:
   2.138 +    for index_of_column, column in enumerate(alignment.columns):
   2.139          blocks_in_this_column = []
   2.140          remaining_sequences=copy.copy(alignment.sequences)
   2.141          while remaining_sequences !=[]:
   2.142 @@ -247,31 +297,29 @@
   2.143              if block not in blocks_in_this_column:
   2.144                  output.append(block)
   2.145          blocks_in_previous_column = blocks_in_this_column
   2.146 +        #print index_of_column, [gblock.columns for gblock in blocks_in_this_column]
   2.147          if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column
   2.148      return output
   2.149                 
   2.150                  
   2.151  def find_connected_component(sequences, column):
   2.152      #find one sequence that has blocks
   2.153 +    queue=[]
   2.154      output=[]
   2.155      if sequences!=[]:
   2.156          for sequence in sequences:
   2.157              if sequence not in column: continue
   2.158              if "links" in column[sequence].__dict__:
   2.159 -                output.append(sequence)
   2.160 +                queue.append(sequence)
   2.161                  break
   2.162 -        if output == []: return output
   2.163 +        if queue == []: return output
   2.164          #find all sequences in the same connected component
   2.165 -        not_exhausted_flag=True
   2.166 -        sequences_copy=copy.copy(sequences)
   2.167 -        output_counter=0
   2.168 -        while not_exhausted_flag:
   2.169 -            not_exhausted_flag = False
   2.170 -            for sequence in column[output[output_counter]].links:
   2.171 -                if sequence not in output and sequence in sequences:
   2.172 -                    output.append(sequence)
   2.173 -                    output_counter+=1
   2.174 -                    not_exhausted_flag=True
   2.175 +        while len(queue)>0:
   2.176 +            sequence = queue.pop()
   2.177 +            output.append(sequence)
   2.178 +            for another_sequence in column[sequence].links:
   2.179 +                if another_sequence not in queue and another_sequence not in output and another_sequence in sequences:
   2.180 +                    queue.append(another_sequence)
   2.181      return output
   2.182                  
   2.183  
   2.184 @@ -281,6 +329,8 @@
   2.185              if "links" in monomer.__dict__:
   2.186                  del monomer.links
   2.187  
   2.188 -   
   2.189 +#[gblock for gblock in blocks_in_previous_column for gcolumn in gblock.columns if alignment.columns.index(gcolumn)==147]   
   2.190 +
   2.191 +
   2.192  if __name__== '__main__':
   2.193      main(open(sys.argv[1]))