allpy
changeset 669:2f016bfda114
Massive changes to my program
author | Boris Burkov <BurkovBA@gmail.com> |
---|---|
date | Fri, 01 Jul 2011 09:52:49 +0400 |
parents | c32e728b98d7 |
children | 1a4a04829ba6 |
files | allpy/base.py sequence_based_blocks_search/blocks_finder.py |
diffstat | 2 files changed, 99 insertions(+), 36 deletions(-) [+] |
line diff
1.1 --- a/allpy/base.py Tue Jun 14 13:31:18 2011 +0400 1.2 +++ b/allpy/base.py Fri Jul 01 09:52:49 2011 +0400 1.3 @@ -89,6 +89,10 @@ 1.4 1.5 def __eq__(self, other): 1.6 """Monomers within same monomer type are compared by code1.""" 1.7 + #import traceback 1.8 + #for entry in traceback.extract_stack(): 1.9 + # if "remove_contained_preblocks" in entry[2]: 1.10 + # print traceback.extract_stack() 1.11 if not other: 1.12 return False 1.13 assert self.type == other.type 1.14 @@ -151,6 +155,7 @@ 1.15 """Hash sequence by identity.""" 1.16 return id(self) 1.17 1.18 + 1.19 class Alignment(object): 1.20 """Alignment. It is a list of Columns.""" 1.21 1.22 @@ -451,8 +456,16 @@ 1.23 """Return hash by identity.""" 1.24 return id(self) 1.25 1.26 + def MyIndex(self): 1.27 + for index, column in enumerate(self.alignment.columns): 1.28 + if column is self: return index 1.29 + raise ValueException 1.30 + 1.31 def __repr__(self): 1.32 - return "<Column %s>"%(str(self.alignment.columns.index(self))) 1.33 + #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES 1.34 + return "<Column %s>"%(str(self.MyIndex())) 1.35 + 1.36 + 1.37 1.38 1.39 class Block(Alignment):
2.1 --- a/sequence_based_blocks_search/blocks_finder.py Tue Jun 14 13:31:18 2011 +0400 2.2 +++ b/sequence_based_blocks_search/blocks_finder.py Fri Jul 01 09:52:49 2011 +0400 2.3 @@ -101,18 +101,24 @@ 2.4 for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1) 2.5 candidate_preblocks+=addition_to_candidate_preblocks 2.6 #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have lower scores 2.7 - print index_of_column, len(candidate_preblocks), 2.8 + #print index_of_column, len(candidate_preblocks), 2.9 + print index_of_column, len(candidate_preblocks) 2.10 if len(candidate_preblocks)>1000: 2.11 import cProfile 2.12 cProfile.runctx('remove_contained_preblocks(candidate_preblocks)',globals(),locals()) 2.13 else: 2.14 remove_contained_preblocks(candidate_preblocks) 2.15 - print len(candidate_preblocks) 2.16 - remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10) 2.17 + #print len(candidate_preblocks) 2.18 + #remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10) 2.19 #for those preblocks ending with a conserved position and having the score above threshold, add new links 2.20 + new_candidate_preblocks=[] 2.21 + overlapping_preblocks=[] 2.22 for preblock in candidate_preblocks: 2.23 if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups): 2.24 create_links(preblock, alignment) #links are not cliques, just connected components 2.25 + overlapping_preblocks+=accept_and_return_overlapping_preblocks_before(preblock, alignment, candidate_preblocks) 2.26 + if preblock not in overlapping_preblocks: new_candidate_preblocks.append(preblock) 2.27 + candidate_preblocks = new_candidate_preblocks 2.28 2.29 2.30 def split_preblocks_by_presense_of_gaps(preblocks, column): 2.31 @@ -146,11 +152,21 @@ 2.32 output=[] 2.33 counter = len(preblocks)-1 2.34 while counter!=-1: 2.35 - for preblock in preblocks: 2.36 - try: 2.37 - if preblock is not preblocks[counter]: 2.38 + for index_of_preblock, preblock in enumerate(preblocks): 2.39 + if index_of_preblock != counter: 2.40 + try: 2.41 for sequence in preblocks[counter].sequences: 2.42 - if sequence not in preblock.sequences: raise BreakCycleException 2.43 + sequence_present=False 2.44 + for another_sequence in preblock.sequences: 2.45 + if sequence is another_sequence: 2.46 + sequence_present=True 2.47 + break 2.48 + if not sequence_present: raise BreakCycleException 2.49 +# for sequence in preblocks[counter].sequences: 2.50 +# if sequence not in preblock.sequences: 2.51 +# raise BreakCycleException 2.52 + 2.53 + #if sequence not in preblock.sequences: raise BreakCycleException 2.54 #if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted. 2.55 if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\ 2.56 and preblocks[counter].last_column_index<=preblock.last_column_index: 2.57 @@ -163,8 +179,8 @@ 2.58 and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score(): 2.59 preblocks.remove(preblocks[counter]) 2.60 break 2.61 - except BreakCycleException: 2.62 - pass 2.63 + except BreakCycleException: 2.64 + pass 2.65 counter-=1 2.66 2.67 2.68 @@ -207,26 +223,60 @@ 2.69 2.70 2.71 def create_links(preblock, alignment): 2.72 - """links are lists of sequences""" 2.73 + """links are lists of sequences, we create N links rather than N^2 - no cliques!""" 2.74 for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]: 2.75 - if preblock.sequences[0] not in column: continue #ignore gap columns 2.76 - for sequence in preblock.sequences: 2.77 - if "links" not in column[sequence].__dict__: 2.78 - column[sequence].links=[] 2.79 - remaining_sequences = copy.copy(preblock.sequences) 2.80 - remaining_sequences.remove(sequence) 2.81 - for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence 2.82 - if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence) 2.83 - if "links" not in column[iterated_sequence].__dict__: 2.84 - column[iterated_sequence].links=[sequence] 2.85 + #any_sequence = preblock.sequences.pop() 2.86 + #preblock.sequences.add(any_sequence) 2.87 + any_sequence = preblock.sequences[0] 2.88 + if any_sequence not in column: continue #ignore gap columns 2.89 +# for sequence in preblock.sequences: 2.90 +# if "links" not in column[sequence].__dict__: 2.91 +# column[sequence].links=[] 2.92 +# remaining_sequences = copy.copy(preblock.sequences) 2.93 +# remaining_sequences.remove(sequence) 2.94 +# for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence 2.95 +# if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence) 2.96 +# if "links" not in column[iterated_sequence].__dict__: 2.97 +# column[iterated_sequence].links=[sequence] 2.98 +# else: 2.99 +# if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence) 2.100 + 2.101 + if "links" not in column[any_sequence].__dict__: 2.102 + column[any_sequence].links=[] 2.103 + remaining_sequences = copy.copy(preblock.sequences) 2.104 + remaining_sequences.remove(any_sequence) 2.105 + for sequence in preblock.sequences:#links are bidirectional 2.106 + if sequence not in column[any_sequence].links: 2.107 + column[any_sequence].links.append(sequence) 2.108 + if "links" not in column[sequence].__dict__: 2.109 + column[sequence].links=[any_sequence] 2.110 else: 2.111 - if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence) 2.112 + if any_sequence not in column[sequence].links: 2.113 + column[sequence].links.append(any_sequence) 2.114 + 2.115 + 2.116 + 2.117 +def accept_and_return_overlapping_preblocks_before(preblock, alignment, preblocks): 2.118 + def check_sublist(greater_list, smaller_list): 2.119 + for element in smaller_list: 2.120 + if element not in greater_list: return False 2.121 + return True 2.122 + 2.123 + output=[] 2.124 + for current_preblock in preblocks: 2.125 + if current_preblock.last_column_index >= preblock.first_column_index and\ 2.126 + current_preblock.last_column_index <= preblock.last_column_index and\ 2.127 + check_sublist(preblock.sequences, current_preblock.sequences) and\ 2.128 + current_preblock is not preblock: 2.129 + create_links(current_preblock, alignment) 2.130 + output.append(current_preblock) 2.131 + return output 2.132 2.133 2.134 def mark_blocks(alignment): 2.135 output=[] 2.136 blocks_in_previous_column = [] 2.137 - for column in alignment.columns: 2.138 + for index_of_column, column in enumerate(alignment.columns): 2.139 blocks_in_this_column = [] 2.140 remaining_sequences=copy.copy(alignment.sequences) 2.141 while remaining_sequences !=[]: 2.142 @@ -247,31 +297,29 @@ 2.143 if block not in blocks_in_this_column: 2.144 output.append(block) 2.145 blocks_in_previous_column = blocks_in_this_column 2.146 + #print index_of_column, [gblock.columns for gblock in blocks_in_this_column] 2.147 if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column 2.148 return output 2.149 2.150 2.151 def find_connected_component(sequences, column): 2.152 #find one sequence that has blocks 2.153 + queue=[] 2.154 output=[] 2.155 if sequences!=[]: 2.156 for sequence in sequences: 2.157 if sequence not in column: continue 2.158 if "links" in column[sequence].__dict__: 2.159 - output.append(sequence) 2.160 + queue.append(sequence) 2.161 break 2.162 - if output == []: return output 2.163 + if queue == []: return output 2.164 #find all sequences in the same connected component 2.165 - not_exhausted_flag=True 2.166 - sequences_copy=copy.copy(sequences) 2.167 - output_counter=0 2.168 - while not_exhausted_flag: 2.169 - not_exhausted_flag = False 2.170 - for sequence in column[output[output_counter]].links: 2.171 - if sequence not in output and sequence in sequences: 2.172 - output.append(sequence) 2.173 - output_counter+=1 2.174 - not_exhausted_flag=True 2.175 + while len(queue)>0: 2.176 + sequence = queue.pop() 2.177 + output.append(sequence) 2.178 + for another_sequence in column[sequence].links: 2.179 + if another_sequence not in queue and another_sequence not in output and another_sequence in sequences: 2.180 + queue.append(another_sequence) 2.181 return output 2.182 2.183 2.184 @@ -281,6 +329,8 @@ 2.185 if "links" in monomer.__dict__: 2.186 del monomer.links 2.187 2.188 - 2.189 +#[gblock for gblock in blocks_in_previous_column for gcolumn in gblock.columns if alignment.columns.index(gcolumn)==147] 2.190 + 2.191 + 2.192 if __name__== '__main__': 2.193 main(open(sys.argv[1]))