Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/b35116e13f35
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 08:08:34 2012
Кодировка:

# HG changeset patch
# User Boris Burkov
# Date 1307531757 -14400
# Node ID b35116e13f352626253b3f178e21c3b8078cb350
# Parent 612c618fb7b0f116cb91f341911262cd372424c6
Changed the __repr__for sequence to . Renamed row() to row_as_list().

diff -r 612c618fb7b0 -r b35116e13f35 allpy/base.py
--- a/allpy/base.py Tue Jun 07 17:37:06 2011 +0400
+++ b/allpy/base.py Wed Jun 08 15:15:57 2011 +0400
@@ -82,7 +82,7 @@
return cls.by_name[name.strip().capitalize()]()

def __repr__(self):
- return str(self.code1)
+ return "" % str(self.code1)

def __str__(self):
"""Returns one-letter code"""
@@ -139,7 +139,7 @@

def __repr__(self):
if self.name:
- return str(self.name)
+ return '' % str(self.name)
else:
return '' % str(self)

@@ -231,7 +231,7 @@
# Data access methods for alignment
# =================================

- def row(self, sequence):
+ def row_as_list(self, sequence):
"""Creates and returns temporary list of monomers and Nones"""
output=[]
for column in self.columns:
@@ -247,7 +247,7 @@
if monomer:
return monomer.code1
return "-"
- row = self.row(sequence)
+ row = self.row_as_list(sequence)
list_of_letters = map(char, row)
output=""
for letter in list_of_letters: output+=letter
diff -r 612c618fb7b0 -r b35116e13f35 sequence_based_blocks_search/blocks_finder.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence_based_blocks_search/blocks_finder.py Wed Jun 08 15:15:57 2011 +0400
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+
+import sys
+sys.path.append("../")
+from allpy import protein
+import copy
+import math
+from functional_groups import functional_groups
+
+class Preblock(object):
+ """Attributes:
+ first_column_index=self.alignment.columns[column1]
+ last_column_index=self.aligmnent.columns[column2]
+ sequences=[sequence1, sequence2...]
+ alignment
+
+ A preblock score is calculated as follows:
+ Suppose, that a preblock contains 2 conserved and 3 inconserved positions.
+ Preblock score is sum of scores of its conserved and inconserved columns:
+ Score(conserved column) = log(p(group|70% conservation)) - n*log(f(group))
+ Score(inconserved column) = log(0.3) - log(1)
+ Thus score of preblock contains 3 terms: sum of scores of inconserved column
+ sum of log(p(group|70% conservation)) and sum of log(f(group)).
+ """
+ def __init__(self, first_column_index, last_column_index, sequences, term1, term2, term3, alignment):
+ self.first_column_index = first_column_index
+ self.last_column_index = last_column_index
+ self.sequences = sequences
+ self.term1 = term1
+ self.term2 = term2
+ self.term3 = term3
+ self.alignment = alignment
+
+ def calculate_score(self):
+ return self.term1 - len(self.sequences)*self.term2 + self.term3
+
+ def __repr__(self):
+ return "preblock:%s, %s, %s, %s"%(self.first_column_index, self.last_column_index, self.calculate_score(), self.sequences)
+
+
+class BreakCycleException(Exception):
+ pass
+
+
+def main(file):
+ alignment = protein.Alignment().append_file(file)
+ groups_content = find_groups_content(alignment, functional_groups)
+ find_links(groups_content, alignment)
+ blocks = mark_blocks(alignment)
+ delete_links(alignment)
+ return blocks
+
+
+def find_groups_content(alignment, functional_groups):
+ groups_content = {}
+ for column in alignment.columns:
+ groups2sequences = find_sequences_that_belong_to_groups_in_column(column, functional_groups)
+ groups_content[column] = groups2sequences
+ return groups_content
+
+
+def find_sequences_that_belong_to_groups_in_column(column, functional_groups):
+ output = {}
+ for group_name in functional_groups.keys():
+ output[group_name] = []
+ for sequence in column.keys():
+ if column[sequence].code1 in functional_groups[group_name]["aminoacids"]:
+ output[group_name].append(sequence)
+ return output
+
+
+def find_links(groups_content, alignment):
+ candidate_preblocks = []
+ for index_of_column, column in enumerate(alignment.columns):
+ candidate_preblocks, addition_to_candidate_preblocks = split_preblocks_by_presense_of_gaps(candidate_preblocks, column)
+ #create new preblocks based on the groups
+ try:
+ for group_name in functional_groups:
+ if len(groups_content[column][group_name])>1:
+ addition_to_candidate_preblocks.append(Preblock(index_of_column, index_of_column, copy.copy(groups_content[column][group_name]), functional_groups[group_name]["70%"], functional_groups[group_name]["100%"], 0.0, alignment))
+ except KeyError as ke:
+ print "Found non-canonical aminoacid: %s"%(ke)
+ #create new preblocks as combinations of the old ones
+ try:
+ for group_name in functional_groups:
+ for preblock in candidate_preblocks:
+ if len([sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences]) > 1:
+ addition_to_candidate_preblocks.append(Preblock(preblock.first_column_index, index_of_column, [sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences], preblock.term1+functional_groups[group_name]["70%"], preblock.term2+functional_groups[group_name]["100%"], preblock.term3, alignment))
+ except KeyError as ke:
+ print "Found non-canonical aminoacid: %s"%(ke)
+ #add one inconserved column to the old ones, merge all together
+ for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1)
+ candidate_preblocks+=addition_to_candidate_preblocks
+ #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have smaller score
+ print index_of_column, len(candidate_preblocks),
+ remove_contained_preblocks(candidate_preblocks)
+ print len(candidate_preblocks)
+ #for those preblocks ending with a conserved position and having the score above threshold, add new links
+ for preblock in candidate_preblocks:
+ if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
+ create_links(preblock, alignment) #links are not cliques, just connected components
+
+
+def split_preblocks_by_presense_of_gaps(preblocks, column):
+ """Each preblock is split into up to two preblocks vertically:
+ those, with sequences, that contain gaps and those, that
+ don't.
+ """
+ preblocks_without_gap = []
+ preblocks_with_gap = []
+ for preblock in preblocks:
+ preblock_sequences_without_gap = []
+ preblock_sequences_with_gap = []
+ for sequence in preblock.sequences:
+ if sequence in column.keys():
+ preblock_sequences_without_gap.append(sequence)
+ else:
+ preblock_sequences_with_gap.append(sequence)
+ if len(preblock_sequences_without_gap)>=2:
+ preblocks_without_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_without_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment))#added one inconserved position
+ if len(preblock_sequences_with_gap)>=2:
+ preblocks_with_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_with_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment))
+ return preblocks_without_gap, preblocks_with_gap
+
+
+def remove_contained_preblocks(preblocks):
+ """removes blocks that have rivals with same sets of sequences,
+ but shorter and with inferior scores, normalized to the number
+ of sequences within the block
+
+ """
+ output=[]
+ counter = len(preblocks)-1
+ while counter!=-1:
+ for preblock in preblocks:
+ try:
+ if preblock is not preblocks[counter]:
+ for sequence in preblocks[counter].sequences:
+ if sequence not in preblock.sequences: raise BreakCycleException
+ #if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted.
+ if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\
+ and preblocks[counter].last_column_index<=preblock.last_column_index:
+ preblocks.remove(preblocks[counter])
+ break
+ #create normalized preblock with same set of sequences as the examined one
+ normalized_preblock = Preblock(preblock.first_column_index, preblock.last_column_index, preblocks[counter].sequences, preblock.term1, preblock.term2, preblock.term3, preblock.alignment)
+ if preblocks[counter].first_column_index>=preblock.first_column_index\
+ and preblocks[counter].last_column_index<=preblock.last_column_index\
+ and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score():
+ preblocks.remove(preblocks[counter])
+ break
+ except BreakCycleException:
+ pass
+ counter-=1
+
+
+def calculate_threshold(number_of_sequences_in_preblock, number_of_sequences_in_alignment, functional_groups):
+ """
+ We seek to find the probability, that given n randomly generated
+ sequences we won't find alignment of them, such that there is a
+ block of k sequences with score not less than the observed.
+
+ First, we have to pick 5 sequences out of 20;
+ then iterate over all the ways to shift them against each other.
+
+ The number of shifts of length 1 is roughly 5*4*len(alignment)-1)^3
+ The number of shifts of length 2 is roughly 5*4*len(alignment)-2)^3
+ ...
+
+ Considering these shifts independent, we seek to find the p-values
+ for each of the scores to be not found in all the stretches.
+
+ Thus, for each shift we calculate the weights of all groups and
+ inconserved positions, create the transition matrix, calculate
+ its eigenvectors and eigenvalues, round the weights, find coefficient
+ of [1,0,0,...] decomposition in the basis of eigenvectors.
+
+
+ """
+
+ #length_of_stretch = math.factorial(number_of_sequences_in_alignment)/math.factorial(number_of_sequences_in_preblock)/math.factorial(number_of_sequences_in_preblock)*^number_of_sequences_in_alignment
+ return (-1)*functional_groups["u"]["70%"]*1.5*number_of_sequences_in_preblock
+
+
+def create_links(preblock, alignment):
+ """links are lists of sequences"""
+ for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]:
+ if preblock.sequences[0] not in column: continue #ignore gap columns
+ for sequence in preblock.sequences:
+ if "links" not in column[sequence].__dict__:
+ column[sequence].links=[]
+ remaining_sequences = copy.copy(preblock.sequences)
+ remaining_sequences.remove(sequence)
+ for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
+ if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
+ if "links" not in column[iterated_sequence].__dict__:
+ column[iterated_sequence].links=[sequence]
+ else:
+ if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
+
+
+def mark_blocks(alignment):
+ output=[]
+ blocks_in_previous_column = []
+ for column in alignment.columns:
+ blocks_in_this_column = []
+ remaining_sequences=copy.copy(alignment.sequences)
+ while remaining_sequences !=[]:
+ connected_component=find_connected_component(remaining_sequences, column)
+ if connected_component==[]: break
+ #if connected_component extends existing block, extend that block, else create new one
+ appended_to_block_flag = False
+ for block in blocks_in_previous_column:
+ if set(connected_component)==set(block.sequences):
+ block.columns.append(column)
+ blocks_in_this_column.append(block)
+ appended_to_block_flag = True
+ if appended_to_block_flag==False:
+ blocks_in_this_column.append(protein.Block.from_alignment(alignment, connected_component, [column]))
+ for sequence in connected_component:
+ remaining_sequences.remove(sequence)
+ for block in blocks_in_previous_column:
+ if block not in blocks_in_this_column:
+ output.append(block)
+ blocks_in_previous_column = blocks_in_this_column
+ if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column
+ return output
+
+
+def find_connected_component(sequences, column):
+ #find one sequence that has blocks
+ output=[]
+ if sequences!=[]:
+ for sequence in sequences:
+ if sequence not in column: continue
+ if "links" in column[sequence].__dict__:
+ output.append(sequence)
+ break
+ if output == []: return output
+ #find all sequences in the same connected component
+ not_exhausted_flag=True
+ sequences_copy=copy.copy(sequences)
+ output_counter=0
+ while not_exhausted_flag:
+ not_exhausted_flag = False
+ for sequence in column[output[output_counter]].links:
+ if sequence not in output and sequence in sequences:
+ output.append(sequence)
+ output_counter+=1
+ not_exhausted_flag=True
+ return output
+
+
+def delete_links(alignment):
+ for sequence in alignment.sequences:
+ for monomer in sequence:
+ if "links" in monomer.__dict__:
+ del monomer.links
+
+
+if __name__== '__main__':
+ main(open(sys.argv[1]))
diff -r 612c618fb7b0 -r b35116e13f35 sequence_based_blocks_search/functional_groups.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence_based_blocks_search/functional_groups.py Wed Jun 08 15:15:57 2011 +0400
@@ -0,0 +1,59 @@
+# group name: aminoacids in group, logprobability of group for 100% identical sequences, >60%, >30%
+functional_groups={
+ "b":{"aminoacids":"IVLMFA","100%":-1.671163536, "70%":-3, "30%":-4.0},
+ "t":{"aminoacids":"AGSTC", "100%":-1.625934282, "70%":-3, "30%":-4.0},
+ "u":{"aminoacids":"HKR", "100%":-2.805912948, "70%":-4, "30%":-4.5},
+ "l":{"aminoacids":"IVLM", "100%":-2.321928095, "70%":-4, "30%":-4.5},
+ "c":{"aminoacids":"IV", "100%":-3.23786383, "70%":-4, "30%":-4.5},
+ "a":{"aminoacids":"FWY", "100%":-3.53951953, "70%":-4, "30%":-4.5},
+ "d":{"aminoacids":"DE", "100%":-3.095419565, "70%":-4, "30%":-4.5},
+ "e":{"aminoacids":"KR", "100%":-3.13289427, "70%":-4, "30%":-4.5},
+ "g":{"aminoacids":"DN", "100%":-3.279283757, "70%":-4, "30%":-4.5},
+ "f":{"aminoacids":"EQ", "100%":-3.395928676, "70%":-4, "30%":-4.5},
+ "i":{"aminoacids":"ST", "100%":-2.805912948, "70%":-4, "30%":-4.5},
+ "h":{"aminoacids":"AG", "100%":-2.756330919, "70%":-4, "30%":-4.5},
+ "A":{"aminoacids":"A", "100%":-3.756330919, "70%":-5, "30%":-5.5},
+ "G":{"aminoacids":"G", "100%":-3.756330919, "70%":-5, "30%":-5.5},
+ "S":{"aminoacids":"S", "100%":-3.625934282, "70%":-5, "30%":-5.5},
+ "T":{"aminoacids":"T", "100%":-4.011587974, "70%":-5, "30%":-5.5},
+ "C":{"aminoacids":"C", "100%":-4.921390165, "70%":-6, "30%":-6.5},
+ "P":{"aminoacids":"P", "100%":-4.321928095, "70%":-5, "30%":-6.5},
+ "N":{"aminoacids":"N", "100%":-4.506352666, "70%":-5, "30%":-6.5},
+ "H":{"aminoacids":"H", "100%":-5.10780329, "70%":-6, "30%":-7 },
+ "K":{"aminoacids":"K", "100%":-3.795859283, "70%":-5, "30%":-5.5},
+ "R":{"aminoacids":"R", "100%":-4.573466862, "70%":-5, "30%":-5.5},
+ "D":{"aminoacids":"D", "100%":-4.083141235, "70%":-5, "30%":-5.5},
+ "E":{"aminoacids":"E", "100%":-4.10780329, "70%":-5, "30%":-5.5},
+ "Q":{"aminoacids":"Q", "100%":-4.756330919, "70%":-5, "30%":-5.5},
+ "I":{"aminoacids":"I", "100%":-4.717856771, "70%":-5, "30%":-5.5},
+ "L":{"aminoacids":"L", "100%":-3.717856771, "70%":-5, "30%":-5.5},
+ "V":{"aminoacids":"V", "100%":-3.878321443, "70%":-5, "30%":-5.5},
+ "F":{"aminoacids":"F", "100%":-4.64385619, "70%":-5, "30%":-5.5},
+ "W":{"aminoacids":"W", "100%":-6.265344567, "70%":-7, "30%":-7.5},
+ "Y":{"aminoacids":"Y", "100%":-4.921390165, "70%":-5, "30%":-5.5},
+ "M":{"aminoacids":"M", "100%":-5.795859283, "70%":-6.5,"30%": -7}
+ }
+
+aminoacids2functional_groups={
+ "A":["A", "b", "h", "t"],
+ "C":["C", "t"],
+ "E":["E", "d", "f"],
+ "D":["D", "d", "g"],
+ "G":["G", "h", "t"],
+ "F":["F", "a", "b"],
+ "I":["I", "c", "b", "l"],
+ "H":["H", "u"],
+ "K":["K", "e", "u"],
+ "M":["M", "b", "l"],
+ "L":["L", "b", "l"],
+ "N":["N", "g"],
+ "Q":["Q", "f"],
+ "P":["P"],
+ "S":["S", "i", "t"],
+ "R":["R", "e", "u"],
+ "T":["T", "i", "t"],
+ "W":["W", "a"],
+ "V":["V", "c", "b", "l"],
+ "Y":["Y", "a"]
+}
+