Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/117b03ee9e8f
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 07:37:19 2012
Кодировка:

# HG changeset patch
# User boris
# Date 1291411157 -10800
# Node ID 117b03ee9e8f63b110879c7a040e87c4b6485dcd
# Parent 90a75f2fe7df75a80595630128b37e429b7f8992
allpy.base: add alignment (old version)

diff -r 90a75f2fe7df -r 117b03ee9e8f allpy/base.py
--- a/allpy/base.py Sat Dec 04 00:18:50 2010 +0300
+++ b/allpy/base.py Sat Dec 04 00:19:17 2010 +0300
@@ -1,4 +1,5 @@
import _monomer
+from fasta import save_fasta

class Sequence(list):
""" Sequence of Monomers
@@ -14,16 +15,223 @@
MonomerType = _monomer.MonomerType
Monomer = _monomer.Monomer

-class Alignment(list):
+class Alignment(dict):
+ """ Alignment
+
+ {:[,None,]}
+ keys are the Sequence objects, values are the lists, which
+ contain monomers of those sequences or None for gaps in the
+ corresponding sequence of alignment
+ """
+ # _sequences -- list of Sequence objects. Sequences don't contain gaps
+ # - see sequence.py module
+
+ def __init__(self, *args):
+ """overloaded constructor
+
+ Alignment() -> new empty Alignment
+ Alignment(sequences, body) -> new Alignment with sequences and
+ body initialized from arguments
+ Alignment(fasta_file) -> new Alignment, read body and sequences
+ from fasta file
+
+ """
+ if len(args)>1:#overloaded constructor
+ self.sequences=args[0]
+ self.body=args[1]
+ elif len(args)==0:
+ self.sequences=[]
+ self.body={}
+ else:
+ self.sequences, self.body = Alignment.from_fasta(args[0])
+
+ def length(self):
+ """ Returns width, ie length of each sequence with gaps """
+ return max([len(line) for line in self.body.values()])
+
+ def height(self):
+ """ The number of sequences in alignment (it's thickness). """
+ return len(self.body)
+
+ def identity(self):
+ """ Calculate the identity of alignment positions for colouring.
+
+ For every (row, column) in alignment the percentage of the exactly
+ same residue in the same column in the alignment is calculated.
+ The data structure is just like the Alignment.body, but istead of
+ monomers it contains float percentages.
+ """
+ # Oh, God, that's awful! Absolutely not understandable.
+ # First, calculate percentages of amino acids in every column
+ contribution = 1.0 / len(self.sequences)
+ all_columns = []
+ for position in range(len(self)):
+ column_percentage = {}
+ for seq in self.body:
+ if self.body[seq][position] is not None:
+ aa = self.body[seq][position].code
+ else:
+ aa = None
+ if aa in allpy.data.amino_acids:
+ if aa in column_percentage.keys():
+ column_percentage[aa] += contribution
+ else:
+ column_percentage[aa] = contribution
+ all_columns.append(column_percentage)
+ # Second, map these percentages onto the alignment
+ self.identity_percentages = {}
+ for seq in self.sequences:
+ self.identity_percentages[seq] = []
+ for seq in self.identity_percentages:
+ line = self.identity_percentages[seq]
+ for position in range(len(self)):
+ if self.body[seq][position] is not None:
+ aa = self.body[seq][position].code
+ else:
+ aa = None
+ line.append(all_columns[position].get(aa))
+ return self.identity_percentages
+
+ @staticmethod
+ def from_fasta(file, monomer_kind=AminoAcidType):
+ """ Import data from fasta file
+
+ monomer_kind is class, inherited from MonomerType
+
+ >>> import alignment
+ >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
+ """
+ import re
+
+ sequences = []
+ body = {}
+
+ raw_sequences = file.read().split(">")
+ if len(raw_sequences) <= 1:
+ raise Exception("Wrong format of fasta-file %s" % file.name)
+
+ raw_sequences = raw_sequences[1:] #ignore everything before the first >
+ for raw in raw_sequences:
+ parsed_raw_sequence = raw.split("\n")
+ parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
+ name_and_description = parsed_raw_sequence[0]
+ name_and_description = name_and_description.split(" ",1)
+ if len(name_and_description) == 2:
+ name, description = name_and_description
+ elif len(name_and_description) == 1:
+ #if there is description
+ name = name_and_description[0]
+ description = ''
+ else:
+ raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
+ {'name': name, 'file': file.name})
+
+ if len(parsed_raw_sequence) <= 1:
+ raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
+ {'name': name, 'file': file.name})
+ string = ""
+ for piece in parsed_raw_sequence[1:]:
+ piece_without_whitespace_chars = re.sub("\s", "", piece)
+ string += piece_without_whitespace_chars
+ monomers = [] #convert into Monomer objects
+ body_list = [] #create the respective list in body dict
+ for current_monomer in string:
+ if current_monomer not in ["-", ".", "~"]:
+ monomers.append(monomer_kind.from_code1(current_monomer).instance())
+ body_list.append(monomers[-1])
+ else:
+ body_list.append(None)
+ s = sequence.Sequence(monomers, name, description)
+ sequences.append(s)
+ body[s] = body_list
+ return sequences, body
+
+ @staticmethod
+ def from_sequences(*sequences):
+ """ Constructs new alignment from sequences
+
+ Add None's to right end to make equal lengthes of alignment sequences
+ """
+ alignment = Alignment()
+ alignment.sequences = sequences
+ max_length = max(len(sequence) for sequence in sequences)
+ for sequence in sequences:
+ gaps_count = max_length - len(sequence)
+ alignment.body[sequence] = sequence.monomers + [None] * gaps_count
+ return alignment
+
+ def save_fasta(self, out_file, long_line=70, gap='-'):
+ """ Saves alignment to given file
+
+ Splits long lines to substrings of length=long_line
+ To prevent this, set long_line=None
+ """
+ block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
+
+ def muscle_align(self):
+ """ Simple align ths alignment using sequences (muscle)
+
+ uses old Monomers and Sequences objects
+ """
+ tmp_file = NamedTemporaryFile(delete=False)
+ self.save_fasta(tmp_file)
+ tmp_file.close()
+ os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
+ sequences, body = Alignment.from_fasta(open(tmp_file.name))
+ for sequence in self.sequences:
+ try:
+ new_sequence = [i for i in sequences if sequence==i][0]
+ except:
+ raise Exception("Align: Cann't find sequence %s in muscle output" % \
+ sequence.name)
+ old_monomers = iter(sequence.monomers)
+ self.body[sequence] = []
+ for monomer in body[new_sequence]:
+ if not monomer:
+ self.body[sequence].append(monomer)
+ else:
+ old_monomer = old_monomers.next()
+ if monomer != old_monomer:
+ raise Exception("Align: alignment errors")
+ self.body[sequence].append(old_monomer)
+ os.unlink(tmp_file.name)
+
+ def column(self, sequence=None, sequences=None, original=None):
+ """ returns list of columns of alignment
+
+ sequence or sequences:
+ if sequence is given, then column is (original_monomer, monomer)
+ if sequences is given, then column is (original_monomer, {sequence: monomer})
+ if both of them are given, it is an error
+ original (Sequence type):
+ if given, this filters only columns represented by original sequence
+ """
+ if sequence and sequences:
+ raise Exception("Wrong usage. read help")
+ indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
+ alignment = self.body.items()
+ alignment.sort(key=lambda i: indexes[i[0]])
+ alignment = [monomers for seq, monomers in alignment]
+ for column in zip(*alignment):
+ if not original or column[indexes[original]]:
+ if sequence:
+ yield (column[indexes[original]], column[indexes[sequence]])
+ else:
+ yield (column[indexes[original]],
+ dict([(s, column[indexes[s]]) for s in sequences]))
+
+ def secstr(self, sequence, pdb_chain, gap='-'):
+ """ Returns string representing secondary structure """
+ return ''.join([
+ (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
+ for m in self.body[sequence]])
+
+class Block(object):
""" """
pass

-class Block(list):
- """ """
- pass

-