Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/0e8b5ad66125/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:33:14 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 259:0e8b5ad66125

Merge
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Tue, 14 Dec 2010 20:05:13 +0300
parents 6507cd6808c7
children aae821828b03
rev   line source
bnagaev@244 1 import _monomer
bnagaev@249 2 from fasta import save_fasta
bnagaev@239 3
bnagaev@239 4 class Sequence(list):
bnagaev@243 5 """ Sequence of Monomers
bnagaev@243 6
bnagaev@243 7 list of monomer objects
bnagaev@243 8
bnagaev@243 9 Mandatory data:
bnagaev@243 10 * name -- str with the name of sequence
bnagaev@243 11 * description -- str with description of the sequence
bnagaev@243 12 """
bnagaev@243 13 pass
bnagaev@243 14
bnagaev@244 15 MonomerType = _monomer.MonomerType
bnagaev@244 16 Monomer = _monomer.Monomer
bnagaev@239 17
bnagaev@249 18 class Alignment(dict):
bnagaev@249 19 """ Alignment
bnagaev@249 20
bnagaev@249 21 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
bnagaev@249 22 keys are the Sequence objects, values are the lists, which
bnagaev@249 23 contain monomers of those sequences or None for gaps in the
bnagaev@249 24 corresponding sequence of alignment
bnagaev@249 25 """
bnagaev@249 26 # _sequences -- list of Sequence objects. Sequences don't contain gaps
bnagaev@249 27 # - see sequence.py module
bnagaev@249 28
bnagaev@249 29 def __init__(self, *args):
bnagaev@249 30 """overloaded constructor
bnagaev@249 31
bnagaev@249 32 Alignment() -> new empty Alignment
bnagaev@249 33 Alignment(sequences, body) -> new Alignment with sequences and
bnagaev@249 34 body initialized from arguments
bnagaev@249 35 Alignment(fasta_file) -> new Alignment, read body and sequences
bnagaev@249 36 from fasta file
bnagaev@249 37
bnagaev@249 38 """
bnagaev@249 39 if len(args)>1:#overloaded constructor
bnagaev@249 40 self.sequences=args[0]
bnagaev@249 41 self.body=args[1]
bnagaev@249 42 elif len(args)==0:
bnagaev@249 43 self.sequences=[]
bnagaev@249 44 self.body={}
bnagaev@249 45 else:
bnagaev@249 46 self.sequences, self.body = Alignment.from_fasta(args[0])
bnagaev@249 47
bnagaev@249 48 def length(self):
bnagaev@249 49 """ Returns width, ie length of each sequence with gaps """
bnagaev@249 50 return max([len(line) for line in self.body.values()])
bnagaev@249 51
bnagaev@249 52 def height(self):
bnagaev@249 53 """ The number of sequences in alignment (it's thickness). """
bnagaev@249 54 return len(self.body)
bnagaev@249 55
bnagaev@249 56 def identity(self):
bnagaev@249 57 """ Calculate the identity of alignment positions for colouring.
bnagaev@249 58
bnagaev@249 59 For every (row, column) in alignment the percentage of the exactly
bnagaev@249 60 same residue in the same column in the alignment is calculated.
bnagaev@249 61 The data structure is just like the Alignment.body, but istead of
bnagaev@249 62 monomers it contains float percentages.
bnagaev@249 63 """
bnagaev@249 64 # Oh, God, that's awful! Absolutely not understandable.
bnagaev@249 65 # First, calculate percentages of amino acids in every column
bnagaev@249 66 contribution = 1.0 / len(self.sequences)
bnagaev@249 67 all_columns = []
bnagaev@249 68 for position in range(len(self)):
bnagaev@249 69 column_percentage = {}
bnagaev@249 70 for seq in self.body:
bnagaev@249 71 if self.body[seq][position] is not None:
bnagaev@249 72 aa = self.body[seq][position].code
bnagaev@249 73 else:
bnagaev@249 74 aa = None
bnagaev@249 75 if aa in allpy.data.amino_acids:
bnagaev@249 76 if aa in column_percentage.keys():
bnagaev@249 77 column_percentage[aa] += contribution
bnagaev@249 78 else:
bnagaev@249 79 column_percentage[aa] = contribution
bnagaev@249 80 all_columns.append(column_percentage)
bnagaev@249 81 # Second, map these percentages onto the alignment
bnagaev@249 82 self.identity_percentages = {}
bnagaev@249 83 for seq in self.sequences:
bnagaev@249 84 self.identity_percentages[seq] = []
bnagaev@249 85 for seq in self.identity_percentages:
bnagaev@249 86 line = self.identity_percentages[seq]
bnagaev@249 87 for position in range(len(self)):
bnagaev@249 88 if self.body[seq][position] is not None:
bnagaev@249 89 aa = self.body[seq][position].code
bnagaev@249 90 else:
bnagaev@249 91 aa = None
bnagaev@249 92 line.append(all_columns[position].get(aa))
bnagaev@249 93 return self.identity_percentages
bnagaev@249 94
bnagaev@249 95 @staticmethod
bnagaev@249 96 def from_fasta(file, monomer_kind=AminoAcidType):
bnagaev@249 97 """ Import data from fasta file
bnagaev@249 98
bnagaev@249 99 monomer_kind is class, inherited from MonomerType
bnagaev@249 100
bnagaev@249 101 >>> import alignment
bnagaev@249 102 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
bnagaev@249 103 """
bnagaev@249 104 import re
bnagaev@249 105
bnagaev@249 106 sequences = []
bnagaev@249 107 body = {}
bnagaev@249 108
bnagaev@249 109 raw_sequences = file.read().split(">")
bnagaev@249 110 if len(raw_sequences) <= 1:
bnagaev@249 111 raise Exception("Wrong format of fasta-file %s" % file.name)
bnagaev@249 112
bnagaev@249 113 raw_sequences = raw_sequences[1:] #ignore everything before the first >
bnagaev@249 114 for raw in raw_sequences:
bnagaev@249 115 parsed_raw_sequence = raw.split("\n")
bnagaev@249 116 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
bnagaev@249 117 name_and_description = parsed_raw_sequence[0]
bnagaev@249 118 name_and_description = name_and_description.split(" ",1)
bnagaev@249 119 if len(name_and_description) == 2:
bnagaev@249 120 name, description = name_and_description
bnagaev@249 121 elif len(name_and_description) == 1:
bnagaev@249 122 #if there is description
bnagaev@249 123 name = name_and_description[0]
bnagaev@249 124 description = ''
bnagaev@249 125 else:
bnagaev@249 126 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 127 {'name': name, 'file': file.name})
bnagaev@249 128
bnagaev@249 129 if len(parsed_raw_sequence) <= 1:
bnagaev@249 130 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 131 {'name': name, 'file': file.name})
bnagaev@249 132 string = ""
bnagaev@249 133 for piece in parsed_raw_sequence[1:]:
bnagaev@249 134 piece_without_whitespace_chars = re.sub("\s", "", piece)
bnagaev@249 135 string += piece_without_whitespace_chars
bnagaev@249 136 monomers = [] #convert into Monomer objects
bnagaev@249 137 body_list = [] #create the respective list in body dict
bnagaev@249 138 for current_monomer in string:
bnagaev@249 139 if current_monomer not in ["-", ".", "~"]:
bnagaev@249 140 monomers.append(monomer_kind.from_code1(current_monomer).instance())
bnagaev@249 141 body_list.append(monomers[-1])
bnagaev@249 142 else:
bnagaev@249 143 body_list.append(None)
bnagaev@249 144 s = sequence.Sequence(monomers, name, description)
bnagaev@249 145 sequences.append(s)
bnagaev@249 146 body[s] = body_list
bnagaev@249 147 return sequences, body
bnagaev@249 148
bnagaev@249 149 @staticmethod
bnagaev@249 150 def from_sequences(*sequences):
bnagaev@249 151 """ Constructs new alignment from sequences
bnagaev@249 152
bnagaev@249 153 Add None's to right end to make equal lengthes of alignment sequences
bnagaev@249 154 """
bnagaev@249 155 alignment = Alignment()
bnagaev@249 156 alignment.sequences = sequences
bnagaev@249 157 max_length = max(len(sequence) for sequence in sequences)
bnagaev@249 158 for sequence in sequences:
bnagaev@249 159 gaps_count = max_length - len(sequence)
bnagaev@249 160 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
bnagaev@249 161 return alignment
bnagaev@249 162
bnagaev@249 163 def save_fasta(self, out_file, long_line=70, gap='-'):
bnagaev@249 164 """ Saves alignment to given file
bnagaev@249 165
bnagaev@249 166 Splits long lines to substrings of length=long_line
bnagaev@249 167 To prevent this, set long_line=None
bnagaev@249 168 """
bnagaev@249 169 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
bnagaev@249 170
bnagaev@249 171 def muscle_align(self):
bnagaev@249 172 """ Simple align ths alignment using sequences (muscle)
bnagaev@249 173
bnagaev@249 174 uses old Monomers and Sequences objects
bnagaev@249 175 """
bnagaev@249 176 tmp_file = NamedTemporaryFile(delete=False)
bnagaev@249 177 self.save_fasta(tmp_file)
bnagaev@249 178 tmp_file.close()
bnagaev@249 179 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
bnagaev@249 180 sequences, body = Alignment.from_fasta(open(tmp_file.name))
bnagaev@249 181 for sequence in self.sequences:
bnagaev@249 182 try:
bnagaev@249 183 new_sequence = [i for i in sequences if sequence==i][0]
bnagaev@249 184 except:
bnagaev@249 185 raise Exception("Align: Cann't find sequence %s in muscle output" % \
bnagaev@249 186 sequence.name)
bnagaev@249 187 old_monomers = iter(sequence.monomers)
bnagaev@249 188 self.body[sequence] = []
bnagaev@249 189 for monomer in body[new_sequence]:
bnagaev@249 190 if not monomer:
bnagaev@249 191 self.body[sequence].append(monomer)
bnagaev@249 192 else:
bnagaev@249 193 old_monomer = old_monomers.next()
bnagaev@249 194 if monomer != old_monomer:
bnagaev@249 195 raise Exception("Align: alignment errors")
bnagaev@249 196 self.body[sequence].append(old_monomer)
bnagaev@249 197 os.unlink(tmp_file.name)
bnagaev@249 198
bnagaev@249 199 def column(self, sequence=None, sequences=None, original=None):
bnagaev@249 200 """ returns list of columns of alignment
bnagaev@249 201
bnagaev@249 202 sequence or sequences:
bnagaev@249 203 if sequence is given, then column is (original_monomer, monomer)
bnagaev@249 204 if sequences is given, then column is (original_monomer, {sequence: monomer})
bnagaev@249 205 if both of them are given, it is an error
bnagaev@249 206 original (Sequence type):
bnagaev@249 207 if given, this filters only columns represented by original sequence
bnagaev@249 208 """
bnagaev@249 209 if sequence and sequences:
bnagaev@249 210 raise Exception("Wrong usage. read help")
bnagaev@249 211 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
bnagaev@249 212 alignment = self.body.items()
bnagaev@249 213 alignment.sort(key=lambda i: indexes[i[0]])
bnagaev@249 214 alignment = [monomers for seq, monomers in alignment]
bnagaev@249 215 for column in zip(*alignment):
bnagaev@249 216 if not original or column[indexes[original]]:
bnagaev@249 217 if sequence:
bnagaev@249 218 yield (column[indexes[original]], column[indexes[sequence]])
bnagaev@249 219 else:
bnagaev@249 220 yield (column[indexes[original]],
bnagaev@249 221 dict([(s, column[indexes[s]]) for s in sequences]))
bnagaev@249 222
bnagaev@249 223 def secstr(self, sequence, pdb_chain, gap='-'):
bnagaev@249 224 """ Returns string representing secondary structure """
bnagaev@249 225 return ''.join([
bnagaev@249 226 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
bnagaev@249 227 for m in self.body[sequence]])
bnagaev@249 228
bnagaev@249 229 class Block(object):
bnagaev@239 230 """ """
bnagaev@239 231 pass
bnagaev@239 232