Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/9369dbad919d/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:06:29 2013
Кодировка:
allpy: 9369dbad919d allpy/base.py

allpy

view allpy/base.py @ 251:9369dbad919d

Incompatible changes to Monomer interfaces. This branch does not work! - (!!) only changed allpy._monomer, not uses - (!!) removed (temporarily) classes for specific monomer types (DNAMonomer, etc) - refurbished allpy.data.AAcodes to allpy.data.codes with much cleaner interface - refurbished allpy._monomer for simplicity and more friendly interface Now it will (someday) be possible to say: a = Monomer.from_name("alanine") b = protein.Monomer.from_code1("a") c = protein.MonomerType.from_code3("ala") d = dna.Monomer.from_code3("DA") but impossible to say: d = protein.Monomer.from_code3("DA")
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Mon, 13 Dec 2010 20:12:11 +0300
parents 6507cd6808c7
children aae821828b03
line source
1 import _monomer
2 from fasta import save_fasta
4 class Sequence(list):
5 """ Sequence of Monomers
7 list of monomer objects
9 Mandatory data:
10 * name -- str with the name of sequence
11 * description -- str with description of the sequence
12 """
13 pass
15 MonomerType = _monomer.MonomerType
16 Monomer = _monomer.Monomer
18 class Alignment(dict):
19 """ Alignment
21 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
22 keys are the Sequence objects, values are the lists, which
23 contain monomers of those sequences or None for gaps in the
24 corresponding sequence of alignment
25 """
26 # _sequences -- list of Sequence objects. Sequences don't contain gaps
27 # - see sequence.py module
29 def __init__(self, *args):
30 """overloaded constructor
32 Alignment() -> new empty Alignment
33 Alignment(sequences, body) -> new Alignment with sequences and
34 body initialized from arguments
35 Alignment(fasta_file) -> new Alignment, read body and sequences
36 from fasta file
38 """
39 if len(args)>1:#overloaded constructor
40 self.sequences=args[0]
41 self.body=args[1]
42 elif len(args)==0:
43 self.sequences=[]
44 self.body={}
45 else:
46 self.sequences, self.body = Alignment.from_fasta(args[0])
48 def length(self):
49 """ Returns width, ie length of each sequence with gaps """
50 return max([len(line) for line in self.body.values()])
52 def height(self):
53 """ The number of sequences in alignment (it's thickness). """
54 return len(self.body)
56 def identity(self):
57 """ Calculate the identity of alignment positions for colouring.
59 For every (row, column) in alignment the percentage of the exactly
60 same residue in the same column in the alignment is calculated.
61 The data structure is just like the Alignment.body, but istead of
62 monomers it contains float percentages.
63 """
64 # Oh, God, that's awful! Absolutely not understandable.
65 # First, calculate percentages of amino acids in every column
66 contribution = 1.0 / len(self.sequences)
67 all_columns = []
68 for position in range(len(self)):
69 column_percentage = {}
70 for seq in self.body:
71 if self.body[seq][position] is not None:
72 aa = self.body[seq][position].code
73 else:
74 aa = None
75 if aa in allpy.data.amino_acids:
76 if aa in column_percentage.keys():
77 column_percentage[aa] += contribution
78 else:
79 column_percentage[aa] = contribution
80 all_columns.append(column_percentage)
81 # Second, map these percentages onto the alignment
82 self.identity_percentages = {}
83 for seq in self.sequences:
84 self.identity_percentages[seq] = []
85 for seq in self.identity_percentages:
86 line = self.identity_percentages[seq]
87 for position in range(len(self)):
88 if self.body[seq][position] is not None:
89 aa = self.body[seq][position].code
90 else:
91 aa = None
92 line.append(all_columns[position].get(aa))
93 return self.identity_percentages
95 @staticmethod
96 def from_fasta(file, monomer_kind=AminoAcidType):
97 """ Import data from fasta file
99 monomer_kind is class, inherited from MonomerType
101 >>> import alignment
102 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
103 """
104 import re
106 sequences = []
107 body = {}
109 raw_sequences = file.read().split(">")
110 if len(raw_sequences) <= 1:
111 raise Exception("Wrong format of fasta-file %s" % file.name)
113 raw_sequences = raw_sequences[1:] #ignore everything before the first >
114 for raw in raw_sequences:
115 parsed_raw_sequence = raw.split("\n")
116 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
117 name_and_description = parsed_raw_sequence[0]
118 name_and_description = name_and_description.split(" ",1)
119 if len(name_and_description) == 2:
120 name, description = name_and_description
121 elif len(name_and_description) == 1:
122 #if there is description
123 name = name_and_description[0]
124 description = ''
125 else:
126 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
127 {'name': name, 'file': file.name})
129 if len(parsed_raw_sequence) <= 1:
130 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
131 {'name': name, 'file': file.name})
132 string = ""
133 for piece in parsed_raw_sequence[1:]:
134 piece_without_whitespace_chars = re.sub("\s", "", piece)
135 string += piece_without_whitespace_chars
136 monomers = [] #convert into Monomer objects
137 body_list = [] #create the respective list in body dict
138 for current_monomer in string:
139 if current_monomer not in ["-", ".", "~"]:
140 monomers.append(monomer_kind.from_code1(current_monomer).instance())
141 body_list.append(monomers[-1])
142 else:
143 body_list.append(None)
144 s = sequence.Sequence(monomers, name, description)
145 sequences.append(s)
146 body[s] = body_list
147 return sequences, body
149 @staticmethod
150 def from_sequences(*sequences):
151 """ Constructs new alignment from sequences
153 Add None's to right end to make equal lengthes of alignment sequences
154 """
155 alignment = Alignment()
156 alignment.sequences = sequences
157 max_length = max(len(sequence) for sequence in sequences)
158 for sequence in sequences:
159 gaps_count = max_length - len(sequence)
160 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
161 return alignment
163 def save_fasta(self, out_file, long_line=70, gap='-'):
164 """ Saves alignment to given file
166 Splits long lines to substrings of length=long_line
167 To prevent this, set long_line=None
168 """
169 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
171 def muscle_align(self):
172 """ Simple align ths alignment using sequences (muscle)
174 uses old Monomers and Sequences objects
175 """
176 tmp_file = NamedTemporaryFile(delete=False)
177 self.save_fasta(tmp_file)
178 tmp_file.close()
179 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
180 sequences, body = Alignment.from_fasta(open(tmp_file.name))
181 for sequence in self.sequences:
182 try:
183 new_sequence = [i for i in sequences if sequence==i][0]
184 except:
185 raise Exception("Align: Cann't find sequence %s in muscle output" % \
186 sequence.name)
187 old_monomers = iter(sequence.monomers)
188 self.body[sequence] = []
189 for monomer in body[new_sequence]:
190 if not monomer:
191 self.body[sequence].append(monomer)
192 else:
193 old_monomer = old_monomers.next()
194 if monomer != old_monomer:
195 raise Exception("Align: alignment errors")
196 self.body[sequence].append(old_monomer)
197 os.unlink(tmp_file.name)
199 def column(self, sequence=None, sequences=None, original=None):
200 """ returns list of columns of alignment
202 sequence or sequences:
203 if sequence is given, then column is (original_monomer, monomer)
204 if sequences is given, then column is (original_monomer, {sequence: monomer})
205 if both of them are given, it is an error
206 original (Sequence type):
207 if given, this filters only columns represented by original sequence
208 """
209 if sequence and sequences:
210 raise Exception("Wrong usage. read help")
211 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
212 alignment = self.body.items()
213 alignment.sort(key=lambda i: indexes[i[0]])
214 alignment = [monomers for seq, monomers in alignment]
215 for column in zip(*alignment):
216 if not original or column[indexes[original]]:
217 if sequence:
218 yield (column[indexes[original]], column[indexes[sequence]])
219 else:
220 yield (column[indexes[original]],
221 dict([(s, column[indexes[s]]) for s in sequences]))
223 def secstr(self, sequence, pdb_chain, gap='-'):
224 """ Returns string representing secondary structure """
225 return ''.join([
226 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
227 for m in self.body[sequence]])
229 class Block(object):
230 """ """
231 pass