Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/tip/allpy/structure.py
Дата изменения: Unknown
Дата индексирования: Tue Apr 12 07:45:10 2016
Кодировка:
allpy: b556c96c6719 allpy/structure.py

allpy

view allpy/structure.py @ 1168:b556c96c6719

blocks3d/www Makefile: never check certificates of github, they are too confusing for wget
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Mon, 26 May 2014 17:20:29 +0400
parents 205cd762fd3b
children
line source
1 """ Functions to get pdb information from fasta id
2 and to generate fasta id from pdb information
4 pdb information: code, chain, model
6 TODO: same for local pdb files
7 """
9 import re
10 import os
11 import os.path
12 import urllib2
13 from copy import copy
14 import json
15 from cStringIO import StringIO
17 from Bio.PDB import PDBParser
18 from Bio.PDB import Select
19 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO
20 from Bio.PDB import DSSP
22 import base
23 import markups
24 import processors
25 import config
26 from graph import Graph
27 from homology import MonomerHomology
28 from util import Silence
30 from tempfile import NamedTemporaryFile
33 # for pdb-codes
34 pdb_file_pattern = re.compile(r"^file:(?P<path>[^:]+)(:(?P<chain>[0-9a-z ])(:(?P<model>[0-9]{1,3})(:(?P<resi_begin>-?\d+)(:(?P<resi_end>-?\d+))?)?)?)?", re.I)
36 # for pdb-codes
37 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)([^a-z0-9/](?P<model>[0-9]{1,3}))?)?(/(?P<resi_begin>-?\d+):(?P<resi_end>-?\d+))?", re.I)
39 def pdb_id_parse(ID):
40 if ID.startswith('file:'):
41 match = pdb_file_pattern.search(ID)
42 else:
43 match = re1.search(ID)
44 if not match:
45 return None
46 d = match.groupdict()
47 if 'chain' not in d or not d['chain']:
48 d['chain'] = ' '
49 if 'model' not in d or not d['model']:
50 d['model'] = 0
51 d['model'] = int(d['model'])
52 if d['resi_begin']:
53 d['resi_begin'] = int(d['resi_begin'])
54 if d['resi_end']:
55 d['resi_end'] = int(d['resi_end'])
56 return d
58 def get_structure(file, name):
59 with Silence():
60 return PDBParser().get_structure(name, file)
62 class DownloadPdb(object):
63 """ Functor downloading pdb from web """
64 def __init__(self, pdb_url=config.pdb_url):
65 """ pdb_url -- format string, template of url """
66 self.pdb_url = pdb_url
67 def __call__(self, code):
68 code = code.lower()
69 url = self.pdb_url % code
70 if config.debug:
71 print "Download %s" % url
72 return urllib2.urlopen(url)
73 download_pdb = DownloadPdb()
75 class CachedDownloadPdb(object):
76 def __init__(self, pdb_url=config.pdb_url, cache_dir='pdb_cache', save=True):
77 self.pdb_url = pdb_url
78 self.cache_dir=cache_dir
79 self.download_pdb = DownloadPdb(pdb_url=self.pdb_url)
80 self.save = save
81 def __call__(self, code):
82 code = code.lower()
83 path = os.path.join(self.cache_dir, code + '.pdb')
84 if os.path.exists(path):
85 return open(path)
86 in_file = self.download_pdb(code)
87 if self.save and not os.path.exists(path):
88 try:
89 if not os.path.exists(self.cache_dir):
90 os.mkdir(self.cache_dir)
91 with open(path, 'w') as out_file:
92 s = in_file.read()
93 out_file.write(s)
94 in_file = StringIO(s)
95 except:
96 try:
97 os.unlink(path)
98 except:
99 pass
100 return in_file
101 cached_download_pdb = CachedDownloadPdb()
103 class SequenceMixin(base.Sequence):
104 """Mixin for adding PDB data to a Sequence.
106 Please note: since this is a mixin, no objects of this class should be
107 created. This class is intended to be subclassed together with one of
108 Sequence classes.
110 Attributes:
112 * pdb_chain -- Bio.PDB.Chain
114 Monomers from this sequence get attribute "pdb_residue", "ca_xyz"
116 ?TODO: global pdb_structures
117 """
119 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0,
120 xyz_only=False,
121 min_continuous_match=config.min_continuous_match,
122 min_match=config.min_match,
123 max_waste_in_pdb=config.max_waste_in_pdb,
124 resi_begin=None, resi_end=None):
125 """ Read Pdb chain from file
127 and align each Monomer with PDB.Residue
129 * xyz_only -- if True PDB.Residue objects are not stores
130 * min_continuous_match
131 * min_match
132 * max_waste_in_pdb
133 * resi_begin and resi_end -- identifiers of first and last pdb
134 residues of residue range to be used. If only one is provided,
135 another parameter is set to first or last residue of chain.
136 They can exceed first and last pdb residues of pdb chain.
137 They can be integers or biopythons identifiers of residues.
138 Id of residue is (hetero_flag, sequence_id, insertion_code),
139 see biopdb documantation for more information.
141 The sequence is aligned with pdb_sequence using muscle.
142 The alignment is splitted to continuous_blocks
143 of min length _min_continuous_match_.
144 Monomers from continuous_blocks are used to get pdb information.
145 The number of them should be more than _min_match_.
146 The number of other monomers with structure from pdb_sequence
147 should be less than _max_waste_in_pdb_.
149 raise AssertionError
150 """
151 structure = get_structure(pdb_file, self.name)
152 chain = structure[pdb_model][pdb_chain]
153 if not xyz_only:
154 self.pdb_chain = chain
155 pdb_sequence = self.__class__.from_pdb_chain(chain)
156 if resi_begin:
157 if isinstance(resi_begin, int):
158 resi_begin = (None, resi_begin, None)
159 while pdb_sequence[0].pdb_residue.get_id()[1] < resi_begin[1]:
160 pdb_sequence.pop(0)
161 if resi_end:
162 if isinstance(resi_end, int):
163 resi_end = (None, resi_end, None)
164 while pdb_sequence[-1].pdb_residue.get_id()[1] > resi_end[1]:
165 pdb_sequence.pop()
166 Alignment = self.types.Alignment
167 a = Alignment()
168 a.append_sequence(self)
169 a.append_sequence(pdb_sequence)
170 a.realign(processors.Muscle())
171 Block = self.types.Block
172 matches = Block.from_alignment(a, columns=[])
173 for column in a.columns:
174 if self in column and pdb_sequence in column:
175 monomer = column[self]
176 pdb_monomer = column[pdb_sequence]
177 if hasattr(pdb_monomer, 'pdb_residue'):
178 if monomer == pdb_monomer:
179 matches.columns.append(column)
180 match_blocks = matches.continuous_blocks(min_continuous_match)
181 for block in match_blocks:
182 for monomer, pdb_monomer in block.columns_as_lists():
183 monomer.ca_xyz = pdb_monomer.pdb_residue['CA'].get_vector()
184 if not xyz_only:
185 monomer.pdb_residue = pdb_monomer.pdb_residue
186 matched_pdb = []
187 for block in match_blocks:
188 for column in block.columns:
189 matched_pdb.append(column[pdb_sequence])
190 central_start = matched_pdb[0]
191 central_stop = matched_pdb[-1]
192 central_pdb = []
193 started = False
194 for m in pdb_sequence:
195 if m is central_start:
196 started = True
197 if started and hasattr(m, 'pdb_residue'):
198 central_pdb.append(m)
199 if m is central_stop:
200 break
201 assert len(set(central_pdb) - set(matched_pdb)) <= max_waste_in_pdb,\
202 "too much waste in matched pdb (%i > %i)" %\
203 (len(set(central_pdb) - set(matched_pdb)), max_waste_in_pdb)
204 assert len(matched_pdb) >= min_match,\
205 "too short match (%i < %i)" % (len(matched_pdb), min_match)
207 def pdb_unload(self):
208 """ Delete all pdb-connected links """
209 del self.pdb_chain
210 for monomer in self:
211 if hasattr(monomer, 'pdb_residue'):
212 delattr(monomer, 'pdb_residue')
214 @classmethod
215 def from_pdb_chain(cls, chain):
216 """ Return Sequence with Monomers with link to Bio.PDB.Residue
218 chain is Bio.PDB.Chain
219 """
220 cappbuilder = CaPPBuilder()
221 peptides = cappbuilder.build_peptides(chain)
222 Sequence = cls
223 sequence = Sequence()
224 sequence.pdb_chain = chain
225 for peptide in peptides:
226 for ca_atom in peptide.get_ca_list():
227 residue = ca_atom.get_parent()
228 monomer = sequence.append_monomer(code3=residue.get_resname())
229 monomer.pdb_residue = residue
230 return sequence
232 def auto_pdb(self, conformity=None, pdb_getter=download_pdb, xyz_only=False,
233 resi_begin=None, resi_end=None, read_local=True):
234 """ Add pdb information to each monomer
236 * read_local -- if this method is allowed to read a local file
237 (for sequence names like "file:...")
239 raise AssertionError if info has not been successfully added
241 TODO: conformity_file
242 """
243 match = pdb_id_parse(self.name)
244 assert match, "Can't parse seq id: %s" % self.name
245 if 'path' in match and read_local:
246 code = 'path'
247 pdb_file = open(match['path'])
248 else:
249 code = match['code']
250 pdb_file = pdb_getter(code)
251 chain = match['chain']
252 model = match['model']
253 resi_begin = resi_begin or match['resi_begin']
254 resi_end = resi_end or match['resi_end']
255 self.set_pdb_chain(pdb_file, code, chain, model, xyz_only=xyz_only,
256 resi_begin=resi_begin, resi_end=resi_end)
258 def superimpose(self, main_sequence, gc, extra_columns=False):
259 """ Superimpose structure of this to main_sequence
261 * gc -- collection of columns to use as for superimposition
262 * extra_columns -- allow columns with gaps or without structure
263 """
264 fixed_gc = []
265 moving_gc = []
266 for column in gc:
267 if main_sequence in column and self in column:
268 if hasattr(column[main_sequence], 'pdb_residue') \
269 and hasattr(column[self], 'pdb_residue'):
270 fixed_gc.append(column[main_sequence].pdb_residue['CA'])
271 moving_gc.append(column[self].pdb_residue['CA'])
272 else:
273 assert extra_columns
274 else:
275 assert extra_columns
276 sup = Superimposer()
277 sup.set_atoms(fixed_gc, moving_gc)
278 moving = self.pdb_chain.get_atoms()
279 sup.apply(moving)
280 for monomer in self:
281 if hasattr(monomer, 'pdb_residue'):
282 monomer.ca_xyz = monomer.pdb_residue['CA'].get_vector()
284 def save_pdb(self, out_filename, new_chain=None, new_model=None):
285 """ Save pdb_chain to out_file """
286 chain = self.pdb_chain
287 old_chain = chain.id
288 if new_chain:
289 chain.id = new_chain # change private member
290 model = chain.get_parent()
291 class MySelect(Select):
292 def accept_chain(myselect, tested_chain):
293 return tested_chain is chain
294 def accept_model(myselect, tested_model):
295 return tested_model is model
296 temp = StringIO()
297 io = PDBIO()
298 structure = model.get_parent()
299 io.set_structure(structure)
300 io.save(temp, MySelect())
301 if new_model is not None:
302 out_filename.write("%-80s\n" % ("MODEL %4i" % new_model))
303 temp.seek(0)
304 for line in temp:
305 if not line.startswith('MODEL') and not line.startswith('ENDMDL'):
306 out_filename.write(line)
307 temp.close()
308 if new_model is not None:
309 out_filename.write("%-80s\n" % "ENDMDL")
310 self.pdb_chain.id = old_chain
312 class AlignmentMixin(base.Alignment):
313 """Mixin to add 3D properties to alignments.
315 Please note: since this is a mixin, no objects of this class should be
316 created. This class is intended to be subclassed together with one of
317 Alignment classes.
318 """
320 def column2pos(self):
321 """ Return {column: pos_n} """
322 c2p = {}
323 for i, column in enumerate(self.columns):
324 c2p[column] = i
325 return c2p
327 def blocks_to_file(self, file, blocks):
328 """ Write blocks to text |-separated file
330 Blocks are firstly separated to continuous_blocks,
331 each continuous_block takes one line of file.
332 Sequence-to-block relation is stored as ManyToMany,
333 each pair <Block, Sequence> takes 1 line
334 """
335 column2pos = self.column2pos()
336 file.write("# Block|from|to(exclusive)|gaps\n")
337 for i, block in enumerate(blocks):
338 for cb in block.continuous_blocks():
339 p_from = column2pos[cb.columns[0]]
340 p_to = column2pos[cb.columns[-1]] + 1
341 width = p_to - p_from
342 gaps = width - len(cb.columns)
343 file.write("%(i)i|%(from)i|%(to)i|%(gaps)i\n" %\
344 {'i': i, 'from': p_from,
345 'to': p_to, 'gaps': gaps})
346 file.write("# Block|Sequence\n")
347 for i, block in enumerate(blocks):
348 for sequence in block.sequences:
349 file.write("%(i)i|%(sequence)s\n" %\
350 {'i': i, 'sequence': sequence.name})
352 def blocks_from_file(self, file):
353 """ Return blocks from file generated by blocks_to_file """
354 name2sequence = {}
355 for sequence in self.sequences:
356 name2sequence[sequence.name] = sequence
357 Block = self.types.Block
358 blocks = {} # {int: Block}
359 for line in file:
360 line = line.strip()
361 if not line or '#' in line:
362 continue
363 parts = line.split('|')
364 if len(parts) == 4:
365 i, pos_from, pos_to, gaps = parts
366 i = int(i)
367 pos_from = int(pos_from)
368 pos_to = int(pos_to)
369 if i not in blocks:
370 blocks[i] = Block.from_alignment(self,
371 sequences=[], columns=[])
372 for pos in range(pos_from, pos_to):
373 blocks[i].columns.append(self.columns[pos])
374 else:
375 i, sequence_name = parts
376 i = int(i)
377 sequence = name2sequence[sequence_name]
378 blocks[i].sequences.append(sequence)
379 blocks = [block for i, block in sorted(blocks.items())]
380 for block in blocks:
381 block_sequences = set(block.sequences)
382 columns = []
383 for column in block.columns:
384 if set(column.keys()) & block_sequences:
385 columns.append(column)
386 block.columns = columns
387 return blocks
389 def blocks_to_html(self, file, blocks, template):
390 """ Write convenient html-file for viewing blocks
392 template -- str with template of html
393 this template should contain substring 'self_js_text'
394 to be replaced with information about blocks
395 """
396 column2pos = self.column2pos()
397 js_blocks = []
398 js_fasta_dict = {}
399 for block in blocks:
400 p_from = column2pos[block.columns[0]]
401 p_to = column2pos[block.columns[-1]]
402 js_block = {}
403 if len(block.columns) == p_to + 1 - p_from:
404 # continuous block
405 js_block['start'] = p_from
406 js_block['end'] = p_to
407 else:
408 # not continuous block
409 js_block['positions'] = [column2pos[c] for c in block.columns]
410 js_block['IDs'] = []
411 for sequence in block.sequences:
412 js_block['IDs'].append(sequence.name)
413 js_block['cores'] = []
414 js_blocks.append(js_block)
415 def char(monomer):
416 if monomer:
417 return monomer.code1
418 return '-'
419 for row in self.rows_as_lists():
420 sequence = row.sequence
421 line = "".join(map(char, row))
422 js_fasta_dict[sequence.name] = line
423 self_js_test = ("blocks = json('%(blocks)s');" +\
424 "fasta_dict=json('%(fasta_dict)s');") %\
425 {'blocks': json.dumps(js_blocks),
426 'fasta_dict': json.dumps(js_fasta_dict)}
427 file.write(template.replace('self_js_text', self_js_test))
429 def blocks_to_homology(self, file, blocks):
430 """ Apply transitive closure of homology and save as homology classes
432 Add number markup to every sequence if it has not been added
433 """
434 for s in self.sequences:
435 s.add_markup('number', use_existing=True)
436 class_id = 1
437 column2blocks = {}
438 for column in self.columns:
439 column2blocks[column] = []
440 for block in blocks:
441 for column in block.columns:
442 column2blocks[column].append(block)
443 for i, column in enumerate(self.columns):
444 sequence_graph = Graph()
445 for block in column2blocks[column]:
446 for s1 in block.sequences:
447 for s2 in block.sequences:
448 if s1 is not s2:
449 sequence_graph.set_edge(s1, s2)
450 classes = sequence_graph.connected_components()
451 for s in column:
452 if s not in sequence_graph:
453 classes.append([s])
454 for c in classes:
455 monomer_ids = [(s.name, column[s].number) for s in c]
456 MonomerHomology.write_class(file, monomer_ids, class_id, i+1)
457 class_id += 1
459 def blocks_from_homology(self, homology, min_width=-1):
460 """ Return blocks extracted from homology object
462 * homology -- MonomerHomology object (must be marked up by numbers)
463 * min_width -- min width of block accepted (-1 = all)
464 """
465 result = []
466 Block = self.types.Block
467 name2sequence = {}
468 for sequence in self.sequences:
469 name2sequence[sequence.name] = sequence
470 for names, classes in homology.blocks_data:
471 columns = [self.columns[homology.columns[cl]-1] for cl in classes]
472 sequences = [name2sequence[name] for name in names]
473 if len(columns) >= min_width:
474 result.append(Block.from_alignment(self, sequences, columns))
475 return result
477 def nongap_columns(self):
478 """ return list of not pure gap columns """
479 self_sequences = set(self.sequences)
480 columns = []
481 for column in self.columns:
482 if set(column.keys()) & self_sequences:
483 columns.append(column)
484 return columns
486 def superimpose(self, gc, extra_columns=False):
487 """ Superimpose monomers from this block at gc positions
489 * gc -- collection of columns to use as for superimposition
490 * extra_columns -- allow columns with gaps or without structure
491 """
492 gc = list(gc)
493 if len(self.sequences) >= 1:
494 sequences = copy(self.sequences)
495 main_sequence = sequences.pop()
496 for sequence in sequences:
497 sequence.superimpose(main_sequence, gc, extra_columns=extra_columns)
499 def save_pdb(self, out_file):
500 """ Save all sequences
502 return {sequence: (new_chain, new_model)}
503 """
504 map = {}
505 chains = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
506 chain_index = -1
507 model = 0
508 for sequence in self.sequences:
509 chain_index += 1
510 if chain_index >= len(chains):
511 chain_index = 0
512 model += 1
513 chain = chains[chain_index]
514 sequence.save_pdb(out_file, chain, model)
515 map[sequence] = (chain, model)
516 return map
518 def blocks_to_pymol(self, file, blocks):
519 """ Create pymol script for viewing superimpositions by blocks
521 One sequence is selected to be the main sequence.
522 Other sequences are fitted to the main sequence, using all the columns
523 from any block, which contains both sequences.
525 All atoms, involved into self block, are shown with sticks,
526 transparency=0.6 (only backbone C, N and CA atoms).
527 Atoms, involved in blocks, are shown with cartoon.
529 TODO: models
530 """
531 def sequence_to_pdb(s):
532 return pdb_id_parse(s.pdb_chain.get_parent().get_parent().get_id())['code']
533 def monomer_id(m):
534 return m.pdb_residue.get_id()[1]
535 def sequence_line(s):
536 return "%(pdb)s_%(chain)s" % \
537 {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()}
538 def columns_line(columns, s):
539 groups = []
540 for c in sorted(monomer_id(c[s]) for c in columns if s in c):
541 if not groups or groups[-1][-1] != c - 1:
542 groups.append([c])
543 else:
544 groups[-1].append(c)
545 intervals = []
546 for group in groups:
547 if len(group) == 1:
548 intervals.append(str(group[0]))
549 else:
550 intervals.append(str(group[0]) + '-' + str(group[-1]))
551 return '((resi %(resi)s) and %(seq)s)' % \
552 {'resi': ','.join(intervals), 'seq': sequence_line(s)}
553 FITTABLE = " and name ca and alt a+''"
554 pdbs = set(sequence_to_pdb(s) for s in self.sequences)
555 file.write('# Command "by_block X" is defined to superimpose block X\n')
556 file.write('\n')
557 for pdb in pdbs:
558 file.write("fetch %(pdb)s, async=0\n" % {'pdb': pdb})
559 for s in self.sequences:
560 file.write("create %(pdb)s_%(chain)s,%(pdb)s and chain %(chain)s\n"\
561 % {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()})
562 for pdb in pdbs:
563 file.write("delete %(pdb)s\n" % {'pdb': pdb})
564 fit_to = self.sequences[0]
565 for i, block in enumerate(blocks):
566 subselections = []
567 for s in block.sequences:
568 subselections.append(columns_line(block.columns, s))
569 file.write("select block_%(i)i, %(sels)s\n" %
570 {'i': i, 'sels': ' or '.join(subselections)})
571 for fit_from in self.sequences[1:]:
572 columns = set()
573 for block in blocks:
574 if fit_to in block.sequences and fit_from in block.sequences:
575 columns |= set(col for col in block.columns
576 if fit_from in col and fit_to in col)
577 file.write("pair_fit %(fit_from)s, %(fit_to)s\n" %
578 {'fit_to': columns_line(columns, fit_to) + FITTABLE,
579 'fit_from': columns_line(columns, fit_from) + FITTABLE})
580 file.write("hide everything, all\n")
581 for s in self.sequences:
582 file.write("show sticks, %(seq)s and name c+n+ca\n" %
583 {'seq': columns_line(self.columns, s)})
584 for block in blocks:
585 for s in block.sequences:
586 file.write("show cartoon, %(seq)s\n" %
587 {'seq': columns_line(block.columns, s)})
588 file.write("set stick_transparency, 0.6, all\n")
589 file.write("orient visible\n")
590 block_to_seqs = {}
591 for i, block in enumerate(blocks):
592 block_to_seqs[i] = [sequence_line(s) for s in block.sequences]
593 file.write('''
594 python
595 from pymol import cmd
596 block_to_seqs = {block_to_seqs}
597 def by_block(i):
598 main_seq = block_to_seqs[i][0]
599 for seq in block_to_seqs[i][1:]:
600 template = "block_%(i)s and %(seq)s {FITTABLE}"
601 fit_to = template % {'i': i, 'seq': main_seq}
602 fit_from = template % {'i': i, 'seq': seq}
603 cmd.do("pair_fit %(fit_from)s, %(fit_to)s" % vars())
604 cmd.extend('by_block', by_block)
605 python end
606 '''
607 .replace('{block_to_seqs}', json.dumps(block_to_seqs))
608 .replace('{FITTABLE}', FITTABLE)
611 class BlockMixin(base.Block, AlignmentMixin):
612 """Mixin to add 3D properties to blocks.
614 Please note: since this is a mixin, no objects of this class should be
615 created. This class is intended to be subclassed together with one of
616 Block classes.
617 """
619 def add_surrounded_columns(self, column_set, test=None):
620 """ add to column_set non-gap columns surrounded by column_set
622 * test(block, column) -- function. If specified, column
623 is added only if test(...) == True
624 """
625 nongap_columns = self.nongap_columns()
626 for i, column in enumerate(nongap_columns):
627 if column not in column_set:
628 if not set(self.sequences) - set(column.keys()): # no gaps
629 if i > 0\
630 and nongap_columns[i-1] in column_set\
631 and i < len(nongap_columns)-1\
632 and nongap_columns[i+1] in column_set:
633 if not test or test(self, column):
634 column_set.add(column)
636 def geometrical_cores(self, max_delta=config.delta,
637 timeout=config.timeout, minsize=config.minsize,
638 ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count,
639 ignore_one_ss=True):
640 """ Return length-sorted list of GCs
641 GC is set of columns
643 * max_delta -- threshold of distance spreading
644 * timeout -- Bron-Kerbosch timeout (then fast O(n ln n) algorithm)
645 * minsize -- min size of each core
646 * ac_new_atoms -- min part or new atoms in new alternative core
647 current GC is compared with each of already selected GCs if
648 difference is less then ac_new_atoms, current GC is skipped
649 difference = part of new atoms in current core
650 * ac_count -- max number of cores (including main core)
651 0 means infinity
652 * ignore_one_ss -- ignore geometrical cores, owned by one secondary
653 structure element in every sequence
655 weight is calculated as 1 / (delta + 1)
656 delta in [0, +inf) => weight in (0, 1]
657 """
658 graph = Graph()
659 for i, column1 in enumerate(self.columns):
660 for j, column2 in enumerate(self.columns):
661 if i < j:
662 distances = []
663 for sequence in self.sequences:
664 if sequence in column1 and sequence in column2:
665 m1 = column1[sequence]
666 m2 = column2[sequence]
667 if hasattr(m1, 'ca_xyz') and hasattr(m2, 'ca_xyz'):
668 delta_vector = m1.ca_xyz - m2.ca_xyz
669 d = sum([C**2 for C in delta_vector.get_array()]) ** 0.5
670 distances.append(d)
671 if len(distances) == len(self.sequences):
672 delta = max(distances) - min(distances)
673 if delta <= max_delta:
674 graph.set_edge(column1, column2,
675 1.0 / (1.0 + delta))
676 cliques = graph.cliques(minsize=minsize, timeout=timeout)
677 GCs = []
678 for clique in cliques:
679 clique = set(clique)
680 self.add_surrounded_columns(clique, test=lambda b, c: \
681 all(hasattr(c[s], 'ca_xyz') \
682 for s in set(b.sequences) & set(c)))
683 for GC in GCs:
684 new_vertices = clique - GC
685 if len(new_vertices) < ac_new_atoms * len(clique):
686 break
687 else:
688 GCs.append(clique)
689 if ac_count != 0 and len(GCs) >= ac_count:
690 break
691 if ignore_one_ss:
692 for s in self.sequences:
693 s.add_markup('ss', use_existing=True)
694 def ss_is_ok(sequence, columns):
695 ss_types = set(c[sequence].ss for c in columns)
696 # is not pure alpha-helix or beta-strand
697 return ss_types != set('H') and ss_types != set('E')
698 def gc_is_ok(gc):
699 return any(ss_is_ok(s, gc) for s in self.sequences)
700 GCs = [gc for gc in GCs if gc_is_ok(gc)]
701 return GCs
703 def pair_core_parts(self, max_delta=config.delta,
704 timeout=config.timeout, min_width=config.min_width,
705 min_core_size=config.min_core_size, ignore_one_ss=True, join=True):
706 """ Return list of continuous parts of gc for each sequence pair
708 part is block
710 * join -- join parts from all alternative cores
711 """
712 column2pos = self.column2pos()
713 result = []
714 for i, seq1 in enumerate(self.sequences):
715 for j, seq2 in enumerate(self.sequences):
716 if i < j:
717 block = copy(self)
718 block.sequences = [seq1, seq2]
719 cores = block.geometrical_cores(max_delta=max_delta,
720 timeout=timeout, minsize=min_core_size,
721 ac_new_atoms=0.0, ac_count=0,
722 ignore_one_ss=ignore_one_ss)
723 parts = []
724 for core in cores:
725 core_block = copy(block)
726 core_block.columns = sorted(core,
727 key=column2pos.get)
728 parts += core_block.continuous_blocks(min_width)
729 if join:
730 columns = set()
731 for part in parts:
732 columns |= set(part.columns)
733 join_block = copy(block)
734 join_block.columns = sorted(columns,
735 key=column2pos.get)
736 result += join_block.continuous_blocks()
737 else:
738 result += parts
739 return result
741 def blocks3d(self, max_delta=config.delta,
742 timeout=config.timeout, timeout_2=config.timeout_2,
743 min_width=config.min_width, ignore_one_ss=True, primary_cliques=False,
744 parts=None):
745 """ Return length-sorted list of reliable blocks
747 * max_delta -- threshold of distance spreading (ignored if parts)
748 * timeout -- Bron-Kerbosch timeout (couple cores) (ignored if parts)
749 * timeout_2 -- Bron-Kerbosch timeout (blocks)
750 * min_width -- min width of each core (ignored if parts)
751 * ignore_one_ss -- ignore geometrical cores, owned by one secondary
752 structure element in every sequence (ignored if parts)
753 * primary_cliques -- if True, return cliques
754 instead of not-self-intersected blocks
755 * parts -- pre-calculated pair core parts (the method will change it)
756 """
757 result = []
758 # for sorting
759 column2pos = self.column2pos()
760 sequence2i = {}
761 for i, sequence in enumerate(self.sequences):
762 sequence2i[sequence] = i
763 graph = Graph()
764 monomer2column = {}
765 monomer2sequence = {}
766 for column in self.columns:
767 for sequence, monomer in column.items():
768 monomer2column[monomer] = column
769 monomer2sequence[monomer] = sequence
770 if parts is None:
771 parts = self.pair_core_parts(max_delta=max_delta, timeout=timeout,
772 min_width=min_width, ignore_one_ss=ignore_one_ss)
773 boundaries = set() # of Columns
774 for part in parts:
775 boundaries.add(part.columns[0])
776 boundaries.add(part.columns[-1])
777 for part in parts:
778 part.columns = sorted(set(part.columns) & boundaries,
779 key=column2pos.get)
780 monomers = part.monomers()
781 for m1 in monomers:
782 for m2 in monomers:
783 if m1 is not m2:
784 graph.set_edge(m1, m2, 1.0)
785 cliques = graph.cliques(minsize=4, timeout=timeout_2)
786 clique_blocks = []
787 # expand cliques to rectangles
788 for clique in cliques:
789 sequences = set(monomer2sequence[m] for m in clique)
790 columns = set(monomer2column[m] for m in clique)
791 columns = sorted(columns, key=column2pos.get)
792 first_column_i = column2pos[columns[0]]
793 last_column_i = column2pos[columns[-1]]
794 block = copy(self)
795 block.columns = self.columns[first_column_i:last_column_i+1]
796 block.sequences = sequences
797 clique_blocks.append(block)
798 def cmp_blocks(block1, block2):
799 height1 = len(block1.sequences)
800 height2 = len(block2.sequences)
801 if cmp(height1, height2):
802 return cmp(height1, height2)
803 width1 = len(block1.columns)
804 width2 = len(block2.columns)
805 return cmp(width1, width2)
806 # sort blocks
807 clique_blocks.sort(cmp=cmp_blocks, reverse=True)
808 if primary_cliques:
809 return clique_blocks
810 used_monomers = set()
811 for clique_block in clique_blocks:
812 monomers = set(clique_block.monomers())
813 monomers -= used_monomers
814 result_len = len(result)
815 while monomers:
816 sequences = set(monomer2sequence[m] for m in monomers)
817 columns = set(monomer2column[m] for m in monomers)
818 sequences = sorted(sequences, key=sequence2i.get)
819 columns = sorted(columns, key=column2pos.get)
821 block = copy(self)
822 block.columns = columns
823 block.sequences = sequences
824 while len(sequences) > 1:
825 filled_columns = []
826 for c in columns:
827 for s in sequences:
828 if s not in c or c[s] not in monomers:
829 break
830 else:
831 filled_columns.append(c)
832 block = copy(self)
833 block.columns = filled_columns
834 block.sequences = sequences
835 # FIXME min_width is not needed
836 parts = block.continuous_blocks(min_width)
837 if parts:
838 for part in parts:
839 used_monomers |= set(part.monomers())
840 monomers -= set(part.monomers())
841 result.append(part)
842 break
843 else:
844 #remove sequence
845 number_of_columns = {}
846 for sequence in sequences:
847 number_of_columns[sequence] =\
848 len(set(sequence) & monomers)
849 _rev = dict([(v,k) for (k,v) in number_of_columns.items()])
850 victim = _rev[min(_rev.keys())]
851 sequences.remove(sequence)
852 if len(result) == result_len:
853 break
854 else:
855 result_len = len(result)
856 return result
858 def continuous_blocks(self, min_width=1):
859 """ Return list of continued blocks """
860 result = []
861 self_columns = set(self.columns)
862 self_sequences = set(self.sequences)
863 columns_batch = []
864 for column in self.alignment.columns:
865 if column in self_columns:
866 columns_batch.append(column)
867 elif not set(column.keys()) & self_sequences:
868 pass # pure gap (inside block) column
869 elif len(columns_batch) >= min_width:
870 block = copy(self)
871 block.columns = columns_batch
872 columns_batch = []
873 result.append(block)
874 else:
875 columns_batch = []
876 if len(columns_batch) >= min_width:
877 block = copy(self)
878 block.columns = columns_batch
879 result.append(block)
880 return result
882 def monomers(self):
883 """ Return list of all monomers in this block """
884 result = []
885 for column in self.columns:
886 for sequence in self.sequences:
887 if sequence in column:
888 result.append(column[sequence])
889 return result
891 class _SequenceSecondaryStructureMarkup(base.SequenceMarkup):
892 """ WARNING! WARNING! WARNING!
893 This class will soon be eliminated and replaced with totally incompatible one.
894 Only use on your own risk!
896 Secondary structure markup for sequence.
898 Depends on dsspcmbi program.
899 Sequence should be structure.SequenceMixin, pdb should be loaded.
900 Note that DSSP cannot handle mutiple models!
901 Note that dssp executable name is hardcoded (=dsspcmbi).
902 Note that new version of dssp can not be used, so use DSSPold.
904 Notation:
905 * H -- alpha-helix
906 * B -- Isolated beta-bridge residue
907 * E -- Strand
908 * G -- 3-10 helix
909 * I -- pi-helix
910 * T -- Turn
911 * S -- Bend
912 * - -- Other
913 """
915 name = 'ss'
916 io_class = 'SequenceSecondaryStructureMarkup'
918 def refresh(self):
919 chain = self.sequence.pdb_chain
920 model = chain.get_parent()
921 pdb_file = NamedTemporaryFile(delete=False)
922 self.sequence.save_pdb(pdb_file)
923 pdb_file.close()
924 with Silence(dup="stderr"):
925 dssp=DSSP(model, pdb_file.name, dssp='dsspcmbi')
926 for monomer in self.sequence:
927 try:
928 monomer.ss = dssp[(chain.get_id(), monomer.pdb_residue.get_id())][1]
929 except:
930 monomer.ss = '?' # FIXME
931 os.unlink(pdb_file.name)
932 markups.SequenceSecondaryStructureMarkup = _SequenceSecondaryStructureMarkup
933 markups.update()
935 # vim: set ts=4 sts=4 sw=4 et: