view allpy/structure.py @ 1168:b556c96c6719
blocks3d/www Makefile: never check certificates of github, they are too confusing for wget
author |
Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
date |
Mon, 26 May 2014 17:20:29 +0400 |
parents |
205cd762fd3b |
children |
|
line source
1 """ Functions to get pdb information from fasta id
2 and to generate fasta id from pdb information
4 pdb information: code, chain, model
6 TODO: same for local pdb files
15 from cStringIO import StringIO
17 from Bio.PDB import PDBParser
18 from Bio.PDB import Select
19 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO
20 from Bio.PDB import DSSP
26 from graph import Graph
27 from homology import MonomerHomology
28 from util import Silence
30 from tempfile import NamedTemporaryFile
34 pdb_file_pattern = re.compile(r"^file:(?P<path>[^:]+)(:(?P<chain>[0-9a-z ])(:(?P<model>[0-9]{1,3})(:(?P<resi_begin>-?\d+)(:(?P<resi_end>-?\d+))?)?)?)?", re.I)
37 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)([^a-z0-9/](?P<model>[0-9]{1,3}))?)?(/(?P<resi_begin>-?\d+):(?P<resi_end>-?\d+))?", re.I)
# NOTE(review): fragment of pdb_id_parse(ID) -- the `def` line (orig 38-39)
# and several interior lines were omitted by the hgweb extraction (original
# line numbers are not contiguous); comments below are hedged accordingly.
# Local files are addressed as "file:<path>[:<chain>[:<model>[:range]]]".
40 if ID.startswith('file:'):
41 match = pdb_file_pattern.search(ID)
# presumably the else-branch: plain PDB ids are matched with re1
43 match = re1.search(ID)
# d is presumably match.groupdict(); defaults filled for absent fields
47 if 'chain' not in d or not d['chain']:
49 if 'model' not in d or not d['model']:
# model number and residue-range bounds are converted from str to int
51 d['model'] = int(d['model'])
53 d['resi_begin'] = int(d['resi_begin'])
55 d['resi_end'] = int(d['resi_end'])
58 def get_structure(file, name):
# Parse `file` (an open PDB file handle) with Bio.PDB's PDBParser and
# return the resulting Structure object named `name`.
# NOTE(review): original line 59 is omitted by the extraction.
60 return PDBParser().get_structure(name, file)
62 class DownloadPdb(object):
63 """ Functor downloading pdb from web """
64 def __init__(self, pdb_url=config.pdb_url):
65 """ pdb_url -- format string, template of url """
66 self.pdb_url = pdb_url
67 def __call__(self, code):
# Fetch the entry `code` from self.pdb_url (a %-format template) and
# return the open handle from urllib2.urlopen.
# NOTE(review): original lines 68 and 70 are omitted by the extraction.
69 url = self.pdb_url % code
71 print "Download %s" % url
72 return urllib2.urlopen(url)
# module-level default downloader instance
73 download_pdb = DownloadPdb()
75 class CachedDownloadPdb(object):
# Like DownloadPdb, but keeps a copy of each fetched entry in
# `cache_dir` and (presumably) serves later requests from disk.
76 def __init__(self, pdb_url=config.pdb_url, cache_dir='pdb_cache', save=True):
77 self.pdb_url = pdb_url
78 self.cache_dir=cache_dir
79 self.download_pdb = DownloadPdb(pdb_url=self.pdb_url)
# NOTE(review): orig line 80 is omitted; presumably `self.save = save`,
# since __call__ reads self.save below.
81 def __call__(self, code):
# Return a file-like object with the PDB entry `code`, downloading
# and (when self.save) caching it under cache_dir on first use.
83 path = os.path.join(self.cache_dir, code + '.pdb')
84 if os.path.exists(path):
# (orig line 85 omitted: presumably returns the cached copy here)
86 in_file = self.download_pdb(code)
87 if self.save and not os.path.exists(path):
89 if not os.path.exists(self.cache_dir):
90 os.mkdir(self.cache_dir)
91 with open(path, 'w') as out_file:
# (orig lines 92-100 omitted: presumably copy in_file to out_file
# and return the data)
101 cached_download_pdb = CachedDownloadPdb()
103 class SequenceMixin(base.Sequence):
104 """Mixin for adding PDB data to a Sequence.
106 Please note: since this is a mixin, no objects of this class should be
107 created. This class is intended to be subclassed together with one of
112 * pdb_chain -- Bio.PDB.Chain
114 Monomers from this sequence get attribute "pdb_residue", "ca_xyz"
116 ?TODO: global pdb_structures
119 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0,
121 min_continuous_match=config.min_continuous_match,
122 min_match=config.min_match,
123 max_waste_in_pdb=config.max_waste_in_pdb,
124 resi_begin=None, resi_end=None):
125 """ Read Pdb chain from file
127 and align each Monomer with PDB.Residue
129 * xyz_only -- if True PDB.Residue objects are not stores
130 * min_continuous_match
133 * resi_begin and resi_end -- identifiers of first and last pdb
134 residues of residue range to be used. If only one is provided,
135 another parameter is set to first or last residue of chain.
136 They can exceed first and last pdb residues of pdb chain.
137 They can be integers or biopythons identifiers of residues.
138 Id of residue is (hetero_flag, sequence_id, insertion_code),
139 see biopdb documentation for more information.
141 The sequence is aligned with pdb_sequence using muscle.
142 The alignment is split into continuous_blocks
143 of min length _min_continuous_match_.
144 Monomers from continuous_blocks are used to get pdb information.
145 The number of them should be more than _min_match_.
146 The number of other monomers with structure from pdb_sequence
147 should be less than _max_waste_in_pdb_.
151 structure = get_structure(pdb_file, self.name)
152 chain = structure[pdb_model][pdb_chain]
154 self.pdb_chain = chain
155 pdb_sequence = self.__class__.from_pdb_chain(chain)
157 if isinstance(resi_begin, int):
158 resi_begin = (None, resi_begin, None)
159 while pdb_sequence[0].pdb_residue.get_id()[1] < resi_begin[1]:
162 if isinstance(resi_end, int):
163 resi_end = (None, resi_end, None)
164 while pdb_sequence[-1].pdb_residue.get_id()[1] > resi_end[1]:
166 Alignment = self.types.Alignment
168 a.append_sequence(self)
169 a.append_sequence(pdb_sequence)
170 a.realign(processors.Muscle())
171 Block = self.types.Block
172 matches = Block.from_alignment(a, columns=[])
173 for column in a.columns:
174 if self in column and pdb_sequence in column:
175 monomer = column[self]
176 pdb_monomer = column[pdb_sequence]
177 if hasattr(pdb_monomer, 'pdb_residue'):
178 if monomer == pdb_monomer:
179 matches.columns.append(column)
180 match_blocks = matches.continuous_blocks(min_continuous_match)
181 for block in match_blocks:
182 for monomer, pdb_monomer in block.columns_as_lists():
183 monomer.ca_xyz = pdb_monomer.pdb_residue['CA'].get_vector()
185 monomer.pdb_residue = pdb_monomer.pdb_residue
187 for block in match_blocks:
188 for column in block.columns:
189 matched_pdb.append(column[pdb_sequence])
190 central_start = matched_pdb[0]
191 central_stop = matched_pdb[-1]
194 for m in pdb_sequence:
195 if m is central_start:
197 if started and hasattr(m, 'pdb_residue'):
198 central_pdb.append(m)
199 if m is central_stop:
201 assert len(set(central_pdb) - set(matched_pdb)) <= max_waste_in_pdb,\
202 "too much waste in matched pdb (%i > %i)" %\
203 (len(set(central_pdb) - set(matched_pdb)), max_waste_in_pdb)
204 assert len(matched_pdb) >= min_match,\
205 "too short match (%i < %i)" % (len(matched_pdb), min_match)
207 def pdb_unload(self):
208 """ Delete all pdb-connected links """
# NOTE(review): orig lines 209-210 and 213-214 are omitted; presumably a
# loop over this sequence's monomers, also dropping ca_xyz / pdb_chain.
211 if hasattr(monomer, 'pdb_residue'):
212 delattr(monomer, 'pdb_residue')
215 def from_pdb_chain(cls, chain):
216 """ Return Sequence with Monomers with link to Bio.PDB.Residue
218 chain is Bio.PDB.Chain
# Alternate constructor (classmethod decorator line omitted, orig 214?).
# Build peptides from C-alpha atoms: one Monomer per residue that has a
# CA atom, each keeping a reference to its Bio.PDB.Residue.
220 cappbuilder = CaPPBuilder()
221 peptides = cappbuilder.build_peptides(chain)
223 sequence = Sequence()
224 sequence.pdb_chain = chain
225 for peptide in peptides:
226 for ca_atom in peptide.get_ca_list():
227 residue = ca_atom.get_parent()
228 monomer = sequence.append_monomer(code3=residue.get_resname())
229 monomer.pdb_residue = residue
# NOTE(review): the final `return sequence` (orig ~230) is omitted by the
# extraction.
232 def auto_pdb(self, conformity=None, pdb_getter=download_pdb, xyz_only=False,
233 resi_begin=None, resi_end=None, read_local=True):
234 """ Add pdb information to each monomer
236 * read_local -- if this method is allowed to read a local file
237 (for sequence names like "file:...")
239 raise AssertionError if info has not been successfully added
241 TODO: conformity_file
# Parse pdb code/chain/model (and optional residue range) out of this
# sequence's name, obtain the file, then delegate to set_pdb_chain.
243 match = pdb_id_parse(self.name)
244 assert match, "Can't parse seq id: %s" % self.name
245 if 'path' in match and read_local:
247 pdb_file = open(match['path'])
# (orig lines 246, 248-249 omitted: presumably set `code` and the
# else-branch that downloads via pdb_getter)
250 pdb_file = pdb_getter(code)
251 chain = match['chain']
252 model = match['model']
# explicit arguments take precedence over the range parsed from the name
253 resi_begin = resi_begin or match['resi_begin']
254 resi_end = resi_end or match['resi_end']
255 self.set_pdb_chain(pdb_file, code, chain, model, xyz_only=xyz_only,
256 resi_begin=resi_begin, resi_end=resi_end)
258 def superimpose(self, main_sequence, gc, extra_columns=False):
259 """ Superimpose structure of this to main_sequence
261 * gc -- collection of columns to use as for superimposition
262 * extra_columns -- allow columns with gaps or without structure
# Collect paired CA atoms from columns where both sequences carry
# structure, fit them with Bio.PDB's Superimposer, apply the transform
# to this chain's atoms, then refresh the cached ca_xyz vectors.
267 if main_sequence in column and self in column:
268 if hasattr(column[main_sequence], 'pdb_residue') \
269 and hasattr(column[self], 'pdb_residue'):
270 fixed_gc.append(column[main_sequence].pdb_residue['CA'])
271 moving_gc.append(column[self].pdb_residue['CA'])
277 sup.set_atoms(fixed_gc, moving_gc)
278 moving = self.pdb_chain.get_atoms()
# (orig lines 279-280 omitted: presumably sup.apply(moving) and the
# loop header over this sequence's monomers)
281 if hasattr(monomer, 'pdb_residue'):
282 monomer.ca_xyz = monomer.pdb_residue['CA'].get_vector()
284 def save_pdb(self, out_filename, new_chain=None, new_model=None):
285 """ Save pdb_chain to out_file """
# NOTE(review): despite the name, out_filename is used as an OPEN file
# object (it is .write()-n below) -- confirm against callers.
286 chain = self.pdb_chain
# (orig 287-288 omitted: presumably remember old_chain / guard new_chain)
289 chain.id = new_chain # change private member
290 model = chain.get_parent()
# Select restricting PDBIO output to exactly this chain and model.
291 class MySelect(Select):
292 def accept_chain(myselect, tested_chain):
293 return tested_chain is chain
294 def accept_model(myselect, tested_model):
295 return tested_model is model
298 structure = model.get_parent()
299 io.set_structure(structure)
300 io.save(temp, MySelect())
# Re-number the model: strip the MODEL/ENDMDL records PDBIO wrote and
# wrap the output in our own when new_model is given.
301 if new_model is not None:
302 out_filename.write("%-80s\n" % ("MODEL %4i" % new_model))
305 if not line.startswith('MODEL') and not line.startswith('ENDMDL'):
306 out_filename.write(line)
308 if new_model is not None:
309 out_filename.write("%-80s\n" % "ENDMDL")
# restore the chain id mutated above
310 self.pdb_chain.id = old_chain
312 class AlignmentMixin(base.Alignment):
313 """Mixin to add 3D properties to alignments.
315 Please note: since this is a mixin, no objects of this class should be
316 created. This class is intended to be subclassed together with one of
320 def column2pos(self):
321 """ Return {column: pos_n} """
# Maps each column object to its 0-based position in self.columns.
323 for i, column in enumerate(self.columns):
# (orig lines 322, 324-326 omitted: dict initialization, the per-column
# assignment and the final return)
327 def blocks_to_file(self, file, blocks):
328 """ Write blocks to text |-separated file
330 Blocks are firstly separated to continuous_blocks,
331 each continuous_block takes one line of file.
332 Sequence-to-block relation is stored as ManyToMany,
333 each pair <Block, Sequence> takes 1 line
335 column2pos = self.column2pos()
336 file.write("# Block|from|to(exclusive)|gaps\n")
337 for i, block in enumerate(blocks):
338 for cb in block.continuous_blocks():
339 p_from = column2pos[cb.columns[0]]
340 p_to = column2pos[cb.columns[-1]] + 1
341 width = p_to - p_from
342 gaps = width - len(cb.columns)
343 file.write("%(i)i|%(from)i|%(to)i|%(gaps)i\n" %\
344 {'i': i, 'from': p_from,
345 'to': p_to, 'gaps': gaps})
346 file.write("# Block|Sequence\n")
347 for i, block in enumerate(blocks):
348 for sequence in block.sequences:
349 file.write("%(i)i|%(sequence)s\n" %\
350 {'i': i, 'sequence': sequence.name})
352 def blocks_from_file(self, file):
353 """ Return blocks from file generated by blocks_to_file """
355 for sequence in self.sequences:
356 name2sequence[sequence.name] = sequence
357 Block = self.types.Block
358 blocks = {} # {int: Block}
361 if not line or '#' in line:
363 parts = line.split('|')
365 i, pos_from, pos_to, gaps = parts
367 pos_from = int(pos_from)
370 blocks[i] = Block.from_alignment(self,
371 sequences=[], columns=[])
372 for pos in range(pos_from, pos_to):
373 blocks[i].columns.append(self.columns[pos])
375 i, sequence_name = parts
377 sequence = name2sequence[sequence_name]
378 blocks[i].sequences.append(sequence)
379 blocks = [block for i, block in sorted(blocks.items())]
381 block_sequences = set(block.sequences)
383 for column in block.columns:
384 if set(column.keys()) & block_sequences:
385 columns.append(column)
386 block.columns = columns
389 def blocks_to_html(self, file, blocks, template):
390 """ Write convenient html-file for viewing blocks
392 template -- str with template of html
393 this template should contain substring 'self_js_text'
394 to be replaced with information about blocks
396 column2pos = self.column2pos()
400 p_from = column2pos[block.columns[0]]
401 p_to = column2pos[block.columns[-1]]
403 if len(block.columns) == p_to + 1 - p_from:
405 js_block['start'] = p_from
406 js_block['end'] = p_to
408 # not continuous block
409 js_block['positions'] = [column2pos[c] for c in block.columns]
411 for sequence in block.sequences:
412 js_block['IDs'].append(sequence.name)
413 js_block['cores'] = []
414 js_blocks.append(js_block)
419 for row in self.rows_as_lists():
420 sequence = row.sequence
421 line = "".join(map(char, row))
422 js_fasta_dict[sequence.name] = line
423 self_js_test = ("blocks = json('%(blocks)s');" +\
424 "fasta_dict=json('%(fasta_dict)s');") %\
425 {'blocks': json.dumps(js_blocks),
426 'fasta_dict': json.dumps(js_fasta_dict)}
427 file.write(template.replace('self_js_text', self_js_test))
429 def blocks_to_homology(self, file, blocks):
430 """ Apply transitive closure of homology and save as homology classes
432 Add number markup to every sequence if it has not been added
434 for s in self.sequences:
435 s.add_markup('number', use_existing=True)
438 for column in self.columns:
439 column2blocks[column] = []
441 for column in block.columns:
442 column2blocks[column].append(block)
443 for i, column in enumerate(self.columns):
444 sequence_graph = Graph()
445 for block in column2blocks[column]:
446 for s1 in block.sequences:
447 for s2 in block.sequences:
449 sequence_graph.set_edge(s1, s2)
450 classes = sequence_graph.connected_components()
452 if s not in sequence_graph:
455 monomer_ids = [(s.name, column[s].number) for s in c]
456 MonomerHomology.write_class(file, monomer_ids, class_id, i+1)
459 def blocks_from_homology(self, homology, min_width=-1):
460 """ Return blocks extracted from homology object
462 * homology -- MonomerHomology object (must be marked up by numbers)
463 * min_width -- min width of block accepted (-1 = all)
466 Block = self.types.Block
468 for sequence in self.sequences:
469 name2sequence[sequence.name] = sequence
470 for names, classes in homology.blocks_data:
471 columns = [self.columns[homology.columns[cl]-1] for cl in classes]
472 sequences = [name2sequence[name] for name in names]
473 if len(columns) >= min_width:
474 result.append(Block.from_alignment(self, sequences, columns))
477 def nongap_columns(self):
478 """ return list of not pure gap columns """
# A column is kept iff at least one of this object's sequences has a
# monomer in it (column keys are sequences).
479 self_sequences = set(self.sequences)
481 for column in self.columns:
482 if set(column.keys()) & self_sequences:
483 columns.append(column)
# (orig lines 480 and 484-485 omitted: `columns = []` initialization
# and the return)
486 def superimpose(self, gc, extra_columns=False):
487 """ Superimpose monomers from this block at gc positions
489 * gc -- collection of columns to use as for superimposition
490 * extra_columns -- allow columns with gaps or without structure
# The last sequence is used as the fixed reference; every other
# sequence is fitted onto it via SequenceMixin.superimpose.
493 if len(self.sequences) >= 1:
494 sequences = copy(self.sequences)
495 main_sequence = sequences.pop()
496 for sequence in sequences:
497 sequence.superimpose(main_sequence, gc, extra_columns=extra_columns)
499 def save_pdb(self, out_file):
500 """ Save all sequences
502 return {sequence: (new_chain, new_model)}
505 chains = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
508 for sequence in self.sequences:
510 if chain_index >= len(chains):
513 chain = chains[chain_index]
514 sequence.save_pdb(out_file, chain, model)
515 map[sequence] = (chain, model)
518 def blocks_to_pymol(self, file, blocks):
519 """ Create pymol script for viewing superimpositions by blocks
521 One sequence is selected to be the main sequence.
522 Other sequences are fitted to the main sequence, using all the columns
523 from any block, which contains both sequences.
525 All atoms, involved into self block, are shown with sticks,
526 transparency=0.6 (only backbone C, N and CA atoms).
527 Atoms, involved in blocks, are shown with cartoon.
531 def sequence_to_pdb(s):
532 return pdb_id_parse(s.pdb_chain.get_parent().get_parent().get_id())['code']
534 return m.pdb_residue.get_id()[1]
535 def sequence_line(s):
536 return "%(pdb)s_%(chain)s" % \
537 {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()}
538 def columns_line(columns, s):
540 for c in sorted(monomer_id(c[s]) for c in columns if s in c):
541 if not groups or groups[-1][-1] != c - 1:
548 intervals.append(str(group[0]))
550 intervals.append(str(group[0]) + '-' + str(group[-1]))
551 return '((resi %(resi)s) and %(seq)s)' % \
552 {'resi': ','.join(intervals), 'seq': sequence_line(s)}
553 FITTABLE = " and name ca and alt a+''"
554 pdbs = set(sequence_to_pdb(s) for s in self.sequences)
555 file.write('# Command "by_block X" is defined to superimpose block X\n')
558 file.write("fetch %(pdb)s, async=0\n" % {'pdb': pdb})
559 for s in self.sequences:
560 file.write("create %(pdb)s_%(chain)s,%(pdb)s and chain %(chain)s\n"\
561 % {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()})
563 file.write("delete %(pdb)s\n" % {'pdb': pdb})
564 fit_to = self.sequences[0]
565 for i, block in enumerate(blocks):
567 for s in block.sequences:
568 subselections.append(columns_line(block.columns, s))
569 file.write("select block_%(i)i, %(sels)s\n" %
570 {'i': i, 'sels': ' or '.join(subselections)})
571 for fit_from in self.sequences[1:]:
574 if fit_to in block.sequences and fit_from in block.sequences:
575 columns |= set(col for col in block.columns
576 if fit_from in col and fit_to in col)
577 file.write("pair_fit %(fit_from)s, %(fit_to)s\n" %
578 {'fit_to': columns_line(columns, fit_to) + FITTABLE,
579 'fit_from': columns_line(columns, fit_from) + FITTABLE})
580 file.write("hide everything, all\n")
581 for s in self.sequences:
582 file.write("show sticks, %(seq)s and name c+n+ca\n" %
583 {'seq': columns_line(self.columns, s)})
585 for s in block.sequences:
586 file.write("show cartoon, %(seq)s\n" %
587 {'seq': columns_line(block.columns, s)})
588 file.write("set stick_transparency, 0.6, all\n")
589 file.write("orient visible\n")
591 for i, block in enumerate(blocks):
592 block_to_seqs[i] = [sequence_line(s) for s in block.sequences]
595 from pymol import cmd
596 block_to_seqs = {block_to_seqs}
598 main_seq = block_to_seqs[i][0]
599 for seq in block_to_seqs[i][1:]:
600 template = "block_%(i)s and %(seq)s {FITTABLE}"
601 fit_to = template % {'i': i, 'seq': main_seq}
602 fit_from = template % {'i': i, 'seq': seq}
603 cmd.do("pair_fit %(fit_from)s, %(fit_to)s" % vars())
604 cmd.extend('by_block', by_block)
607 .replace('{block_to_seqs}', json.dumps(block_to_seqs))
608 .replace('{FITTABLE}', FITTABLE)
611 class BlockMixin(base.Block, AlignmentMixin):
612 """Mixin to add 3D properties to blocks.
614 Please note: since this is a mixin, no objects of this class should be
615 created. This class is intended to be subclassed together with one of
619 def add_surrounded_columns(self, column_set, test=None):
620 """ add to column_set non-gap columns surrounded by column_set
622 * test(block, column) -- function. If specified, column
623 is added only if test(...) == True
# A candidate column must have no gaps in this block's sequences and
# both of its non-gap neighbours must already be in column_set.
625 nongap_columns = self.nongap_columns()
626 for i, column in enumerate(nongap_columns):
627 if column not in column_set:
628 if not set(self.sequences) - set(column.keys()): # no gaps
# (orig line 629 omitted: presumably the `if i > 0\` boundary guard
# opening the continued condition below)
630 and nongap_columns[i-1] in column_set\
631 and i < len(nongap_columns)-1\
632 and nongap_columns[i+1] in column_set:
633 if not test or test(self, column):
634 column_set.add(column)
636 def geometrical_cores(self, max_delta=config.delta,
637 timeout=config.timeout, minsize=config.minsize,
638 ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count,
640 """ Return length-sorted list of GCs
643 * max_delta -- threshold of distance spreading
644 * timeout -- Bron-Kerbosch timeout (then fast O(n ln n) algorithm)
645 * minsize -- min size of each core
646 * ac_new_atoms -- min part or new atoms in new alternative core
647 current GC is compared with each of already selected GCs if
648 difference is less then ac_new_atoms, current GC is skipped
649 difference = part of new atoms in current core
650 * ac_count -- max number of cores (including main core)
652 * ignore_one_ss -- ignore geometrical cores, owned by one secondary
653 structure element in every sequence
655 weight is calculated as 1 / (delta + 1)
656 delta in [0, +inf) => weight in (0, 1]
659 for i, column1 in enumerate(self.columns):
660 for j, column2 in enumerate(self.columns):
663 for sequence in self.sequences:
664 if sequence in column1 and sequence in column2:
665 m1 = column1[sequence]
666 m2 = column2[sequence]
667 if hasattr(m1, 'ca_xyz') and hasattr(m2, 'ca_xyz'):
668 delta_vector = m1.ca_xyz - m2.ca_xyz
669 d = sum([C**2 for C in delta_vector.get_array()]) ** 0.5
671 if len(distances) == len(self.sequences):
672 delta = max(distances) - min(distances)
673 if delta <= max_delta:
674 graph.set_edge(column1, column2,
676 cliques = graph.cliques(minsize=minsize, timeout=timeout)
678 for clique in cliques:
680 self.add_surrounded_columns(clique, test=lambda b, c: \
681 all(hasattr(c[s], 'ca_xyz') \
682 for s in set(b.sequences) & set(c)))
684 new_vertices = clique - GC
685 if len(new_vertices) < ac_new_atoms * len(clique):
689 if ac_count != 0 and len(GCs) >= ac_count:
692 for s in self.sequences:
693 s.add_markup('ss', use_existing=True)
def ss_is_ok(sequence, columns):
    """Return True unless the secondary structure of `sequence` over
    `columns` is a pure alpha-helix ('H') or a pure beta-strand ('E')."""
    observed = set(column[sequence].ss for column in columns)
    return not (observed == set('H') or observed == set('E'))
699 return any(ss_is_ok(s, gc) for s in self.sequences)
700 GCs = [gc for gc in GCs if gc_is_ok(gc)]
703 def pair_core_parts(self, max_delta=config.delta,
704 timeout=config.timeout, min_width=config.min_width,
705 min_core_size=config.min_core_size, ignore_one_ss=True, join=True):
706 """ Return list of continuous parts of gc for each sequence pair
710 * join -- join parts from all alternative cores
712 column2pos = self.column2pos()
714 for i, seq1 in enumerate(self.sequences):
715 for j, seq2 in enumerate(self.sequences):
718 block.sequences = [seq1, seq2]
719 cores = block.geometrical_cores(max_delta=max_delta,
720 timeout=timeout, minsize=min_core_size,
721 ac_new_atoms=0.0, ac_count=0,
722 ignore_one_ss=ignore_one_ss)
725 core_block = copy(block)
726 core_block.columns = sorted(core,
728 parts += core_block.continuous_blocks(min_width)
732 columns |= set(part.columns)
733 join_block = copy(block)
734 join_block.columns = sorted(columns,
736 result += join_block.continuous_blocks()
741 def blocks3d(self, max_delta=config.delta,
742 timeout=config.timeout, timeout_2=config.timeout_2,
743 min_width=config.min_width, ignore_one_ss=True, primary_cliques=False,
745 """ Return length-sorted list of reliable blocks
747 * max_delta -- threshold of distance spreading (ignored if parts)
748 * timeout -- Bron-Kerbosch timeout (couple cores) (ignored if parts)
749 * timeout_2 -- Bron-Kerbosch timeout (blocks)
750 * min_width -- min width of each core (ignored if parts)
751 * ignore_one_ss -- ignore geometrical cores, owned by one secondary
752 structure element in every sequence (ignored if parts)
753 * primary_cliques -- if True, return cliques
754 instead of not-self-intersected blocks
755 * parts -- pre-calculated pair core parts (the method will change it)
759 column2pos = self.column2pos()
761 for i, sequence in enumerate(self.sequences):
762 sequence2i[sequence] = i
765 monomer2sequence = {}
766 for column in self.columns:
767 for sequence, monomer in column.items():
768 monomer2column[monomer] = column
769 monomer2sequence[monomer] = sequence
771 parts = self.pair_core_parts(max_delta=max_delta, timeout=timeout,
772 min_width=min_width, ignore_one_ss=ignore_one_ss)
773 boundaries = set() # of Columns
775 boundaries.add(part.columns[0])
776 boundaries.add(part.columns[-1])
778 part.columns = sorted(set(part.columns) & boundaries,
780 monomers = part.monomers()
784 graph.set_edge(m1, m2, 1.0)
785 cliques = graph.cliques(minsize=4, timeout=timeout_2)
787 # expand cliques to rectangles
788 for clique in cliques:
789 sequences = set(monomer2sequence[m] for m in clique)
790 columns = set(monomer2column[m] for m in clique)
791 columns = sorted(columns, key=column2pos.get)
792 first_column_i = column2pos[columns[0]]
793 last_column_i = column2pos[columns[-1]]
795 block.columns = self.columns[first_column_i:last_column_i+1]
796 block.sequences = sequences
797 clique_blocks.append(block)
def cmp_blocks(block1, block2):
    """Order blocks by height (number of sequences) first, then by
    width (number of columns), with Python 2 cmp() semantics."""
    by_height = cmp(len(block1.sequences), len(block2.sequences))
    if by_height:
        return by_height
    return cmp(len(block1.columns), len(block2.columns))
807 clique_blocks.sort(cmp=cmp_blocks, reverse=True)
810 used_monomers = set()
811 for clique_block in clique_blocks:
812 monomers = set(clique_block.monomers())
813 monomers -= used_monomers
814 result_len = len(result)
816 sequences = set(monomer2sequence[m] for m in monomers)
817 columns = set(monomer2column[m] for m in monomers)
818 sequences = sorted(sequences, key=sequence2i.get)
819 columns = sorted(columns, key=column2pos.get)
822 block.columns = columns
823 block.sequences = sequences
824 while len(sequences) > 1:
828 if s not in c or c[s] not in monomers:
831 filled_columns.append(c)
833 block.columns = filled_columns
834 block.sequences = sequences
835 # FIXME min_width is not needed
836 parts = block.continuous_blocks(min_width)
839 used_monomers |= set(part.monomers())
840 monomers -= set(part.monomers())
845 number_of_columns = {}
846 for sequence in sequences:
847 number_of_columns[sequence] =\
848 len(set(sequence) & monomers)
849 _rev = dict([(v,k) for (k,v) in number_of_columns.items()])
850 victim = _rev[min(_rev.keys())]
851 sequences.remove(sequence)
852 if len(result) == result_len:
855 result_len = len(result)
858 def continuous_blocks(self, min_width=1):
859 """ Return list of continued blocks """
# Walk alignment columns in order, batching consecutive columns that
# belong to this block; pure-gap columns do not break a batch.
# Batches of at least min_width columns become new blocks.
861 self_columns = set(self.columns)
862 self_sequences = set(self.sequences)
864 for column in self.alignment.columns:
865 if column in self_columns:
866 columns_batch.append(column)
867 elif not set(column.keys()) & self_sequences:
868 pass # pure gap (inside block) column
869 elif len(columns_batch) >= min_width:
# (orig line 870 omitted: presumably creates the block for the batch)
871 block.columns = columns_batch
# flush the trailing batch after the loop (orig 872-875, 877, 879-881
# omitted: batch reset, block creation, result collection, return)
876 if len(columns_batch) >= min_width:
878 block.columns = columns_batch
883 """ Return list of all monomers in this block """
885 for column in self.columns:
886 for sequence in self.sequences:
887 if sequence in column:
888 result.append(column[sequence])
891 class _SequenceSecondaryStructureMarkup(base.SequenceMarkup):
892 """ WARNING! WARNING! WARNING!
893 This class will soon be eliminated and replaced with totally incompatible one.
894 Only use on your own risk!
896 Secondary structure markup for sequence.
898 Depends on dsspcmbi program.
899 Sequence should be structure.SequenceMixin, pdb should be loaded.
900 Note that DSSP cannot handle multiple models!
901 Note that dssp executable name is hardcoded (=dsspcmbi).
902 Note that new version of dssp can not be used, so use DSSPold.
906 * B -- Isolated beta-bridge residue
916 io_class = 'SequenceSecondaryStructureMarkup'
# refresh(): dump the sequence's chain to a temporary pdb file, run DSSP
# on it, and attach a one-letter ss code to every monomer.
# NOTE(review): the `def refresh(self):` line (orig 917-918) is omitted
# by the extraction.
919 chain = self.sequence.pdb_chain
920 model = chain.get_parent()
921 pdb_file = NamedTemporaryFile(delete=False)
922 self.sequence.save_pdb(pdb_file)
# silence dssp's stderr chatter
924 with Silence(dup="stderr"):
925 dssp=DSSP(model, pdb_file.name, dssp='dsspcmbi')
926 for monomer in self.sequence:
# (orig line 927 omitted: presumably `try:` around the dssp lookup,
# with the except-branch at orig 929 assigning the fallback below)
928 monomer.ss = dssp[(chain.get_id(), monomer.pdb_residue.get_id())][1]
930 monomer.ss = '?' # FIXME
931 os.unlink(pdb_file.name)
# register as the public markup class
932 markups.SequenceSecondaryStructureMarkup = _SequenceSecondaryStructureMarkup
935 # vim: set ts=4 sts=4 sw=4 et: