allpy: b556c96c6719 allpy/structure.py

allpy

view allpy/structure.py @ 1168:b556c96c6719

blocks3d/www Makefile: never check certificates of github, they are too confusing for wget

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Mon, 26 May 2014 17:20:29 +0400
parents	205cd762fd3b
children

line source

1 """ Functions to get pdb information from fasta id

2 and to generate fasta id from pdb information

4 pdb information: code, chain, model

6 TODO: same for local pdb files

7 """

9 import re

10 import os

11 import os.path

12 import urllib2

13 from copy import copy

14 import json

15 from cStringIO import StringIO

17 from Bio.PDB import PDBParser

18 from Bio.PDB import Select

19 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO

20 from Bio.PDB import DSSP

22 import base

23 import markups

24 import processors

25 import config

26 from graph import Graph

27 from homology import MonomerHomology

28 from util import Silence

30 from tempfile import NamedTemporaryFile

33 # for pdb-codes

34 pdb_file_pattern = re.compile(r"^file:(?P<path>[^:]+)(:(?P<chain>[0-9a-z ])(:(?P<model>[0-9]{1,3})(:(?P<resi_begin>-?\d+)(:(?P<resi_end>-?\d+))?)?)?)?", re.I)

36 # for pdb-codes

37 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)([^a-z0-9/](?P<model>[0-9]{1,3}))?)?(/(?P<resi_begin>-?\d+):(?P<resi_end>-?\d+))?", re.I)

39 def pdb_id_parse(ID):

40 if ID.startswith('file:'):

41 match = pdb_file_pattern.search(ID)

42 else:

43 match = re1.search(ID)

44 if not match:

45 return None

46 d = match.groupdict()

47 if 'chain' not in d or not d['chain']:

48 d['chain'] = ' '

49 if 'model' not in d or not d['model']:

50 d['model'] = 0

51 d['model'] = int(d['model'])

52 if d['resi_begin']:

53 d['resi_begin'] = int(d['resi_begin'])

54 if d['resi_end']:

55 d['resi_end'] = int(d['resi_end'])

56 return d

58 def get_structure(file, name):

59 with Silence():

60 return PDBParser().get_structure(name, file)

62 class DownloadPdb(object):

63 """ Functor downloading pdb from web """

64 def __init__(self, pdb_url=config.pdb_url):

65 """ pdb_url -- format string, template of url """

66 self.pdb_url = pdb_url

67 def __call__(self, code):

68 code = code.lower()

69 url = self.pdb_url % code

70 if config.debug:

71 print "Download %s" % url

72 return urllib2.urlopen(url)

73 download_pdb = DownloadPdb()

75 class CachedDownloadPdb(object):

76 def __init__(self, pdb_url=config.pdb_url, cache_dir='pdb_cache', save=True):

77 self.pdb_url = pdb_url

78 self.cache_dir=cache_dir

79 self.download_pdb = DownloadPdb(pdb_url=self.pdb_url)

80 self.save = save

81 def __call__(self, code):

82 code = code.lower()

83 path = os.path.join(self.cache_dir, code + '.pdb')

84 if os.path.exists(path):

85 return open(path)

86 in_file = self.download_pdb(code)

87 if self.save and not os.path.exists(path):

88 try:

89 if not os.path.exists(self.cache_dir):

90 os.mkdir(self.cache_dir)

91 with open(path, 'w') as out_file:

92 s = in_file.read()

93 out_file.write(s)

94 in_file = StringIO(s)

95 except:

96 try:

97 os.unlink(path)

98 except:

99 pass

100 return in_file

101 cached_download_pdb = CachedDownloadPdb()

102

103 class SequenceMixin(base.Sequence):

104 """Mixin for adding PDB data to a Sequence.

105

106 Please note: since this is a mixin, no objects of this class should be

107 created. This class is intended to be subclassed together with one of

108 Sequence classes.

109

110 Attributes:

111

112 * pdb_chain -- Bio.PDB.Chain

113

114 Monomers from this sequence get attribute "pdb_residue", "ca_xyz"

115

116 ?TODO: global pdb_structures

117 """

118

119 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0,

120 xyz_only=False,

121 min_continuous_match=config.min_continuous_match,

122 min_match=config.min_match,

123 max_waste_in_pdb=config.max_waste_in_pdb,

124 resi_begin=None, resi_end=None):

125 """ Read Pdb chain from file

126

127 and align each Monomer with PDB.Residue

128

129 * xyz_only -- if True PDB.Residue objects are not stores

130 * min_continuous_match

131 * min_match

132 * max_waste_in_pdb

133 * resi_begin and resi_end -- identifiers of first and last pdb

134 residues of residue range to be used. If only one is provided,

135 another parameter is set to first or last residue of chain.

136 They can exceed first and last pdb residues of pdb chain.

137 They can be integers or biopythons identifiers of residues.

138 Id of residue is (hetero_flag, sequence_id, insertion_code),

139 see biopdb documantation for more information.

140

141 The sequence is aligned with pdb_sequence using muscle.

142 The alignment is splitted to continuous_blocks

143 of min length _min_continuous_match_.

144 Monomers from continuous_blocks are used to get pdb information.

145 The number of them should be more than _min_match_.

146 The number of other monomers with structure from pdb_sequence

147 should be less than _max_waste_in_pdb_.

148

149 raise AssertionError

150 """

151 structure = get_structure(pdb_file, self.name)

152 chain = structure[pdb_model][pdb_chain]

153 if not xyz_only:

154 self.pdb_chain = chain

155 pdb_sequence = self.__class__.from_pdb_chain(chain)

156 if resi_begin:

157 if isinstance(resi_begin, int):

158 resi_begin = (None, resi_begin, None)

159 while pdb_sequence[0].pdb_residue.get_id()[1] < resi_begin[1]:

160 pdb_sequence.pop(0)

161 if resi_end:

162 if isinstance(resi_end, int):

163 resi_end = (None, resi_end, None)

164 while pdb_sequence[-1].pdb_residue.get_id()[1] > resi_end[1]:

165 pdb_sequence.pop()

166 Alignment = self.types.Alignment

167 a = Alignment()

168 a.append_sequence(self)

169 a.append_sequence(pdb_sequence)

170 a.realign(processors.Muscle())

171 Block = self.types.Block

172 matches = Block.from_alignment(a, columns=[])

173 for column in a.columns:

174 if self in column and pdb_sequence in column:

175 monomer = column[self]

176 pdb_monomer = column[pdb_sequence]

177 if hasattr(pdb_monomer, 'pdb_residue'):

178 if monomer == pdb_monomer:

179 matches.columns.append(column)

180 match_blocks = matches.continuous_blocks(min_continuous_match)

181 for block in match_blocks:

182 for monomer, pdb_monomer in block.columns_as_lists():

183 monomer.ca_xyz = pdb_monomer.pdb_residue['CA'].get_vector()

184 if not xyz_only:

185 monomer.pdb_residue = pdb_monomer.pdb_residue

186 matched_pdb = []

187 for block in match_blocks:

188 for column in block.columns:

189 matched_pdb.append(column[pdb_sequence])

190 central_start = matched_pdb[0]

191 central_stop = matched_pdb[-1]

192 central_pdb = []

193 started = False

194 for m in pdb_sequence:

195 if m is central_start:

196 started = True

197 if started and hasattr(m, 'pdb_residue'):

198 central_pdb.append(m)

199 if m is central_stop:

200 break

201 assert len(set(central_pdb) - set(matched_pdb)) <= max_waste_in_pdb,\

202 "too much waste in matched pdb (%i > %i)" %\

203 (len(set(central_pdb) - set(matched_pdb)), max_waste_in_pdb)

204 assert len(matched_pdb) >= min_match,\

205 "too short match (%i < %i)" % (len(matched_pdb), min_match)

206

207 def pdb_unload(self):

208 """ Delete all pdb-connected links """

209 del self.pdb_chain

210 for monomer in self:

211 if hasattr(monomer, 'pdb_residue'):

212 delattr(monomer, 'pdb_residue')

213

214 @classmethod

215 def from_pdb_chain(cls, chain):

216 """ Return Sequence with Monomers with link to Bio.PDB.Residue

217

218 chain is Bio.PDB.Chain

219 """

220 cappbuilder = CaPPBuilder()

221 peptides = cappbuilder.build_peptides(chain)

222 Sequence = cls

223 sequence = Sequence()

224 sequence.pdb_chain = chain

225 for peptide in peptides:

226 for ca_atom in peptide.get_ca_list():

227 residue = ca_atom.get_parent()

228 monomer = sequence.append_monomer(code3=residue.get_resname())

229 monomer.pdb_residue = residue

230 return sequence

231

232 def auto_pdb(self, conformity=None, pdb_getter=download_pdb, xyz_only=False,

233 resi_begin=None, resi_end=None, read_local=True):

234 """ Add pdb information to each monomer

235

236 * read_local -- if this method is allowed to read a local file

237 (for sequence names like "file:...")

238

239 raise AssertionError if info has not been successfully added

240

241 TODO: conformity_file

242 """

243 match = pdb_id_parse(self.name)

244 assert match, "Can't parse seq id: %s" % self.name

245 if 'path' in match and read_local:

246 code = 'path'

247 pdb_file = open(match['path'])

248 else:

249 code = match['code']

250 pdb_file = pdb_getter(code)

251 chain = match['chain']

252 model = match['model']

253 resi_begin = resi_begin or match['resi_begin']

254 resi_end = resi_end or match['resi_end']

255 self.set_pdb_chain(pdb_file, code, chain, model, xyz_only=xyz_only,

256 resi_begin=resi_begin, resi_end=resi_end)

257

258 def superimpose(self, main_sequence, gc, extra_columns=False):

259 """ Superimpose structure of this to main_sequence

260

261 * gc -- collection of columns to use as for superimposition

262 * extra_columns -- allow columns with gaps or without structure

263 """

264 fixed_gc = []

265 moving_gc = []

266 for column in gc:

267 if main_sequence in column and self in column:

268 if hasattr(column[main_sequence], 'pdb_residue') \

269 and hasattr(column[self], 'pdb_residue'):

270 fixed_gc.append(column[main_sequence].pdb_residue['CA'])

271 moving_gc.append(column[self].pdb_residue['CA'])

272 else:

273 assert extra_columns

274 else:

275 assert extra_columns

276 sup = Superimposer()

277 sup.set_atoms(fixed_gc, moving_gc)

278 moving = self.pdb_chain.get_atoms()

279 sup.apply(moving)

280 for monomer in self:

281 if hasattr(monomer, 'pdb_residue'):

282 monomer.ca_xyz = monomer.pdb_residue['CA'].get_vector()

283

284 def save_pdb(self, out_filename, new_chain=None, new_model=None):

285 """ Save pdb_chain to out_file """

286 chain = self.pdb_chain

287 old_chain = chain.id

288 if new_chain:

289 chain.id = new_chain # change private member

290 model = chain.get_parent()

291 class MySelect(Select):

292 def accept_chain(myselect, tested_chain):

293 return tested_chain is chain

294 def accept_model(myselect, tested_model):

295 return tested_model is model

296 temp = StringIO()

297 io = PDBIO()

298 structure = model.get_parent()

299 io.set_structure(structure)

300 io.save(temp, MySelect())

301 if new_model is not None:

302 out_filename.write("%-80s\n" % ("MODEL %4i" % new_model))

303 temp.seek(0)

304 for line in temp:

305 if not line.startswith('MODEL') and not line.startswith('ENDMDL'):

306 out_filename.write(line)

307 temp.close()

308 if new_model is not None:

309 out_filename.write("%-80s\n" % "ENDMDL")

310 self.pdb_chain.id = old_chain

311

312 class AlignmentMixin(base.Alignment):

313 """Mixin to add 3D properties to alignments.

314

315 Please note: since this is a mixin, no objects of this class should be

316 created. This class is intended to be subclassed together with one of

317 Alignment classes.

318 """

319

320 def column2pos(self):

321 """ Return {column: pos_n} """

322 c2p = {}

323 for i, column in enumerate(self.columns):

324 c2p[column] = i

325 return c2p

326

327 def blocks_to_file(self, file, blocks):

328 """ Write blocks to text |-separated file

329

330 Blocks are firstly separated to continuous_blocks,

331 each continuous_block takes one line of file.

332 Sequence-to-block relation is stored as ManyToMany,

333 each pair <Block, Sequence> takes 1 line

334 """

335 column2pos = self.column2pos()

336 file.write("# Block|from|to(exclusive)|gaps\n")

337 for i, block in enumerate(blocks):

338 for cb in block.continuous_blocks():

339 p_from = column2pos[cb.columns[0]]

340 p_to = column2pos[cb.columns[-1]] + 1

341 width = p_to - p_from

342 gaps = width - len(cb.columns)

343 file.write("%(i)i|%(from)i|%(to)i|%(gaps)i\n" %\

344 {'i': i, 'from': p_from,

345 'to': p_to, 'gaps': gaps})

346 file.write("# Block|Sequence\n")

347 for i, block in enumerate(blocks):

348 for sequence in block.sequences:

349 file.write("%(i)i|%(sequence)s\n" %\

350 {'i': i, 'sequence': sequence.name})

351

352 def blocks_from_file(self, file):

353 """ Return blocks from file generated by blocks_to_file """

354 name2sequence = {}

355 for sequence in self.sequences:

356 name2sequence[sequence.name] = sequence

357 Block = self.types.Block

358 blocks = {} # {int: Block}

359 for line in file:

360 line = line.strip()

361 if not line or '#' in line:

362 continue

363 parts = line.split('|')

364 if len(parts) == 4:

365 i, pos_from, pos_to, gaps = parts

366 i = int(i)

367 pos_from = int(pos_from)

368 pos_to = int(pos_to)

369 if i not in blocks:

370 blocks[i] = Block.from_alignment(self,

371 sequences=[], columns=[])

372 for pos in range(pos_from, pos_to):

373 blocks[i].columns.append(self.columns[pos])

374 else:

375 i, sequence_name = parts

376 i = int(i)

377 sequence = name2sequence[sequence_name]

378 blocks[i].sequences.append(sequence)

379 blocks = [block for i, block in sorted(blocks.items())]

380 for block in blocks:

381 block_sequences = set(block.sequences)

382 columns = []

383 for column in block.columns:

384 if set(column.keys()) & block_sequences:

385 columns.append(column)

386 block.columns = columns

387 return blocks

388

389 def blocks_to_html(self, file, blocks, template):

390 """ Write convenient html-file for viewing blocks

391

392 template -- str with template of html

393 this template should contain substring 'self_js_text'

394 to be replaced with information about blocks

395 """

396 column2pos = self.column2pos()

397 js_blocks = []

398 js_fasta_dict = {}

399 for block in blocks:

400 p_from = column2pos[block.columns[0]]

401 p_to = column2pos[block.columns[-1]]

402 js_block = {}

403 if len(block.columns) == p_to + 1 - p_from:

404 # continuous block

405 js_block['start'] = p_from

406 js_block['end'] = p_to

407 else:

408 # not continuous block

409 js_block['positions'] = [column2pos[c] for c in block.columns]

410 js_block['IDs'] = []

411 for sequence in block.sequences:

412 js_block['IDs'].append(sequence.name)

413 js_block['cores'] = []

414 js_blocks.append(js_block)

415 def char(monomer):

416 if monomer:

417 return monomer.code1

418 return '-'

419 for row in self.rows_as_lists():

420 sequence = row.sequence

421 line = "".join(map(char, row))

422 js_fasta_dict[sequence.name] = line

423 self_js_test = ("blocks = json('%(blocks)s');" +\

424 "fasta_dict=json('%(fasta_dict)s');") %\

425 {'blocks': json.dumps(js_blocks),

426 'fasta_dict': json.dumps(js_fasta_dict)}

427 file.write(template.replace('self_js_text', self_js_test))

428

429 def blocks_to_homology(self, file, blocks):

430 """ Apply transitive closure of homology and save as homology classes

431

432 Add number markup to every sequence if it has not been added

433 """

434 for s in self.sequences:

435 s.add_markup('number', use_existing=True)

436 class_id = 1

437 column2blocks = {}

438 for column in self.columns:

439 column2blocks[column] = []

440 for block in blocks:

441 for column in block.columns:

442 column2blocks[column].append(block)

443 for i, column in enumerate(self.columns):

444 sequence_graph = Graph()

445 for block in column2blocks[column]:

446 for s1 in block.sequences:

447 for s2 in block.sequences:

448 if s1 is not s2:

449 sequence_graph.set_edge(s1, s2)

450 classes = sequence_graph.connected_components()

451 for s in column:

452 if s not in sequence_graph:

453 classes.append([s])

454 for c in classes:

455 monomer_ids = [(s.name, column[s].number) for s in c]

456 MonomerHomology.write_class(file, monomer_ids, class_id, i+1)

457 class_id += 1

458

459 def blocks_from_homology(self, homology, min_width=-1):

460 """ Return blocks extracted from homology object

461

462 * homology -- MonomerHomology object (must be marked up by numbers)

463 * min_width -- min width of block accepted (-1 = all)

464 """

465 result = []

466 Block = self.types.Block

467 name2sequence = {}

468 for sequence in self.sequences:

469 name2sequence[sequence.name] = sequence

470 for names, classes in homology.blocks_data:

471 columns = [self.columns[homology.columns[cl]-1] for cl in classes]

472 sequences = [name2sequence[name] for name in names]

473 if len(columns) >= min_width:

474 result.append(Block.from_alignment(self, sequences, columns))

475 return result

476

477 def nongap_columns(self):

478 """ return list of not pure gap columns """

479 self_sequences = set(self.sequences)

480 columns = []

481 for column in self.columns:

482 if set(column.keys()) & self_sequences:

483 columns.append(column)

484 return columns

485

486 def superimpose(self, gc, extra_columns=False):

487 """ Superimpose monomers from this block at gc positions

488

489 * gc -- collection of columns to use as for superimposition

490 * extra_columns -- allow columns with gaps or without structure

491 """

492 gc = list(gc)

493 if len(self.sequences) >= 1:

494 sequences = copy(self.sequences)

495 main_sequence = sequences.pop()

496 for sequence in sequences:

497 sequence.superimpose(main_sequence, gc, extra_columns=extra_columns)

498

499 def save_pdb(self, out_file):

500 """ Save all sequences

501

502 return {sequence: (new_chain, new_model)}

503 """

504 map = {}

505 chains = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

506 chain_index = -1

507 model = 0

508 for sequence in self.sequences:

509 chain_index += 1

510 if chain_index >= len(chains):

511 chain_index = 0

512 model += 1

513 chain = chains[chain_index]

514 sequence.save_pdb(out_file, chain, model)

515 map[sequence] = (chain, model)

516 return map

517

518 def blocks_to_pymol(self, file, blocks):

519 """ Create pymol script for viewing superimpositions by blocks

520

521 One sequence is selected to be the main sequence.

522 Other sequences are fitted to the main sequence, using all the columns

523 from any block, which contains both sequences.

524

525 All atoms, involved into self block, are shown with sticks,

526 transparency=0.6 (only backbone C, N and CA atoms).

527 Atoms, involved in blocks, are shown with cartoon.

528

529 TODO: models

530 """

531 def sequence_to_pdb(s):

532 return pdb_id_parse(s.pdb_chain.get_parent().get_parent().get_id())['code']

533 def monomer_id(m):

534 return m.pdb_residue.get_id()[1]

535 def sequence_line(s):

536 return "%(pdb)s_%(chain)s" % \

537 {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()}

538 def columns_line(columns, s):

539 groups = []

540 for c in sorted(monomer_id(c[s]) for c in columns if s in c):

541 if not groups or groups[-1][-1] != c - 1:

542 groups.append([c])

543 else:

544 groups[-1].append(c)

545 intervals = []

546 for group in groups:

547 if len(group) == 1:

548 intervals.append(str(group[0]))

549 else:

550 intervals.append(str(group[0]) + '-' + str(group[-1]))

551 return '((resi %(resi)s) and %(seq)s)' % \

552 {'resi': ','.join(intervals), 'seq': sequence_line(s)}

553 FITTABLE = " and name ca and alt a+''"

554 pdbs = set(sequence_to_pdb(s) for s in self.sequences)

555 file.write('# Command "by_block X" is defined to superimpose block X\n')

556 file.write('\n')

557 for pdb in pdbs:

558 file.write("fetch %(pdb)s, async=0\n" % {'pdb': pdb})

559 for s in self.sequences:

560 file.write("create %(pdb)s_%(chain)s,%(pdb)s and chain %(chain)s\n"\

561 % {'pdb': sequence_to_pdb(s), 'chain': s.pdb_chain.get_id()})

562 for pdb in pdbs:

563 file.write("delete %(pdb)s\n" % {'pdb': pdb})

564 fit_to = self.sequences[0]

565 for i, block in enumerate(blocks):

566 subselections = []

567 for s in block.sequences:

568 subselections.append(columns_line(block.columns, s))

569 file.write("select block_%(i)i, %(sels)s\n" %

570 {'i': i, 'sels': ' or '.join(subselections)})

571 for fit_from in self.sequences[1:]:

572 columns = set()

573 for block in blocks:

574 if fit_to in block.sequences and fit_from in block.sequences:

575 columns |= set(col for col in block.columns

576 if fit_from in col and fit_to in col)

577 file.write("pair_fit %(fit_from)s, %(fit_to)s\n" %

578 {'fit_to': columns_line(columns, fit_to) + FITTABLE,

579 'fit_from': columns_line(columns, fit_from) + FITTABLE})

580 file.write("hide everything, all\n")

581 for s in self.sequences:

582 file.write("show sticks, %(seq)s and name c+n+ca\n" %

583 {'seq': columns_line(self.columns, s)})

584 for block in blocks:

585 for s in block.sequences:

586 file.write("show cartoon, %(seq)s\n" %

587 {'seq': columns_line(block.columns, s)})

588 file.write("set stick_transparency, 0.6, all\n")

589 file.write("orient visible\n")

590 block_to_seqs = {}

591 for i, block in enumerate(blocks):

592 block_to_seqs[i] = [sequence_line(s) for s in block.sequences]

593 file.write('''

594 python

595 from pymol import cmd

596 block_to_seqs = {block_to_seqs}

597 def by_block(i):

598 main_seq = block_to_seqs[i][0]

599 for seq in block_to_seqs[i][1:]:

600 template = "block_%(i)s and %(seq)s {FITTABLE}"

601 fit_to = template % {'i': i, 'seq': main_seq}

602 fit_from = template % {'i': i, 'seq': seq}

603 cmd.do("pair_fit %(fit_from)s, %(fit_to)s" % vars())

604 cmd.extend('by_block', by_block)

605 python end

606 '''

607 .replace('{block_to_seqs}', json.dumps(block_to_seqs))

608 .replace('{FITTABLE}', FITTABLE)

609 )

610

611 class BlockMixin(base.Block, AlignmentMixin):

612 """Mixin to add 3D properties to blocks.

613

614 Please note: since this is a mixin, no objects of this class should be

615 created. This class is intended to be subclassed together with one of

616 Block classes.

617 """

618

619 def add_surrounded_columns(self, column_set, test=None):

620 """ add to column_set non-gap columns surrounded by column_set

621

622 * test(block, column) -- function. If specified, column

623 is added only if test(...) == True

624 """

625 nongap_columns = self.nongap_columns()

626 for i, column in enumerate(nongap_columns):

627 if column not in column_set:

628 if not set(self.sequences) - set(column.keys()): # no gaps

629 if i > 0\

630 and nongap_columns[i-1] in column_set\

631 and i < len(nongap_columns)-1\

632 and nongap_columns[i+1] in column_set:

633 if not test or test(self, column):

634 column_set.add(column)

635

636 def geometrical_cores(self, max_delta=config.delta,

637 timeout=config.timeout, minsize=config.minsize,

638 ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count,

639 ignore_one_ss=True):

640 """ Return length-sorted list of GCs

641 GC is set of columns

642

643 * max_delta -- threshold of distance spreading

644 * timeout -- Bron-Kerbosch timeout (then fast O(n ln n) algorithm)

645 * minsize -- min size of each core

646 * ac_new_atoms -- min part or new atoms in new alternative core

647 current GC is compared with each of already selected GCs if

648 difference is less then ac_new_atoms, current GC is skipped

649 difference = part of new atoms in current core

650 * ac_count -- max number of cores (including main core)

651 0 means infinity

652 * ignore_one_ss -- ignore geometrical cores, owned by one secondary

653 structure element in every sequence

654

655 weight is calculated as 1 / (delta + 1)

656 delta in [0, +inf) => weight in (0, 1]

657 """

658 graph = Graph()

659 for i, column1 in enumerate(self.columns):

660 for j, column2 in enumerate(self.columns):

661 if i < j:

662 distances = []

663 for sequence in self.sequences:

664 if sequence in column1 and sequence in column2:

665 m1 = column1[sequence]

666 m2 = column2[sequence]

667 if hasattr(m1, 'ca_xyz') and hasattr(m2, 'ca_xyz'):

668 delta_vector = m1.ca_xyz - m2.ca_xyz

669 d = sum([C**2 for C in delta_vector.get_array()]) ** 0.5

670 distances.append(d)

671 if len(distances) == len(self.sequences):

672 delta = max(distances) - min(distances)

673 if delta <= max_delta:

674 graph.set_edge(column1, column2,

675 1.0 / (1.0 + delta))

676 cliques = graph.cliques(minsize=minsize, timeout=timeout)

677 GCs = []

678 for clique in cliques:

679 clique = set(clique)

680 self.add_surrounded_columns(clique, test=lambda b, c: \

681 all(hasattr(c[s], 'ca_xyz') \

682 for s in set(b.sequences) & set(c)))

683 for GC in GCs:

684 new_vertices = clique - GC

685 if len(new_vertices) < ac_new_atoms * len(clique):

686 break

687 else:

688 GCs.append(clique)

689 if ac_count != 0 and len(GCs) >= ac_count:

690 break

691 if ignore_one_ss:

692 for s in self.sequences:

693 s.add_markup('ss', use_existing=True)

694 def ss_is_ok(sequence, columns):

695 ss_types = set(c[sequence].ss for c in columns)

696 # is not pure alpha-helix or beta-strand

697 return ss_types != set('H') and ss_types != set('E')

698 def gc_is_ok(gc):

699 return any(ss_is_ok(s, gc) for s in self.sequences)

700 GCs = [gc for gc in GCs if gc_is_ok(gc)]

701 return GCs

702

703 def pair_core_parts(self, max_delta=config.delta,

704 timeout=config.timeout, min_width=config.min_width,

705 min_core_size=config.min_core_size, ignore_one_ss=True, join=True):

706 """ Return list of continuous parts of gc for each sequence pair

707

708 part is block

709

710 * join -- join parts from all alternative cores

711 """

712 column2pos = self.column2pos()

713 result = []

714 for i, seq1 in enumerate(self.sequences):

715 for j, seq2 in enumerate(self.sequences):

716 if i < j:

717 block = copy(self)

718 block.sequences = [seq1, seq2]

719 cores = block.geometrical_cores(max_delta=max_delta,

720 timeout=timeout, minsize=min_core_size,

721 ac_new_atoms=0.0, ac_count=0,

722 ignore_one_ss=ignore_one_ss)

723 parts = []

724 for core in cores:

725 core_block = copy(block)

726 core_block.columns = sorted(core,

727 key=column2pos.get)

728 parts += core_block.continuous_blocks(min_width)

729 if join:

730 columns = set()

731 for part in parts:

732 columns |= set(part.columns)

733 join_block = copy(block)

734 join_block.columns = sorted(columns,

735 key=column2pos.get)

736 result += join_block.continuous_blocks()

737 else:

738 result += parts

739 return result

740

741 def blocks3d(self, max_delta=config.delta,

742 timeout=config.timeout, timeout_2=config.timeout_2,

743 min_width=config.min_width, ignore_one_ss=True, primary_cliques=False,

744 parts=None):

745 """ Return length-sorted list of reliable blocks

746

747 * max_delta -- threshold of distance spreading (ignored if parts)

748 * timeout -- Bron-Kerbosch timeout (couple cores) (ignored if parts)

749 * timeout_2 -- Bron-Kerbosch timeout (blocks)

750 * min_width -- min width of each core (ignored if parts)

751 * ignore_one_ss -- ignore geometrical cores, owned by one secondary

752 structure element in every sequence (ignored if parts)

753 * primary_cliques -- if True, return cliques

754 instead of not-self-intersected blocks

755 * parts -- pre-calculated pair core parts (the method will change it)

756 """

757 result = []

758 # for sorting

759 column2pos = self.column2pos()

760 sequence2i = {}

761 for i, sequence in enumerate(self.sequences):

762 sequence2i[sequence] = i

763 graph = Graph()

764 monomer2column = {}

765 monomer2sequence = {}

766 for column in self.columns:

767 for sequence, monomer in column.items():

768 monomer2column[monomer] = column

769 monomer2sequence[monomer] = sequence

770 if parts is None:

771 parts = self.pair_core_parts(max_delta=max_delta, timeout=timeout,

772 min_width=min_width, ignore_one_ss=ignore_one_ss)

773 boundaries = set() # of Columns

774 for part in parts:

775 boundaries.add(part.columns[0])

776 boundaries.add(part.columns[-1])

777 for part in parts:

778 part.columns = sorted(set(part.columns) & boundaries,

779 key=column2pos.get)

780 monomers = part.monomers()

781 for m1 in monomers:

782 for m2 in monomers:

783 if m1 is not m2:

784 graph.set_edge(m1, m2, 1.0)

785 cliques = graph.cliques(minsize=4, timeout=timeout_2)

786 clique_blocks = []

787 # expand cliques to rectangles

788 for clique in cliques:

789 sequences = set(monomer2sequence[m] for m in clique)

790 columns = set(monomer2column[m] for m in clique)

791 columns = sorted(columns, key=column2pos.get)

792 first_column_i = column2pos[columns[0]]

793 last_column_i = column2pos[columns[-1]]

794 block = copy(self)

795 block.columns = self.columns[first_column_i:last_column_i+1]

796 block.sequences = sequences

797 clique_blocks.append(block)

798 def cmp_blocks(block1, block2):

799 height1 = len(block1.sequences)

800 height2 = len(block2.sequences)

801 if cmp(height1, height2):

802 return cmp(height1, height2)

803 width1 = len(block1.columns)

804 width2 = len(block2.columns)

805 return cmp(width1, width2)

806 # sort blocks

807 clique_blocks.sort(cmp=cmp_blocks, reverse=True)

808 if primary_cliques:

809 return clique_blocks

810 used_monomers = set()

811 for clique_block in clique_blocks:

812 monomers = set(clique_block.monomers())

813 monomers -= used_monomers

814 result_len = len(result)

815 while monomers:

816 sequences = set(monomer2sequence[m] for m in monomers)

817 columns = set(monomer2column[m] for m in monomers)

818 sequences = sorted(sequences, key=sequence2i.get)

819 columns = sorted(columns, key=column2pos.get)

820

821 block = copy(self)

822 block.columns = columns

823 block.sequences = sequences

824 while len(sequences) > 1:

825 filled_columns = []

826 for c in columns:

827 for s in sequences:

828 if s not in c or c[s] not in monomers:

829 break

830 else:

831 filled_columns.append(c)

832 block = copy(self)

833 block.columns = filled_columns

834 block.sequences = sequences

835 # FIXME min_width is not needed

836 parts = block.continuous_blocks(min_width)

837 if parts:

838 for part in parts:

839 used_monomers |= set(part.monomers())

840 monomers -= set(part.monomers())

841 result.append(part)

842 break

843 else:

844 #remove sequence

845 number_of_columns = {}

846 for sequence in sequences:

847 number_of_columns[sequence] =\

848 len(set(sequence) & monomers)

849 _rev = dict([(v,k) for (k,v) in number_of_columns.items()])

850 victim = _rev[min(_rev.keys())]

851 sequences.remove(sequence)

852 if len(result) == result_len:

853 break

854 else:

855 result_len = len(result)

856 return result

857

858 def continuous_blocks(self, min_width=1):

859 """ Return list of continued blocks """

860 result = []

861 self_columns = set(self.columns)

862 self_sequences = set(self.sequences)

863 columns_batch = []

864 for column in self.alignment.columns:

865 if column in self_columns:

866 columns_batch.append(column)

867 elif not set(column.keys()) & self_sequences:

868 pass # pure gap (inside block) column

869 elif len(columns_batch) >= min_width:

870 block = copy(self)

871 block.columns = columns_batch

872 columns_batch = []

873 result.append(block)

874 else:

875 columns_batch = []

876 if len(columns_batch) >= min_width:

877 block = copy(self)

878 block.columns = columns_batch

879 result.append(block)

880 return result

881

882 def monomers(self):

883 """ Return list of all monomers in this block """

884 result = []

885 for column in self.columns:

886 for sequence in self.sequences:

887 if sequence in column:

888 result.append(column[sequence])

889 return result

890

891 class _SequenceSecondaryStructureMarkup(base.SequenceMarkup):

892 """ WARNING! WARNING! WARNING!

893 This class will soon be eliminated and replaced with totally incompatible one.

894 Only use on your own risk!

895

896 Secondary structure markup for sequence.

897

898 Depends on dsspcmbi program.

899 Sequence should be structure.SequenceMixin, pdb should be loaded.

900 Note that DSSP cannot handle mutiple models!

901 Note that dssp executable name is hardcoded (=dsspcmbi).

902 Note that new version of dssp can not be used, so use DSSPold.

903

904 Notation:

905 * H -- alpha-helix

906 * B -- Isolated beta-bridge residue

907 * E -- Strand

908 * G -- 3-10 helix

909 * I -- pi-helix

910 * T -- Turn

911 * S -- Bend

912 * - -- Other

913 """

914

915 name = 'ss'

916 io_class = 'SequenceSecondaryStructureMarkup'

917

918 def refresh(self):

919 chain = self.sequence.pdb_chain

920 model = chain.get_parent()

921 pdb_file = NamedTemporaryFile(delete=False)

922 self.sequence.save_pdb(pdb_file)

923 pdb_file.close()

924 with Silence(dup="stderr"):

925 dssp=DSSP(model, pdb_file.name, dssp='dsspcmbi')

926 for monomer in self.sequence:

927 try:

928 monomer.ss = dssp[(chain.get_id(), monomer.pdb_residue.get_id())][1]

929 except:

930 monomer.ss = '?' # FIXME

931 os.unlink(pdb_file.name)

932 markups.SequenceSecondaryStructureMarkup = _SequenceSecondaryStructureMarkup

933 markups.update()

934

935 # vim: set ts=4 sts=4 sw=4 et: