allpy: 06659228cd6f allpy/pdb.py

allpy

view allpy/pdb.py @ 386:06659228cd6f

fix bug in allpy/pdb.py

author	boris <bnagaev@gmail.com>
date	Wed, 02 Feb 2011 16:21:04 +0300
parents	d7fc6865ce58
children

line source

1 """ Functions to get pdb information from fasta id

2 and to generate fasta id from pdb information

4 pdb information: code, chain, model

6 TODO: same for local pdb files

7 """

9 import re

10 import os

11 import os.path

12 from tempfile import NamedTemporaryFile

13 import urllib2

15 from Bio.PDB import PDBParser

16 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO

17 from Bio.PDB.DSSP import make_dssp_dict

19 import base

20 import processors

21 import config

22 from graph import Graph

25 # for pdb-codes

26 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)(?P<model>[^a-z0-9]([0-9]{1,3}))?)?", re.I)

28 def pdb_id_parse(ID):

29 match = re1.search(ID)

30 if not match:

31 return None

32 d = match.groupdict()

33 if 'chain' not in d or not d['chain']:

34 d['chain'] = ' '

35 if 'model' not in d or not d['model']:

36 d['model'] = 0

37 return d

40 def get_structure(file, name):

41 return PDBParser().get_structure(name, file)

43 class SequenceMixin(base.Sequence):

44 """Mixin for adding PDB data to a Sequence.

46 Please note: since this is a mixin, no objects of this class should be

47 created. This class is intended to be subclassed together with one of

48 Sequence classes.

50 Attributes:

52 * pdb_chain -- Bio.PDB.Chain

53 * pdb_residues -- {Monomer: Bio.PDB.Residue}

55 ?TODO: global pdb_structures

56 """

58 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0):

59 """ Reads Pdb chain from file

61 and align each Monomer with PDB.Residue (TODO)

62 """

63 structure = get_structure(pdb_file, self.name)

64 chain = structure[pdb_model][pdb_chain]

65 self.pdb_chain = chain

66 self.pdb_residues = {}

67 pdb_sequence = self.__class__.from_pdb_chain(chain)

68 Alignment = self.types.Alignment

69 a = Alignment()

70 a.append_sequence(self)

71 a.append_sequence(pdb_sequence)

72 a.process(processors.Muscle())

73 for monomer, pdb_monomer in a.columns_as_lists():

74 residue = pdb_sequence.pdb_residues[pdb_monomer]

75 self.pdb_residues[monomer] = residue

77 def pdb_unload(self):

78 """ Delete all pdb-connected links """

79 del self.pdb_chain

80 del self.pdb_residues

82 @classmethod

83 def from_pdb_chain(cls, chain):

84 """ Returns Sequence with Monomers with link to Bio.PDB.Residue

86 chain is Bio.PDB.Chain

87 """

88 cappbuilder = CaPPBuilder()

89 peptides = cappbuilder.build_peptides(chain)

90 Sequence = cls

91 Monomer = Sequence.types.Monomer

92 sequence = Sequence()

93 sequence.pdb_chain = chain

94 sequence.pdb_residues = {}

95 for peptide in peptides:

96 for ca_atom in peptide.get_ca_list():

97 residue = ca_atom.get_parent()

98 monomer = Monomer.from_code3(residue.get_resname())

99 sequence.pdb_residues[monomer] = residue

100 sequence.append(monomer)

101 return sequence

102

103 def auto_pdb(self, conformity=None, dir='/tmp', mask='%s.pdb'):

104 """ Adds pdb information to each monomer

105

106 Returns if information has been successfully added

107 TODO: conformity_file

108

109 id-format lava flow

110 """

111 match = pdb_id_parse(self.name)

112 if not match:

113 return False

114 code = match['code']

115 chain = match['chain']

116 model = match['model']

117 user_path = os.path.join(dir, mask % self.name)

118 path = user_path

119 if not os.path.exists(path) or not os.path.getsize(path):

120 path = config.pdb_path % code

121 if not os.path.exists(path) or not os.path.getsize(path):

122 url = config.pdb_url % code

123 print "Download %s" % url

124 path = user_path

125 data = urllib2.urlopen(url).read()

126 pdb_file = open(path, 'w')

127 pdb_file.write(data)

128 pdb_file.close()

129 print "Save %s" % path

130 self.set_pdb_chain(open(path), code, chain, model)

131 return True

132

133 def pdb_save(self, out_filename, pdb_chain):

134 """ Saves pdb_chain to out_file """

135 class MySelect(Select):

136 def accept_chain(self, chain):

137 if chain == pdb_chain:

138 return 1

139 else:

140 return 0

141 io = PDBIO()

142 structure = chain.get_parent()

143 io.set_structure(structure)

144 io.save(out_filename, MySelect())

145

146 def ca_atoms(self, sequence):

147 """ Iterates Ca-atom of monomers of this sequence """

148 return (pdb_residues[monomer]['CA'] for monomer in self)

149

150 class AlignmentMixin(base.Alignment):

151 """Mixin to add 3D properties to alignments.

152

153 Please note: since this is a mixin, no objects of this class should be

154 created. This class is intended to be subclassed together with one of

155 Alignment classes.

156 """

157 pass

158

159 class BlockMixin(base.Block):

160 """Mixin to add 3D properties to blocks.

161

162 Please note: since this is a mixin, no objects of this class should be

163 created. This class is intended to be subclassed together with one of

164 Block classes.

165 """

166

167 def geometrical_cores(self, max_delta=config.delta,

168 timeout=config.timeout, minsize=config.minsize,

169 ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count):

170 """ Returns length-sorted list of GCs

171 GC is set of columns

172

173 * max_delta -- threshold of distance spreading

174 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)

175 * minsize -- min size of each core

176 * ac_new_atoms -- min part or new atoms in new alternative core

177 current GC is compared with each of already selected GCs if

178 difference is less then ac_new_atoms, current GC is skipped

179 difference = part of new atoms in current core

180 * ac_count -- max number of cores (including main core)

181 -1 means infinity

182

183 cost is calculated as 1 / (delta + 1)

184 delta in [0, +inf) => cost in (0, 1]

185 """

186 nodes = self.columns

187 lines = {}

188 for i, column1 in enumerate(self.columns):

189 for j, column2 in enumerate(self.columns):

190 if i < j:

191 distances = []

192 for sequence in self.sequences:

193 if sequence in column1 and sequence in column2:

194 m1 = column1[sequence]

195 m2 = column2[sequence]

196 r1 = sequence.pdb_residues[m1]

197 r2 = sequence.pdb_residues[m2]

198 ca1 = r1['CA']

199 ca2 = r2['CA']

200 d = ca1 - ca2 # Bio.PDB feature

201 distances.append(d)

202 if len(distances) >= 2:

203 delta = max(distances) - min(distances)

204 if delta <= max_delta:

205 line = Graph.line(column1, column2)

206 lines[line] = 1.0 / (1.0 + max_delta)

207 graph = Graph(nodes, lines)

208 cliques = graph.cliques(timeout=timeout, minsize=minsize)

209 GCs = []

210 for clique in cliques:

211 for GC in GCs:

212 new_nodes = clique - GC

213 if len(new_nodes) < ac_new_atoms * len(clique):

214 break

215 else:

216 GCs.append(clique)

217 return GCs

218

219

220

221 def superimpose(self):

222 """ Superimpose all pdb_chains in this block """

223 sequences_chains = list(self.sequences_chains())

224 if len(sequences_chains) >= 1:

225 sup = Superimposer()

226 fixed_sequence, fixed_chain = sequences_chains.pop()

227 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)

228 for sequence, chain in sequences_chains:

229 moving_atoms = self.ca_atoms(sequence, chain)

230 sup.set_atoms(fixed_atoms, moving_atoms)

231 # Apply rotation/translation to the moving atoms

232 sup.apply(moving_atoms)

233

234 def pdb_save(self, out_file):

235 """ Save all sequences

236

237 Returns {(sequence, chain): CHAIN}

238 CHAIN is chain letter in new file

239 """

240 tmp_file = NamedTemporaryFile(delete=False)

241 tmp_file.close()

242

243 for sequence, chain in self.sequences_chains():

244 sequence.pdb_save(tmp_file.name, chain)

245 # TODO: read from tmp_file.name

246 # change CHAIN

247 # add to out_file

248

249 os.unlink(NamedTemporaryFile)

250

251 # vim: set ts=4 sts=4 sw=4 et: