allpy
diff allpy/pdb.py @ 385:d7fc6865ce58
refactoring of pdb-related code
* remove (transiently) secstr from pdb.py
* refactoring of pdb loading
* create module protein_pdb with 4 classes with both properties
* make geometrical-core work (with errors and without saving to fasta)
author | boris <bnagaev@gmail.com> |
---|---|
date | Wed, 02 Feb 2011 15:50:20 +0300 |
parents | 91b015afd92b |
children | 06659228cd6f |
line diff
1.1 --- a/allpy/pdb.py Wed Feb 02 15:37:15 2011 +0300 1.2 +++ b/allpy/pdb.py Wed Feb 02 15:50:20 2011 +0300 1.3 @@ -17,23 +17,14 @@ 1.4 from Bio.PDB.DSSP import make_dssp_dict 1.5 1.6 import base 1.7 +import processors 1.8 +import config 1.9 from graph import Graph 1.10 1.11 1.12 # for pdb-codes 1.13 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)(?P<model>[^a-z0-9]([0-9]{1,3}))?)?", re.I) 1.14 1.15 -#~ # for files 1.16 -#~ re2 = re.compile(r"(^)([^^]+\.(ent|pdb))([^a-zA-Z0-9]([0-9A-Za-z ]?)([^a-zA-Z0-9]([0-9]{1,3}))?)?$") 1.17 - 1.18 -def std_id(pdb_id, pdb_chain, pdb_model=None): 1.19 - if pdb_model: 1.20 - return "%s_%s_%s" % \ 1.21 - (pdb_id.lower().strip(), pdb_chain.upper().strip(), pdb_model) 1.22 - else: 1.23 - return "%s_%s" % \ 1.24 - (pdb_id.lower().strip(), pdb_chain.upper().strip()) 1.25 - 1.26 def pdb_id_parse(ID): 1.27 match = re1.search(ID) 1.28 if not match: 1.29 @@ -49,32 +40,6 @@ 1.30 def get_structure(file, name): 1.31 return PDBParser().get_structure(name, file) 1.32 1.33 -#~ def std_id_parse(ID): 1.34 - #~ """ 1.35 - #~ Parse standart ID to pdb_code, chain and model 1.36 - #~ """ 1.37 - #~ if '.ent' in ID.lower() or '.pdb' in ID.lower(): 1.38 - #~ # it is file 1.39 - #~ parseO = self.re2.search(ID) # files 1.40 - #~ else: 1.41 - #~ parseO = self.re1.search(ID.lower()) # pdb codes 1.42 - #~ if not parseO: 1.43 - #~ return None 1.44 - #~ parse = parseO.groups() 1.45 - #~ if len(parse) < 2: 1.46 - #~ return None 1.47 - #~ code = parse[1] 1.48 - #~ chain = '' 1.49 - #~ model = None 1.50 - #~ if len(parse) >= 4: 1.51 - #~ chain = parse[3] 1.52 - #~ if chain: 1.53 - #~ chain = chain.upper() 1.54 - #~ if len(parse) >= 6: 1.55 - #~ if parse[5]: 1.56 - #~ model = parse[5] 1.57 - #~ return code, chain, model 1.58 - 1.59 class SequenceMixin(base.Sequence): 1.60 """Mixin for adding PDB data to a Sequence. 1.61 1.62 @@ -85,79 +50,57 @@ 1.63 Attributes: 1.64 1.65 * pdb_chain -- Bio.PDB.Chain 1.66 - * pdb_file -- file object 1.67 - 1.68 * pdb_residues -- {Monomer: Bio.PDB.Residue} 1.69 - * pdb_secstr -- {Monomer: 'Secondary structure'} 1.70 - Code Secondary structure 1.71 - H alpha-helix 1.72 - B Isolated beta-bridge residue 1.73 - E Strand 1.74 - G 3-10 helix 1.75 - I pi-helix 1.76 - T Turn 1.77 - S Bend 1.78 - - Other 1.79 - 1.80 1.81 ?TODO: global pdb_structures 1.82 """ 1.83 1.84 - def __init__(self, *args, **kw): 1.85 - self.pdb_chains = [] 1.86 - self.pdb_files = {} 1.87 - self.pdb_residues = {} 1.88 - self.pdb_secstr = {} 1.89 - 1.90 - def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain, pdb_model=0): 1.91 + def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0): 1.92 """ Reads Pdb chain from file 1.93 1.94 and align each Monomer with PDB.Residue (TODO) 1.95 """ 1.96 - name = std_id(pdb_id, pdb_chain, pdb_model) 1.97 - structure = get_structure(pdb_file, name) 1.98 + structure = get_structure(pdb_file, self.name) 1.99 chain = structure[pdb_model][pdb_chain] 1.100 - self.pdb_chains.append(chain) 1.101 - self.pdb_residues[chain] = {} 1.102 - self.pdb_secstr[chain] = {} 1.103 - pdb_sequence = Sequence.from_pdb_chain(chain) 1.104 - a = alignment.Alignment.from_sequences(self, pdb_sequence) 1.105 - a.muscle_align() 1.106 - for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self): 1.107 - if pdb_sequence.pdb_has(chain, pdb_monomer): 1.108 - residue = pdb_sequence.pdb_residues[chain][pdb_monomer] 1.109 - self.pdb_residues[chain][monomer] = residue 1.110 - self.pdb_files[chain] = pdb_file 1.111 + self.pdb_chain = chain 1.112 + self.pdb_residues = {} 1.113 + pdb_sequence = self.__class__.from_pdb_chain(chain) 1.114 + Alignment = self.types.Alignment 1.115 + a = Alignment() 1.116 + a.append_sequence(self) 1.117 + a.append_sequence(pdb_sequence) 1.118 + a.process(processors.Muscle()) 1.119 + for monomer, pdb_monomer in a.columns_as_lists(): 1.120 + residue = pdb_sequence.pdb_residues[pdb_monomer] 1.121 + self.pdb_residues[monomer] = residue 1.122 1.123 def pdb_unload(self): 1.124 """ Delete all pdb-connected links """ 1.125 - #~ gc.get_referrers(self.pdb_chains[0]) 1.126 - self.pdb_chains = [] 1.127 - self.pdb_residues = {} 1.128 - self.pdb_secstr = {} # FIXME 1.129 - self.pdb_files = {} # FIXME 1.130 + del self.pdb_chain 1.131 + del self.pdb_residues 1.132 1.133 - @staticmethod 1.134 - def from_pdb_chain(chain): 1.135 + @classmethod 1.136 + def from_pdb_chain(cls, chain): 1.137 """ Returns Sequence with Monomers with link to Bio.PDB.Residue 1.138 1.139 chain is Bio.PDB.Chain 1.140 """ 1.141 cappbuilder = CaPPBuilder() 1.142 peptides = cappbuilder.build_peptides(chain) 1.143 + Sequence = cls 1.144 + Monomer = Sequence.types.Monomer 1.145 sequence = Sequence() 1.146 - sequence.pdb_chains = [chain] 1.147 - sequence.pdb_residues[chain] = {} 1.148 - sequence.pdb_secstr[chain] = {} 1.149 + sequence.pdb_chain = chain 1.150 + sequence.pdb_residues = {} 1.151 for peptide in peptides: 1.152 for ca_atom in peptide.get_ca_list(): 1.153 residue = ca_atom.get_parent() 1.154 - monomer = AminoAcidType.from_pdb_residue(residue).instance() 1.155 - sequence.pdb_residues[chain][monomer] = residue 1.156 - sequence.monomers.append(monomer) 1.157 + monomer = Monomer.from_code3(residue.get_resname()) 1.158 + sequence.pdb_residues[monomer] = residue 1.159 + sequence.append(monomer) 1.160 return sequence 1.161 1.162 - def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'): 1.163 + def auto_pdb(self, conformity=None, dir='/tmp', mask='%s.pdb'): 1.164 """ Adds pdb information to each monomer 1.165 1.166 Returns if information has been successfully added 1.167 @@ -165,31 +108,31 @@ 1.168 1.169 id-format lava flow 1.170 """ 1.171 - if not conformity_info: 1.172 - path = os.path.join(pdb_directory, self.name) 1.173 - if os.path.exists(path) and os.path.getsize(path): 1.174 - match = pdb_id_parse(self.name) 1.175 - self.pdb_chain_add(open(path), match['code'], 1.176 - match['chain'], match['model']) 1.177 - else: 1.178 - match = pdb_id_parse(self.name) 1.179 - if match: 1.180 - code = match['code'] 1.181 - pdb_filename = config.pdb_dir % code 1.182 - if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename): 1.183 - url = config.pdb_url % code 1.184 - print "Download %s" % url 1.185 - pdb_file = open(pdb_filename, 'w') 1.186 - data = urllib2.urlopen(url).read() 1.187 - pdb_file.write(data) 1.188 - pdb_file.close() 1.189 - print "Save %s" % pdb_filename 1.190 - pdb_file = open(pdb_filename) 1.191 - self.pdb_chain_add(pdb_file, code, match['chain'], match['model']) 1.192 + match = pdb_id_parse(self.name) 1.193 + if not match: 1.194 + return False 1.195 + code = match['code'] 1.196 + chain = match['chain'] 1.197 + model = match['model'] 1.198 + user_path = os.path.join(dir, mask % self.name) 1.199 + path = user_path 1.200 + if not os.path.exists(path) or not os.path.getsize(path): 1.201 + path = config.pdb_path % code 1.202 + if not os.path.exists(path) or not os.path.getsize(path): 1.203 + url = config.pdb_url % code 1.204 + print "Download %s" % url 1.205 + path = user_path 1.206 + data = urllib2.urlopen(url).read() 1.207 + pdb_file = open(path, 'w') 1.208 + pdb_file.write(data) 1.209 + pdb_file.close() 1.210 + print "Save %s" % path 1.211 + self.set_pdb_chain(open(path), code, chain, model) 1.212 + return True 1.213 1.214 def pdb_save(self, out_filename, pdb_chain): 1.215 """ Saves pdb_chain to out_file """ 1.216 - class GlySelect(Select): 1.217 + class MySelect(Select): 1.218 def accept_chain(self, chain): 1.219 if chain == pdb_chain: 1.220 return 1 1.221 @@ -198,32 +141,11 @@ 1.222 io = PDBIO() 1.223 structure = chain.get_parent() 1.224 io.set_structure(structure) 1.225 - io.save(out_filename, GlySelect()) 1.226 + io.save(out_filename, MySelect()) 1.227 1.228 - 1.229 - def pdb_add_sec_str(self, pdb_chain): 1.230 - """ Add secondary structure data """ 1.231 - tmp_file = NamedTemporaryFile(delete=False) 1.232 - tmp_file.close() 1.233 - pdb_file = self.pdb_files[pdb_chain].name 1.234 - os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name}) 1.235 - dssp, keys = make_dssp_dict(tmp_file.name) 1.236 - for monomer in self.monomers: 1.237 - if self.pdb_has(pdb_chain, monomer): 1.238 - residue = self.pdb_residues[pdb_chain][monomer] 1.239 - try: 1.240 - d = dssp[(pdb_chain.get_id(), residue.get_id())] 1.241 - self.pdb_secstr[pdb_chain][monomer] = d[1] 1.242 - except: 1.243 - print "No dssp information about %s at %s" % (monomer, pdb_chain) 1.244 - os.unlink(tmp_file.name) 1.245 - 1.246 - def pdb_has(self, chain, monomer): 1.247 - return chain in self.pdb_residues and monomer in self.pdb_residues[chain] 1.248 - 1.249 - def secstr_has(self, chain, monomer): 1.250 - return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain] 1.251 - 1.252 + def ca_atoms(self, sequence): 1.253 + """ Iterates Ca-atom of monomers of this sequence """ 1.254 + return (pdb_residues[monomer]['CA'] for monomer in self) 1.255 1.256 class AlignmentMixin(base.Alignment): 1.257 """Mixin to add 3D properties to alignments. 1.258 @@ -232,12 +154,7 @@ 1.259 created. This class is intended to be subclassed together with one of 1.260 Alignment classes. 1.261 """ 1.262 - 1.263 - def secstr(self, sequence, pdb_chain, gap='-'): 1.264 - """ Returns string representing secondary structure """ 1.265 - return ''.join([ 1.266 - (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap) 1.267 - for m in self.body[sequence]]) 1.268 + pass 1.269 1.270 class BlockMixin(base.Block): 1.271 """Mixin to add 3D properties to blocks. 1.272 @@ -249,9 +166,9 @@ 1.273 1.274 def geometrical_cores(self, max_delta=config.delta, 1.275 timeout=config.timeout, minsize=config.minsize, 1.276 - ac_new_atoms=config.ac_new_atoms, 1.277 - ac_count=config.ac_count): 1.278 - """ Returns length-sorted list of blocks, representing GCs 1.279 + ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count): 1.280 + """ Returns length-sorted list of GCs 1.281 + GC is set of columns 1.282 1.283 * max_delta -- threshold of distance spreading 1.284 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) 1.285 @@ -263,50 +180,43 @@ 1.286 * ac_count -- max number of cores (including main core) 1.287 -1 means infinity 1.288 1.289 - If more than one pdb chain for some sequence provided, consider all of them 1.290 cost is calculated as 1 / (delta + 1) 1.291 - 1.292 delta in [0, +inf) => cost in (0, 1] 1.293 """ 1.294 - nodes = self.positions 1.295 + nodes = self.columns 1.296 lines = {} 1.297 - for i in self.positions: 1.298 - for j in self.positions: 1.299 + for i, column1 in enumerate(self.columns): 1.300 + for j, column2 in enumerate(self.columns): 1.301 if i < j: 1.302 distances = [] 1.303 for sequence in self.sequences: 1.304 - for chain in sequence.pdb_chains: 1.305 - m1 = self.alignment.body[sequence][i] 1.306 - m2 = self.alignment.body[sequence][j] 1.307 - if m1 and m2: 1.308 - r1 = sequence.pdb_residues[chain][m1] 1.309 - r2 = sequence.pdb_residues[chain][m2] 1.310 - ca1 = r1['CA'] 1.311 - ca2 = r2['CA'] 1.312 - d = ca1 - ca2 # Bio.PDB feature 1.313 - distances.append(d) 1.314 + if sequence in column1 and sequence in column2: 1.315 + m1 = column1[sequence] 1.316 + m2 = column2[sequence] 1.317 + r1 = sequence.pdb_residues[m1] 1.318 + r2 = sequence.pdb_residues[m2] 1.319 + ca1 = r1['CA'] 1.320 + ca2 = r2['CA'] 1.321 + d = ca1 - ca2 # Bio.PDB feature 1.322 + distances.append(d) 1.323 if len(distances) >= 2: 1.324 delta = max(distances) - min(distances) 1.325 if delta <= max_delta: 1.326 - lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta) 1.327 + line = Graph.line(column1, column2) 1.328 + lines[line] = 1.0 / (1.0 + max_delta) 1.329 graph = Graph(nodes, lines) 1.330 cliques = graph.cliques(timeout=timeout, minsize=minsize) 1.331 GCs = [] 1.332 for clique in cliques: 1.333 for GC in GCs: 1.334 - if len(clique - set(GC.positions)) < ac_new_atoms * len(clique): 1.335 + new_nodes = clique - GC 1.336 + if len(new) < ac_new_atoms * len(clique): 1.337 break 1.338 + else: 1.339 + GCs.append(clique) 1.340 + return GCs 1.341 1.342 - def ca_atoms(self, sequence, pdb_chain): 1.343 - """ Iterates Ca-atom of monomers of this sequence from this block """ 1.344 - return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers()) 1.345 1.346 - def sequences_chains(self): 1.347 - """ Iterates pairs (sequence, chain) """ 1.348 - for sequence in self.alignment.sequences: 1.349 - if sequence in self.sequences: 1.350 - for chain in sequence.pdb_chains: 1.351 - yield (sequence, chain) 1.352 1.353 def superimpose(self): 1.354 """ Superimpose all pdb_chains in this block """