allpy
diff allpy/base.py @ 262:e361a7b7d9aa
Moved contents of allpy.sequence into allpy.base
author | Daniil Alexeyevsky <me.dendik@gmail.com> |
---|---|
date | Tue, 14 Dec 2010 21:06:42 +0300 |
parents | d60628e29b24 |
children | e3783fca343e |
line diff
1.1 --- a/allpy/base.py Tue Dec 14 21:02:49 2010 +0300 1.2 +++ b/allpy/base.py Tue Dec 14 21:06:42 2010 +0300 1.3 @@ -1,10 +1,14 @@ 1.4 import sys 1.5 import os 1.6 +import os.path 1.7 from tempfile import NamedTemporaryFile 1.8 +import urllib2 1.9 1.10 import config 1.11 from graph import Graph 1.12 -from Bio.PDB import Superimposer 1.13 +from Bio.PDB import Superimposer, CaPPBuilder, PDBIO 1.14 +from Bio.PDB.DSSP import make_dssp_dict 1.15 +from allpy.pdb import std_id, pdb_id_parse, get_structure 1.16 from fasta import save_fasta 1.17 import data.codes 1.18 1.19 @@ -125,13 +129,203 @@ 1.20 class Sequence(list): 1.21 """ Sequence of Monomers 1.22 1.23 - list of monomer objects 1.24 + list of monomer objects (aminoacids or nucleotides) 1.25 1.26 Mandatory data: 1.27 * name -- str with the name of sequence 1.28 * description -- str with description of the sequence 1.29 + 1.30 + Optional (may be empty): 1.31 + * source -- source of sequence 1.32 + * pdb_chain -- Bio.PDB.Chain 1.33 + * pdb_file -- file object 1.34 + 1.35 + * pdb_residues -- {Monomer: Bio.PDB.Residue} 1.36 + * pdb_secstr -- {Monomer: 'Secondary structure'} 1.37 + Code Secondary structure 1.38 + H alpha-helix 1.39 + B Isolated beta-bridge residue 1.40 + E Strand 1.41 + G 3-10 helix 1.42 + I pi-helix 1.43 + T Turn 1.44 + S Bend 1.45 + - Other 1.46 + 1.47 + 1.48 + ?TODO: global pdb_structures 1.49 """ 1.50 - pass 1.51 + def __init__(self, monomers=None, name='', description=""): 1.52 + if not monomers: 1.53 + monomers = [] 1.54 + self.name = name 1.55 + self.description = description 1.56 + self.monomers = monomers 1.57 + self.pdb_chains = [] 1.58 + self.pdb_files = {} 1.59 + self.pdb_residues = {} 1.60 + self.pdb_secstr = {} 1.61 + 1.62 + def __len__(self): 1.63 + return len(self.monomers) 1.64 + 1.65 + def __str__(self): 1.66 + """ Returns sequence in one-letter code """ 1.67 + return ''.join([monomer.type.code1 for monomer in self.monomers]) 1.68 + 1.69 + def __eq__(self, other): 1.70 + """ Returns if all corresponding monomers of this sequences are equal 1.71 + 1.72 + If lengths of sequences are not equal, returns False 1.73 + """ 1.74 + return len(self) == len(other) and \ 1.75 + all([a==b for a, b in zip(self.monomers, other.monomers)]) 1.76 + 1.77 + def __ne__(self, other): 1.78 + return not (self == other) 1.79 + 1.80 + def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain, pdb_model=0): 1.81 + """ Reads Pdb chain from file 1.82 + 1.83 + and align each Monomer with PDB.Residue (TODO) 1.84 + """ 1.85 + name = std_id(pdb_id, pdb_chain, pdb_model) 1.86 + structure = get_structure(pdb_file, name) 1.87 + chain = structure[pdb_model][pdb_chain] 1.88 + self.pdb_chains.append(chain) 1.89 + self.pdb_residues[chain] = {} 1.90 + self.pdb_secstr[chain] = {} 1.91 + pdb_sequence = Sequence.from_pdb_chain(chain) 1.92 + a = alignment.Alignment.from_sequences(self, pdb_sequence) 1.93 + a.muscle_align() 1.94 + for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self): 1.95 + if pdb_sequence.pdb_has(chain, pdb_monomer): 1.96 + residue = pdb_sequence.pdb_residues[chain][pdb_monomer] 1.97 + self.pdb_residues[chain][monomer] = residue 1.98 + self.pdb_files[chain] = pdb_file 1.99 + 1.100 + def pdb_unload(self): 1.101 + """ Delete all pdb-connected links """ 1.102 + #~ gc.get_referrers(self.pdb_chains[0]) 1.103 + self.pdb_chains = [] 1.104 + self.pdb_residues = {} 1.105 + self.pdb_secstr = {} # FIXME 1.106 + self.pdb_files = {} # FIXME 1.107 + 1.108 + @staticmethod 1.109 + def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType): 1.110 + """ Import data from one-letter code 1.111 + 1.112 + monomer_kind is class, inherited from MonomerType 1.113 + """ 1.114 + monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str] 1.115 + return Sequence(monomers, name, description) 1.116 + 1.117 + @staticmethod 1.118 + def from_pdb_chain(chain): 1.119 + """ Returns Sequence with Monomers with link to Bio.PDB.Residue 1.120 + 1.121 + chain is Bio.PDB.Chain 1.122 + """ 1.123 + cappbuilder = CaPPBuilder() 1.124 + peptides = cappbuilder.build_peptides(chain) 1.125 + sequence = Sequence() 1.126 + sequence.pdb_chains = [chain] 1.127 + sequence.pdb_residues[chain] = {} 1.128 + sequence.pdb_secstr[chain] = {} 1.129 + for peptide in peptides: 1.130 + for ca_atom in peptide.get_ca_list(): 1.131 + residue = ca_atom.get_parent() 1.132 + monomer = AminoAcidType.from_pdb_residue(residue).instance() 1.133 + sequence.pdb_residues[chain][monomer] = residue 1.134 + sequence.monomers.append(monomer) 1.135 + return sequence 1.136 + 1.137 + def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'): 1.138 + """ Adds pdb information to each monomer 1.139 + 1.140 + Returns if information has been successfully added 1.141 + TODO: conformity_file 1.142 + 1.143 + id-format lava flow 1.144 + """ 1.145 + if not conformity_info: 1.146 + path = os.path.join(pdb_directory, self.name) 1.147 + if os.path.exists(path) and os.path.getsize(path): 1.148 + match = pdb_id_parse(self.name) 1.149 + self.pdb_chain_add(open(path), match['code'], 1.150 + match['chain'], match['model']) 1.151 + else: 1.152 + match = pdb_id_parse(self.name) 1.153 + if match: 1.154 + code = match['code'] 1.155 + pdb_filename = config.pdb_dir % code 1.156 + if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename): 1.157 + url = config.pdb_url % code 1.158 + print "Download %s" % url 1.159 + pdb_file = open(pdb_filename, 'w') 1.160 + data = urllib2.urlopen(url).read() 1.161 + pdb_file.write(data) 1.162 + pdb_file.close() 1.163 + print "Save %s" % pdb_filename 1.164 + pdb_file = open(pdb_filename) 1.165 + self.pdb_chain_add(pdb_file, code, match['chain'], match['model']) 1.166 + 1.167 + def pdb_save(self, out_filename, pdb_chain): 1.168 + """ Saves pdb_chain to out_file """ 1.169 + class GlySelect(Select): 1.170 + def accept_chain(self, chain): 1.171 + if chain == pdb_chain: 1.172 + return 1 1.173 + else: 1.174 + return 0 1.175 + io = PDBIO() 1.176 + structure = chain.get_parent() 1.177 + io.set_structure(structure) 1.178 + io.save(out_filename, GlySelect()) 1.179 + 1.180 + 1.181 + def pdb_add_sec_str(self, pdb_chain): 1.182 + """ Add secondary structure data """ 1.183 + tmp_file = NamedTemporaryFile(delete=False) 1.184 + tmp_file.close() 1.185 + pdb_file = self.pdb_files[pdb_chain].name 1.186 + os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name}) 1.187 + dssp, keys = make_dssp_dict(tmp_file.name) 1.188 + for monomer in self.monomers: 1.189 + if self.pdb_has(pdb_chain, monomer): 1.190 + residue = self.pdb_residues[pdb_chain][monomer] 1.191 + try: 1.192 + d = dssp[(pdb_chain.get_id(), residue.get_id())] 1.193 + self.pdb_secstr[pdb_chain][monomer] = d[1] 1.194 + except: 1.195 + print "No dssp information about %s at %s" % (monomer, pdb_chain) 1.196 + os.unlink(tmp_file.name) 1.197 + 1.198 + def pdb_has(self, chain, monomer): 1.199 + return chain in self.pdb_residues and monomer in self.pdb_residues[chain] 1.200 + 1.201 + def secstr_has(self, chain, monomer): 1.202 + return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain] 1.203 + 1.204 + @staticmethod 1.205 + def file_slice(file, n_from, n_to, fasta_name='', name='', description='', monomer_kind=AminoAcidType): 1.206 + """ Build and return sequence, consisting of part of sequence from file 1.207 + 1.208 + Does not control gaps 1.209 + """ 1.210 + inside = False 1.211 + number_used = 0 1.212 + s = '' 1.213 + for line in file: 1.214 + line = line.split() 1.215 + if not inside: 1.216 + if line.startswith('>%s' % fasta_name): 1.217 + inside = True 1.218 + else: 1.219 + n = len(line) 1.220 + s += line[(n_from - number_user):(n_to - number_user)] 1.221 + return Sequence.from_str(s, name, description, monomer_kind) 1.222 1.223 class Alignment(dict): 1.224 """ Alignment