allpy

diff allpy/base.py @ 262:e361a7b7d9aa
Moved contents of allpy.sequence into allpy.base
author: Daniil Alexeyevsky <me.dendik@gmail.com>
date: Tue, 14 Dec 2010 21:06:42 +0300
parents: d60628e29b24
children: e3783fca343e
     1.1 --- a/allpy/base.py	Tue Dec 14 21:02:49 2010 +0300
     1.2 +++ b/allpy/base.py	Tue Dec 14 21:06:42 2010 +0300
     1.3 @@ -1,10 +1,14 @@
     1.4  import sys
     1.5  import os
     1.6 +import os.path
     1.7  from tempfile import NamedTemporaryFile
     1.8 +import urllib2
     1.9  
    1.10  import config
    1.11  from graph import Graph
    1.12 -from Bio.PDB import Superimposer
    1.13 +from Bio.PDB import Superimposer, CaPPBuilder, PDBIO
    1.14 +from Bio.PDB.DSSP import make_dssp_dict
    1.15 +from allpy.pdb import std_id, pdb_id_parse, get_structure
    1.16  from fasta import save_fasta
    1.17  import data.codes
    1.18  
    1.19 @@ -125,13 +129,203 @@
    1.20  class Sequence(list):
    1.21      """ Sequence of Monomers
    1.22  
    1.23 -    list of monomer objects 
    1.24 +    list of monomer objects (aminoacids or nucleotides)
    1.25      
    1.26      Mandatory data:
    1.27      *   name -- str with the name of sequence
    1.28      *   description -- str with description of the sequence
    1.29 +    
    1.30 +    Optional (may be empty):
    1.31 +    *   source -- source of sequence
    1.32 +    *   pdb_chain -- Bio.PDB.Chain
    1.33 +    *   pdb_file -- file object
    1.34 +    
    1.35 +    *   pdb_residues -- {Monomer: Bio.PDB.Residue}
    1.36 +    *   pdb_secstr -- {Monomer: 'Secondary structure'}
    1.37 +            Code   Secondary structure
    1.38 +            H      alpha-helix
    1.39 +            B      Isolated beta-bridge residue
    1.40 +            E      Strand
    1.41 +            G      3-10 helix
    1.42 +            I      pi-helix
    1.43 +            T      Turn
    1.44 +            S      Bend
    1.45 +            -      Other
    1.46 +    
    1.47 +    
    1.48 +    ?TODO: global pdb_structures 
    1.49      """
    1.50 -    pass
    1.51 +    def __init__(self, monomers=None, name='', description=""):
    1.52 +        if not monomers:
    1.53 +            monomers = []
    1.54 +        self.name = name
    1.55 +        self.description = description
    1.56 +        self.monomers = monomers 
    1.57 +        self.pdb_chains = []
    1.58 +        self.pdb_files = {}
    1.59 +        self.pdb_residues = {}
    1.60 +        self.pdb_secstr = {}
    1.61 +        
    1.62 +    def __len__(self):
    1.63 +        return len(self.monomers)
    1.64 +    
    1.65 +    def __str__(self):
    1.66 +        """ Returns sequence in one-letter code """
    1.67 +        return ''.join([monomer.type.code1 for monomer in self.monomers])
    1.68 +        
    1.69 +    def __eq__(self, other):
    1.70 +        """ Returns if all corresponding monomers of this sequences are equal
    1.71 +        
    1.72 +        If lengths of sequences are not equal, returns False
    1.73 +        """ 
    1.74 +        return len(self) == len(other) and \
    1.75 +        all([a==b for a, b in zip(self.monomers, other.monomers)])
    1.76 +    
    1.77 +    def __ne__(self, other):
    1.78 +        return not (self == other)
    1.79 +    
    1.80 +    def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain, pdb_model=0):
    1.81 +        """ Reads Pdb chain from file 
    1.82 +        
    1.83 +        and align each Monomer with PDB.Residue (TODO)
    1.84 +        """
    1.85 +        name = std_id(pdb_id, pdb_chain, pdb_model)
    1.86 +        structure = get_structure(pdb_file, name)
    1.87 +        chain = structure[pdb_model][pdb_chain]
    1.88 +        self.pdb_chains.append(chain)
    1.89 +        self.pdb_residues[chain] = {}
    1.90 +        self.pdb_secstr[chain] = {}
    1.91 +        pdb_sequence = Sequence.from_pdb_chain(chain)
    1.92 +        a = alignment.Alignment.from_sequences(self, pdb_sequence)
    1.93 +        a.muscle_align()
    1.94 +        for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self):
    1.95 +            if pdb_sequence.pdb_has(chain, pdb_monomer):
    1.96 +                residue = pdb_sequence.pdb_residues[chain][pdb_monomer]
    1.97 +                self.pdb_residues[chain][monomer] = residue
    1.98 +        self.pdb_files[chain] = pdb_file
    1.99 +    
   1.100 +    def pdb_unload(self):
   1.101 +        """ Delete all pdb-connected links """
   1.102 +        #~ gc.get_referrers(self.pdb_chains[0])
   1.103 +        self.pdb_chains = []
   1.104 +        self.pdb_residues = {}
   1.105 +        self.pdb_secstr = {} # FIXME
   1.106 +        self.pdb_files = {} # FIXME
   1.107 +    
   1.108 +    @staticmethod
   1.109 +    def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType):
   1.110 +        """ Import data from one-letter code
   1.111 +        
   1.112 +        monomer_kind is class, inherited from MonomerType
   1.113 +        """
   1.114 +        monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str]
   1.115 +        return Sequence(monomers, name, description)
   1.116 +
   1.117 +    @staticmethod
   1.118 +    def from_pdb_chain(chain):
   1.119 +        """ Returns Sequence with Monomers with link to Bio.PDB.Residue
   1.120 +        
   1.121 +        chain is Bio.PDB.Chain
   1.122 +        """
   1.123 +        cappbuilder = CaPPBuilder()
   1.124 +        peptides = cappbuilder.build_peptides(chain)
   1.125 +        sequence = Sequence()
   1.126 +        sequence.pdb_chains = [chain]
   1.127 +        sequence.pdb_residues[chain] = {}
   1.128 +        sequence.pdb_secstr[chain] = {}
   1.129 +        for peptide in peptides:
   1.130 +            for ca_atom in peptide.get_ca_list():
   1.131 +                residue = ca_atom.get_parent()
   1.132 +                monomer = AminoAcidType.from_pdb_residue(residue).instance()
   1.133 +                sequence.pdb_residues[chain][monomer] = residue
   1.134 +                sequence.monomers.append(monomer)
   1.135 +        return sequence
   1.136 +    
   1.137 +    def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'):
   1.138 +        """ Adds pdb information to each monomer
   1.139 +        
   1.140 +        Returns if information has been successfully added
   1.141 +        TODO: conformity_file
   1.142 +        
   1.143 +        id-format lava flow
   1.144 +        """
   1.145 +        if not conformity_info:
   1.146 +            path = os.path.join(pdb_directory, self.name)
   1.147 +            if os.path.exists(path) and os.path.getsize(path):
   1.148 +                match = pdb_id_parse(self.name)
   1.149 +                self.pdb_chain_add(open(path), match['code'], 
   1.150 +                match['chain'], match['model'])
   1.151 +            else:
   1.152 +                match = pdb_id_parse(self.name)
   1.153 +                if match:
   1.154 +                    code = match['code']
   1.155 +                    pdb_filename = config.pdb_dir % code
   1.156 +                    if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename):
   1.157 +                        url = config.pdb_url % code
   1.158 +                        print "Download %s" % url
   1.159 +                        pdb_file = open(pdb_filename, 'w')
   1.160 +                        data = urllib2.urlopen(url).read()
   1.161 +                        pdb_file.write(data)
   1.162 +                        pdb_file.close()
   1.163 +                        print "Save %s" % pdb_filename
   1.164 +                    pdb_file = open(pdb_filename)
   1.165 +                    self.pdb_chain_add(pdb_file, code, match['chain'], match['model'])
   1.166 +        
   1.167 +    def pdb_save(self, out_filename, pdb_chain):
   1.168 +        """ Saves pdb_chain to out_file """
   1.169 +        class GlySelect(Select):
   1.170 +            def accept_chain(self, chain):
   1.171 +                if chain == pdb_chain:
   1.172 +                    return 1
   1.173 +                else:
   1.174 +                    return 0
   1.175 +        io = PDBIO()
   1.176 +        structure = chain.get_parent()
   1.177 +        io.set_structure(structure)
   1.178 +        io.save(out_filename, GlySelect())
   1.179 +        
   1.180 +    
   1.181 +    def pdb_add_sec_str(self, pdb_chain):
   1.182 +        """ Add secondary structure data """
   1.183 +        tmp_file = NamedTemporaryFile(delete=False)
   1.184 +        tmp_file.close()
   1.185 +        pdb_file = self.pdb_files[pdb_chain].name
   1.186 +        os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name})
   1.187 +        dssp, keys = make_dssp_dict(tmp_file.name)
   1.188 +        for monomer in self.monomers:
   1.189 +            if self.pdb_has(pdb_chain, monomer):
   1.190 +                residue = self.pdb_residues[pdb_chain][monomer]
   1.191 +                try:
   1.192 +                    d = dssp[(pdb_chain.get_id(), residue.get_id())]
   1.193 +                    self.pdb_secstr[pdb_chain][monomer] = d[1]
   1.194 +                except:
   1.195 +                    print "No dssp information about %s at %s" % (monomer, pdb_chain)
   1.196 +        os.unlink(tmp_file.name)
   1.197 +    
   1.198 +    def pdb_has(self, chain, monomer):
   1.199 +        return chain in self.pdb_residues and monomer in self.pdb_residues[chain]
   1.200 +    
   1.201 +    def secstr_has(self, chain, monomer):
   1.202 +        return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain]
   1.203 +
   1.204 +    @staticmethod
   1.205 +    def file_slice(file, n_from, n_to, fasta_name='', name='', description='', monomer_kind=AminoAcidType):
   1.206 +        """ Build and return sequence, consisting of part of sequence from file 
   1.207 +
   1.208 +        Does not control gaps
   1.209 +        """
   1.210 +        inside = False
   1.211 +        number_used = 0
   1.212 +        s = ''
   1.213 +        for line in file:
   1.214 +            line = line.split()
   1.215 +            if not inside:
   1.216 +                if line.startswith('>%s' % fasta_name):
   1.217 +                    inside = True
   1.218 +            else:
   1.219 +                n = len(line)
   1.220 +                s += line[(n_from - number_user):(n_to - number_user)]
   1.221 +        return Sequence.from_str(s, name, description, monomer_kind)
   1.222  
   1.223  class Alignment(dict):
   1.224      """ Alignment
author	Daniil Alexeyevsky <me.dendik@gmail.com>
date	Tue, 14 Dec 2010 21:06:42 +0300
parents	d60628e29b24
children	e3783fca343e