allpy
changeset 784:78597d31d423
refactoring of pair_cores.py
* add pdb-markup reading (user can provide pdb markup or alignment)
* add CachedDownloadPdb as pdb_getter and allow change chache_dir
* drop useless removing of monomers without structure
(it should be already done by previous steps)
* add messages to exceptions and documentation
* rename run() to homology_from_3d()
* hardening: raise if a structure at least one of sequences can't be loaded
see #86
It seems not to work with markups (returns no blocks)
author | boris (kodomo) <bnagaev@gmail.com> |
---|---|
date | Tue, 12 Jul 2011 23:57:18 +0400 |
parents | 84ac7e748e26 |
children | 6c0d6d3ceafe b6c5178a4337 |
files | pair_cores/pair_cores.py |
diffstat | 1 files changed, 59 insertions(+), 61 deletions(-) [+] |
line diff
1.1 --- a/pair_cores/pair_cores.py Wed Jul 13 00:41:11 2011 +0400 1.2 +++ b/pair_cores/pair_cores.py Tue Jul 12 23:57:18 2011 +0400 1.3 @@ -8,88 +8,86 @@ 1.4 from copy import copy 1.5 1.6 from protein_pdb import Alignment, Block, Monomer, Sequence 1.7 -from allpy import processors 1.8 +from allpy import processors, markups 1.9 import allpy.base 1.10 from html import html_template 1.11 +from allpy.structure import CachedDownloadPdb, cached_download_pdb 1.12 1.13 -def remove_monomers_without_structure(block): 1.14 - monomers_with_structure = set() 1.15 +for path in list(sys.path): 1.16 + if 'allpy' in path: 1.17 + sys.path.append(path + '/utils') 1.18 +try: 1.19 + import extract_pfam 1.20 +except ImportError: 1.21 + print "source SETPATH script!" 1.22 + sys.exit() 1.23 1.24 - for sequence in block.sequences: 1.25 - for monomer in sequence: 1.26 - if hasattr(monomer, 'ca_xyz'): 1.27 - monomers_with_structure.add(monomer) 1.28 +markups.SequencePdbResiMarkup = extract_pfam.SequencePdbResiMarkup 1.29 1.30 - for sequence in block.sequences: 1.31 - to_delete = [] 1.32 - for i, monomer in enumerate(sequence): 1.33 - if monomer not in monomers_with_structure: 1.34 - to_delete.append(i) 1.35 - for n, i in enumerate(to_delete): 1.36 - assert not hasattr(sequence.pop(i-n), 'ca_xyz') 1.37 +def homology_from_3d(markup_file, homology_file, max_delta, alignment_file=None, 1.38 + out_alignment_file=None, out_pair_cores_file=None, out_html_file=None, 1.39 + pdb_getter=cached_download_pdb): 1.40 + """ Turn pdb markup into homology_file 1.41 1.42 - for column in block.columns: 1.43 - for s, m in column.items(): 1.44 - if m not in monomers_with_structure: 1.45 - assert not hasattr(column.pop(s), 'ca_xyz') 1.46 - 1.47 -def run(args): 1.48 - 1.49 + * markup_file -- file with pdb markup of alignment 1.50 + * homology_file -- output file with homology_file 1.51 + * alignment_file -- file with alignment_file (do not use with markup_file) 1.52 + Identifiers of pdb residues are calculated using pairwise alignment. 1.53 + * out_alignment_file -- output file with alignment (backwards-compatibility) 1.54 + * out_pair_cores_file -- output file with pair geomatrical cores 1.55 + * out_html_file -- output file with HTML representation of blocks 1.56 + * pdb_getter -- see structure.SequenceMixin.auto_pdb 1.57 + """ 1.58 + if markup_file: 1.59 + input_file = markup_file 1.60 + input_file_format = 'markup' 1.61 + elif alignment_file: 1.62 + input_file = alignment_file 1.63 + input_file_format = 'fasta' 1.64 + else: 1.65 + raise Exception("Provide input alignment or markup file") 1.66 try: 1.67 - alignment = Alignment().append_file(args.i) 1.68 + alignment = Alignment().append_file(input_file, format=input_file_format) 1.69 except: 1.70 - raise Exception() 1.71 - 1.72 - unique_sequences = list(set([s.name for s in alignment.sequences])) 1.73 - if len(unique_sequences) < len(alignment.sequences): 1.74 - alignment.sequences = unique_sequences 1.75 - 1.76 - bad_sequences = set() 1.77 + raise Exception("Can not read input file %s" % input_file) 1.78 + unique_sequence_names = set([s.name for s in alignment.sequences]) 1.79 + if len(unique_sequence_names) != len(alignment.sequences): 1.80 + raise Exception("Duplicate sequence names in %s" % input_file) 1.81 for sequence in copy(alignment.sequences): 1.82 try: 1.83 - sequence.auto_pdb(xyz_only=True) 1.84 + if markup_file: 1.85 + sequence.markups['pdb_resi'].add_pdb() 1.86 + else: 1.87 + sequence.auto_pdb(xyz_only=True, pdb_getter=pdb_getter) 1.88 except: 1.89 - bad_sequences.add(sequence) 1.90 - if bad_sequences: 1.91 - alignment.sequences = list(set(alignment.sequences)-bad_sequences) 1.92 - 1.93 + raise Exception("Can't load structure for sequence %s from file %s" % (sequence.name, input_file)) 1.94 if len(alignment.sequences) < 2: 1.95 - raise Exception() 1.96 - 1.97 + raise Exception("Alignment from file %s has too few sequences" % input_file) 1.98 block = Block.from_alignment(alignment) 1.99 - remove_monomers_without_structure(block) 1.100 - 1.101 - blocks = block.pair_core_parts(max_delta=args.d, timeout=0) 1.102 - 1.103 - alignment.blocks_to_homology(args.y, blocks) 1.104 - if args.o: 1.105 - block.to_file(args.o) 1.106 - if args.b: 1.107 - block.blocks_to_file(args.b, blocks) 1.108 - if args.H: 1.109 - block.blocks_to_html(args.H, blocks, open(html_template).read()) 1.110 - 1.111 -class A(object): 1.112 - pass 1.113 + blocks = block.pair_core_parts(max_delta=max_delta, timeout=0) 1.114 + alignment.blocks_to_homology(homology_file, blocks) 1.115 + if out_alignment_file: 1.116 + block.to_file(out_alignment_file) 1.117 + if out_pair_cores_file: 1.118 + block.blocks_to_file(out_pair_cores_file, blocks) 1.119 + if out_html_file: 1.120 + block.blocks_to_html(out_html_file, blocks, open(html_template).read()) 1.121 1.122 if __name__ == '__main__': 1.123 - 1.124 r = argparse.FileType('r') 1.125 w = argparse.FileType('w') 1.126 - 1.127 - p = argparse.ArgumentParser( 1.128 - description='PairCores', 1.129 - formatter_class=argparse.ArgumentDefaultsHelpFormatter, 1.130 - ) 1.131 - 1.132 + p = argparse.ArgumentParser(description='PairCores') 1.133 p.add_argument('-v','--version',action='version',version='%(prog)s 2.0') 1.134 - p.add_argument('-i',help='Input alignment file',metavar='FILE',type=r,required=True) 1.135 + p.add_argument('-m',help='Input pdb markup file',metavar='FILE',type=r) 1.136 + p.add_argument('-i',help='Input alignment file',metavar='FILE',type=r) 1.137 p.add_argument('-y',help='Output homology file',metavar='FILE',type=w, required=True) 1.138 + p.add_argument('-c',help='Pdb cache directory',metavar='FILE',type=str, default='pdb_cache') 1.139 p.add_argument('-d',help='Distance spreading',metavar='float',type=float,default=2.0) 1.140 p.add_argument('-o',help='Output alignment file',metavar='FILE',type=w) 1.141 p.add_argument('-b',help='Output pair_cores file',metavar='FILE',type=w) 1.142 p.add_argument('-H',help='Output HTML file',metavar='FILE',type=w) 1.143 + args = p.parse_args() 1.144 + homology_from_3d(markup_file=args.m, homology_file=args.y, max_delta=args.d, 1.145 + alignment_file=args.i, out_alignment_file=args.o, out_pair_cores_file=args.b, 1.146 + out_html_file=args.H, pdb_getter=CachedDownloadPdb(cache_dir=args.c)) 1.147 1.148 - args = p.parse_args() 1.149 - run(args) 1.150 -