Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/e361a7b7d9aa/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:05:04 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 262:e361a7b7d9aa

Moved contents of allpy.sequence into allpy.base
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Tue, 14 Dec 2010 21:06:42 +0300
parents d60628e29b24
children e3783fca343e
rev   line source
me@261 1 import sys
me@261 2 import os
me@262 3 import os.path
me@261 4 from tempfile import NamedTemporaryFile
me@262 5 import urllib2
me@261 6
me@261 7 import config
me@261 8 from graph import Graph
me@262 9 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO
me@262 10 from Bio.PDB.DSSP import make_dssp_dict
me@262 11 from allpy.pdb import std_id, pdb_id_parse, get_structure
bnagaev@249 12 from fasta import save_fasta
me@260 13 import data.codes
me@260 14
me@260 15 class MonomerType(object):
me@260 16 """Class of monomer types.
me@260 17
me@260 18 Each MonomerType object represents a known monomer type, e.g. Valine,
me@260 19 and is referenced to by each instance of monomer in a given sequence.
me@260 20
me@260 21 - `name`: full name of monomer type
me@260 22 - `code1`: one-letter code
me@260 23 - `code3`: three-letter code
me@260 24 - `is_modified`: either of True or False
me@260 25
me@260 26 class atributes:
me@260 27
me@260 28 - `by_code1`: a mapping from one-letter code to MonomerType object
me@260 29 - `by_code3`: a mapping from three-letter code to MonomerType object
me@260 30 - `by_name`: a mapping from monomer name to MonomerType object
me@260 31 - `instance_type`: class of Monomer objects to use when creating new
me@260 32 objects; this must be redefined in descendent classes
me@260 33
me@260 34 All of the class attributes MUST be redefined when subclassing.
me@260 35 """
me@260 36
me@260 37 by_code1 = {}
me@260 38 by_code3 = {}
me@260 39 by_name = {}
me@260 40 instance_type = None
me@260 41
me@260 42 def __init__(self, name="", code1="", code3="", is_modified=False):
me@260 43 self.name = name.capitalize()
me@260 44 self.code1 = code1.upper()
me@260 45 self.code3 = code3.upper()
me@260 46 self.is_modified = bool(is_modified)
me@260 47 if not is_modified:
me@260 48 self.by_code1[self.code1] = self
me@260 49 self.by_code3[code3] = self
me@260 50 self.by_name[name] = self
me@260 51 # We duplicate distinguished long names into MonomerType itself,
me@260 52 # so that we can use MonomerType.from_code3 to create the relevant
me@260 53 # type of monomer.
me@260 54 MonomerType.by_code3[code3] = self
me@260 55 MonomerType.by_name[name] = self
me@260 56
me@260 57 @classmethod
me@260 58 def _initialize(cls, type_letter, codes=data.codes.codes):
me@260 59 """Create all relevant instances of MonomerType.
me@260 60
me@260 61 `type_letter` is either of:
me@260 62
me@260 63 - 'p' for protein
me@260 64 - 'd' for DNA
me@260 65 - 'r' for RNA
me@260 66
me@260 67 `codes` is a table of monomer codes
me@260 68 """
me@260 69 for type, code1, is_modified, code3, name in codes:
me@260 70 if type == type_letter:
me@260 71 cls(name, code1, code3, is_modified)
me@260 72
me@260 73 @classmethod
me@260 74 def from_code1(cls, code1):
me@260 75 """Return monomer type by one-letter code."""
me@260 76 return cls.by_code1[code1.upper()]
me@260 77
me@260 78 @classmethod
me@260 79 def from_code3(cls, code3):
me@260 80 """Return monomer type by three-letter code."""
me@260 81 return cls.by_code3[code3.upper()]
me@260 82
me@260 83 @classmethod
me@260 84 def from_name(cls, name):
me@260 85 """Return monomer type by name."""
me@260 86 return cls.by_name[name.capitalize()]
me@260 87
me@260 88 def instance(self):
me@260 89 """Create a new monomer of given type."""
me@260 90 return self.instance_type(self)
me@260 91
me@260 92 def __eq__(self, other):
me@260 93 if hasattr(other, "type"):
me@260 94 return self is other.type
me@260 95 return self is other
me@260 96
me@260 97 class Monomer(object):
me@260 98 """Monomer object.
me@260 99
me@260 100 attributes:
me@260 101
me@260 102 - `type`: type of monomer (a MonomerType object)
me@260 103
me@260 104 class attribute `monomer_type` is MonomerType or either of it's subclasses,
me@260 105 it is used when creating new monomers. It MUST be redefined when subclassing Monomer.
me@260 106 """
me@260 107 monomer_type = MonomerType
me@260 108
me@260 109 def __init__(self, type):
me@260 110 self.type = type
me@260 111
me@260 112 @classmethod
me@260 113 def from_code1(cls, code1):
me@260 114 return cls(cls.monomer_type.by_code1[code1.upper()])
me@260 115
me@260 116 @classmethod
me@260 117 def from_code3(cls, code3):
me@260 118 return cls(cls.monomer_type.by_code3[code3.upper()])
me@260 119
me@260 120 @classmethod
me@260 121 def from_name(cls, name):
me@260 122 return cls(cls.monomer_type.by_name[name.capitalize()])
me@260 123
me@260 124 def __eq__(self, other):
me@260 125 if hasattr(other, "type"):
me@260 126 return self.type is other.type
me@260 127 return self.type is other
bnagaev@239 128
bnagaev@239 129 class Sequence(list):
bnagaev@243 130 """ Sequence of Monomers
bnagaev@243 131
me@262 132 list of monomer objects (aminoacids or nucleotides)
bnagaev@243 133
bnagaev@243 134 Mandatory data:
bnagaev@243 135 * name -- str with the name of sequence
bnagaev@243 136 * description -- str with description of the sequence
me@262 137
me@262 138 Optional (may be empty):
me@262 139 * source -- source of sequence
me@262 140 * pdb_chain -- Bio.PDB.Chain
me@262 141 * pdb_file -- file object
me@262 142
me@262 143 * pdb_residues -- {Monomer: Bio.PDB.Residue}
me@262 144 * pdb_secstr -- {Monomer: 'Secondary structure'}
me@262 145 Code Secondary structure
me@262 146 H alpha-helix
me@262 147 B Isolated beta-bridge residue
me@262 148 E Strand
me@262 149 G 3-10 helix
me@262 150 I pi-helix
me@262 151 T Turn
me@262 152 S Bend
me@262 153 - Other
me@262 154
me@262 155
me@262 156 ?TODO: global pdb_structures
bnagaev@243 157 """
me@262 158 def __init__(self, monomers=None, name='', description=""):
me@262 159 if not monomers:
me@262 160 monomers = []
me@262 161 self.name = name
me@262 162 self.description = description
me@262 163 self.monomers = monomers
me@262 164 self.pdb_chains = []
me@262 165 self.pdb_files = {}
me@262 166 self.pdb_residues = {}
me@262 167 self.pdb_secstr = {}
me@262 168
me@262 169 def __len__(self):
me@262 170 return len(self.monomers)
me@262 171
me@262 172 def __str__(self):
me@262 173 """ Returns sequence in one-letter code """
me@262 174 return ''.join([monomer.type.code1 for monomer in self.monomers])
me@262 175
me@262 176 def __eq__(self, other):
me@262 177 """ Returns if all corresponding monomers of this sequences are equal
me@262 178
me@262 179 If lengths of sequences are not equal, returns False
me@262 180 """
me@262 181 return len(self) == len(other) and \
me@262 182 all([a==b for a, b in zip(self.monomers, other.monomers)])
me@262 183
me@262 184 def __ne__(self, other):
me@262 185 return not (self == other)
me@262 186
me@262 187 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain, pdb_model=0):
me@262 188 """ Reads Pdb chain from file
me@262 189
me@262 190 and align each Monomer with PDB.Residue (TODO)
me@262 191 """
me@262 192 name = std_id(pdb_id, pdb_chain, pdb_model)
me@262 193 structure = get_structure(pdb_file, name)
me@262 194 chain = structure[pdb_model][pdb_chain]
me@262 195 self.pdb_chains.append(chain)
me@262 196 self.pdb_residues[chain] = {}
me@262 197 self.pdb_secstr[chain] = {}
me@262 198 pdb_sequence = Sequence.from_pdb_chain(chain)
me@262 199 a = alignment.Alignment.from_sequences(self, pdb_sequence)
me@262 200 a.muscle_align()
me@262 201 for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self):
me@262 202 if pdb_sequence.pdb_has(chain, pdb_monomer):
me@262 203 residue = pdb_sequence.pdb_residues[chain][pdb_monomer]
me@262 204 self.pdb_residues[chain][monomer] = residue
me@262 205 self.pdb_files[chain] = pdb_file
me@262 206
me@262 207 def pdb_unload(self):
me@262 208 """ Delete all pdb-connected links """
me@262 209 #~ gc.get_referrers(self.pdb_chains[0])
me@262 210 self.pdb_chains = []
me@262 211 self.pdb_residues = {}
me@262 212 self.pdb_secstr = {} # FIXME
me@262 213 self.pdb_files = {} # FIXME
me@262 214
me@262 215 @staticmethod
me@262 216 def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType):
me@262 217 """ Import data from one-letter code
me@262 218
me@262 219 monomer_kind is class, inherited from MonomerType
me@262 220 """
me@262 221 monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str]
me@262 222 return Sequence(monomers, name, description)
me@262 223
me@262 224 @staticmethod
me@262 225 def from_pdb_chain(chain):
me@262 226 """ Returns Sequence with Monomers with link to Bio.PDB.Residue
me@262 227
me@262 228 chain is Bio.PDB.Chain
me@262 229 """
me@262 230 cappbuilder = CaPPBuilder()
me@262 231 peptides = cappbuilder.build_peptides(chain)
me@262 232 sequence = Sequence()
me@262 233 sequence.pdb_chains = [chain]
me@262 234 sequence.pdb_residues[chain] = {}
me@262 235 sequence.pdb_secstr[chain] = {}
me@262 236 for peptide in peptides:
me@262 237 for ca_atom in peptide.get_ca_list():
me@262 238 residue = ca_atom.get_parent()
me@262 239 monomer = AminoAcidType.from_pdb_residue(residue).instance()
me@262 240 sequence.pdb_residues[chain][monomer] = residue
me@262 241 sequence.monomers.append(monomer)
me@262 242 return sequence
me@262 243
me@262 244 def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'):
me@262 245 """ Adds pdb information to each monomer
me@262 246
me@262 247 Returns if information has been successfully added
me@262 248 TODO: conformity_file
me@262 249
me@262 250 id-format lava flow
me@262 251 """
me@262 252 if not conformity_info:
me@262 253 path = os.path.join(pdb_directory, self.name)
me@262 254 if os.path.exists(path) and os.path.getsize(path):
me@262 255 match = pdb_id_parse(self.name)
me@262 256 self.pdb_chain_add(open(path), match['code'],
me@262 257 match['chain'], match['model'])
me@262 258 else:
me@262 259 match = pdb_id_parse(self.name)
me@262 260 if match:
me@262 261 code = match['code']
me@262 262 pdb_filename = config.pdb_dir % code
me@262 263 if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename):
me@262 264 url = config.pdb_url % code
me@262 265 print "Download %s" % url
me@262 266 pdb_file = open(pdb_filename, 'w')
me@262 267 data = urllib2.urlopen(url).read()
me@262 268 pdb_file.write(data)
me@262 269 pdb_file.close()
me@262 270 print "Save %s" % pdb_filename
me@262 271 pdb_file = open(pdb_filename)
me@262 272 self.pdb_chain_add(pdb_file, code, match['chain'], match['model'])
me@262 273
me@262 274 def pdb_save(self, out_filename, pdb_chain):
me@262 275 """ Saves pdb_chain to out_file """
me@262 276 class GlySelect(Select):
me@262 277 def accept_chain(self, chain):
me@262 278 if chain == pdb_chain:
me@262 279 return 1
me@262 280 else:
me@262 281 return 0
me@262 282 io = PDBIO()
me@262 283 structure = chain.get_parent()
me@262 284 io.set_structure(structure)
me@262 285 io.save(out_filename, GlySelect())
me@262 286
me@262 287
me@262 288 def pdb_add_sec_str(self, pdb_chain):
me@262 289 """ Add secondary structure data """
me@262 290 tmp_file = NamedTemporaryFile(delete=False)
me@262 291 tmp_file.close()
me@262 292 pdb_file = self.pdb_files[pdb_chain].name
me@262 293 os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name})
me@262 294 dssp, keys = make_dssp_dict(tmp_file.name)
me@262 295 for monomer in self.monomers:
me@262 296 if self.pdb_has(pdb_chain, monomer):
me@262 297 residue = self.pdb_residues[pdb_chain][monomer]
me@262 298 try:
me@262 299 d = dssp[(pdb_chain.get_id(), residue.get_id())]
me@262 300 self.pdb_secstr[pdb_chain][monomer] = d[1]
me@262 301 except:
me@262 302 print "No dssp information about %s at %s" % (monomer, pdb_chain)
me@262 303 os.unlink(tmp_file.name)
me@262 304
me@262 305 def pdb_has(self, chain, monomer):
me@262 306 return chain in self.pdb_residues and monomer in self.pdb_residues[chain]
me@262 307
me@262 308 def secstr_has(self, chain, monomer):
me@262 309 return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain]
me@262 310
me@262 311 @staticmethod
me@262 312 def file_slice(file, n_from, n_to, fasta_name='', name='', description='', monomer_kind=AminoAcidType):
me@262 313 """ Build and return sequence, consisting of part of sequence from file
me@262 314
me@262 315 Does not control gaps
me@262 316 """
me@262 317 inside = False
me@262 318 number_used = 0
me@262 319 s = ''
me@262 320 for line in file:
me@262 321 line = line.split()
me@262 322 if not inside:
me@262 323 if line.startswith('>%s' % fasta_name):
me@262 324 inside = True
me@262 325 else:
me@262 326 n = len(line)
me@262 327 s += line[(n_from - number_user):(n_to - number_user)]
me@262 328 return Sequence.from_str(s, name, description, monomer_kind)
bnagaev@243 329
bnagaev@249 330 class Alignment(dict):
bnagaev@249 331 """ Alignment
bnagaev@249 332
bnagaev@249 333 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
bnagaev@249 334 keys are the Sequence objects, values are the lists, which
bnagaev@249 335 contain monomers of those sequences or None for gaps in the
bnagaev@249 336 corresponding sequence of alignment
bnagaev@249 337 """
bnagaev@249 338 # _sequences -- list of Sequence objects. Sequences don't contain gaps
bnagaev@249 339 # - see sequence.py module
bnagaev@249 340
bnagaev@249 341 def __init__(self, *args):
bnagaev@249 342 """overloaded constructor
bnagaev@249 343
bnagaev@249 344 Alignment() -> new empty Alignment
bnagaev@249 345 Alignment(sequences, body) -> new Alignment with sequences and
bnagaev@249 346 body initialized from arguments
bnagaev@249 347 Alignment(fasta_file) -> new Alignment, read body and sequences
bnagaev@249 348 from fasta file
bnagaev@249 349
bnagaev@249 350 """
bnagaev@249 351 if len(args)>1:#overloaded constructor
bnagaev@249 352 self.sequences=args[0]
bnagaev@249 353 self.body=args[1]
bnagaev@249 354 elif len(args)==0:
bnagaev@249 355 self.sequences=[]
bnagaev@249 356 self.body={}
bnagaev@249 357 else:
bnagaev@249 358 self.sequences, self.body = Alignment.from_fasta(args[0])
bnagaev@249 359
bnagaev@249 360 def length(self):
bnagaev@249 361 """ Returns width, ie length of each sequence with gaps """
bnagaev@249 362 return max([len(line) for line in self.body.values()])
bnagaev@249 363
bnagaev@249 364 def height(self):
bnagaev@249 365 """ The number of sequences in alignment (it's thickness). """
bnagaev@249 366 return len(self.body)
bnagaev@249 367
bnagaev@249 368 def identity(self):
bnagaev@249 369 """ Calculate the identity of alignment positions for colouring.
bnagaev@249 370
bnagaev@249 371 For every (row, column) in alignment the percentage of the exactly
bnagaev@249 372 same residue in the same column in the alignment is calculated.
bnagaev@249 373 The data structure is just like the Alignment.body, but istead of
bnagaev@249 374 monomers it contains float percentages.
bnagaev@249 375 """
bnagaev@249 376 # Oh, God, that's awful! Absolutely not understandable.
bnagaev@249 377 # First, calculate percentages of amino acids in every column
bnagaev@249 378 contribution = 1.0 / len(self.sequences)
bnagaev@249 379 all_columns = []
bnagaev@249 380 for position in range(len(self)):
bnagaev@249 381 column_percentage = {}
bnagaev@249 382 for seq in self.body:
bnagaev@249 383 if self.body[seq][position] is not None:
bnagaev@249 384 aa = self.body[seq][position].code
bnagaev@249 385 else:
bnagaev@249 386 aa = None
bnagaev@249 387 if aa in allpy.data.amino_acids:
bnagaev@249 388 if aa in column_percentage.keys():
bnagaev@249 389 column_percentage[aa] += contribution
bnagaev@249 390 else:
bnagaev@249 391 column_percentage[aa] = contribution
bnagaev@249 392 all_columns.append(column_percentage)
bnagaev@249 393 # Second, map these percentages onto the alignment
bnagaev@249 394 self.identity_percentages = {}
bnagaev@249 395 for seq in self.sequences:
bnagaev@249 396 self.identity_percentages[seq] = []
bnagaev@249 397 for seq in self.identity_percentages:
bnagaev@249 398 line = self.identity_percentages[seq]
bnagaev@249 399 for position in range(len(self)):
bnagaev@249 400 if self.body[seq][position] is not None:
bnagaev@249 401 aa = self.body[seq][position].code
bnagaev@249 402 else:
bnagaev@249 403 aa = None
bnagaev@249 404 line.append(all_columns[position].get(aa))
bnagaev@249 405 return self.identity_percentages
bnagaev@249 406
bnagaev@249 407 @staticmethod
bnagaev@249 408 def from_fasta(file, monomer_kind=AminoAcidType):
bnagaev@249 409 """ Import data from fasta file
bnagaev@249 410
bnagaev@249 411 monomer_kind is class, inherited from MonomerType
bnagaev@249 412
bnagaev@249 413 >>> import alignment
bnagaev@249 414 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
bnagaev@249 415 """
bnagaev@249 416 import re
bnagaev@249 417
bnagaev@249 418 sequences = []
bnagaev@249 419 body = {}
bnagaev@249 420
bnagaev@249 421 raw_sequences = file.read().split(">")
bnagaev@249 422 if len(raw_sequences) <= 1:
bnagaev@249 423 raise Exception("Wrong format of fasta-file %s" % file.name)
bnagaev@249 424
bnagaev@249 425 raw_sequences = raw_sequences[1:] #ignore everything before the first >
bnagaev@249 426 for raw in raw_sequences:
bnagaev@249 427 parsed_raw_sequence = raw.split("\n")
bnagaev@249 428 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
bnagaev@249 429 name_and_description = parsed_raw_sequence[0]
bnagaev@249 430 name_and_description = name_and_description.split(" ",1)
bnagaev@249 431 if len(name_and_description) == 2:
bnagaev@249 432 name, description = name_and_description
bnagaev@249 433 elif len(name_and_description) == 1:
bnagaev@249 434 #if there is description
bnagaev@249 435 name = name_and_description[0]
bnagaev@249 436 description = ''
bnagaev@249 437 else:
bnagaev@249 438 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 439 {'name': name, 'file': file.name})
bnagaev@249 440
bnagaev@249 441 if len(parsed_raw_sequence) <= 1:
bnagaev@249 442 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 443 {'name': name, 'file': file.name})
bnagaev@249 444 string = ""
bnagaev@249 445 for piece in parsed_raw_sequence[1:]:
bnagaev@249 446 piece_without_whitespace_chars = re.sub("\s", "", piece)
bnagaev@249 447 string += piece_without_whitespace_chars
bnagaev@249 448 monomers = [] #convert into Monomer objects
bnagaev@249 449 body_list = [] #create the respective list in body dict
bnagaev@249 450 for current_monomer in string:
bnagaev@249 451 if current_monomer not in ["-", ".", "~"]:
bnagaev@249 452 monomers.append(monomer_kind.from_code1(current_monomer).instance())
bnagaev@249 453 body_list.append(monomers[-1])
bnagaev@249 454 else:
bnagaev@249 455 body_list.append(None)
bnagaev@249 456 s = sequence.Sequence(monomers, name, description)
bnagaev@249 457 sequences.append(s)
bnagaev@249 458 body[s] = body_list
bnagaev@249 459 return sequences, body
bnagaev@249 460
bnagaev@249 461 @staticmethod
bnagaev@249 462 def from_sequences(*sequences):
bnagaev@249 463 """ Constructs new alignment from sequences
bnagaev@249 464
bnagaev@249 465 Add None's to right end to make equal lengthes of alignment sequences
bnagaev@249 466 """
bnagaev@249 467 alignment = Alignment()
bnagaev@249 468 alignment.sequences = sequences
bnagaev@249 469 max_length = max(len(sequence) for sequence in sequences)
bnagaev@249 470 for sequence in sequences:
bnagaev@249 471 gaps_count = max_length - len(sequence)
bnagaev@249 472 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
bnagaev@249 473 return alignment
bnagaev@249 474
bnagaev@249 475 def save_fasta(self, out_file, long_line=70, gap='-'):
bnagaev@249 476 """ Saves alignment to given file
bnagaev@249 477
bnagaev@249 478 Splits long lines to substrings of length=long_line
bnagaev@249 479 To prevent this, set long_line=None
bnagaev@249 480 """
bnagaev@249 481 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
bnagaev@249 482
bnagaev@249 483 def muscle_align(self):
bnagaev@249 484 """ Simple align ths alignment using sequences (muscle)
bnagaev@249 485
bnagaev@249 486 uses old Monomers and Sequences objects
bnagaev@249 487 """
bnagaev@249 488 tmp_file = NamedTemporaryFile(delete=False)
bnagaev@249 489 self.save_fasta(tmp_file)
bnagaev@249 490 tmp_file.close()
bnagaev@249 491 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
bnagaev@249 492 sequences, body = Alignment.from_fasta(open(tmp_file.name))
bnagaev@249 493 for sequence in self.sequences:
bnagaev@249 494 try:
bnagaev@249 495 new_sequence = [i for i in sequences if sequence==i][0]
bnagaev@249 496 except:
bnagaev@249 497 raise Exception("Align: Cann't find sequence %s in muscle output" % \
bnagaev@249 498 sequence.name)
bnagaev@249 499 old_monomers = iter(sequence.monomers)
bnagaev@249 500 self.body[sequence] = []
bnagaev@249 501 for monomer in body[new_sequence]:
bnagaev@249 502 if not monomer:
bnagaev@249 503 self.body[sequence].append(monomer)
bnagaev@249 504 else:
bnagaev@249 505 old_monomer = old_monomers.next()
bnagaev@249 506 if monomer != old_monomer:
bnagaev@249 507 raise Exception("Align: alignment errors")
bnagaev@249 508 self.body[sequence].append(old_monomer)
bnagaev@249 509 os.unlink(tmp_file.name)
bnagaev@249 510
bnagaev@249 511 def column(self, sequence=None, sequences=None, original=None):
bnagaev@249 512 """ returns list of columns of alignment
bnagaev@249 513
bnagaev@249 514 sequence or sequences:
bnagaev@249 515 if sequence is given, then column is (original_monomer, monomer)
bnagaev@249 516 if sequences is given, then column is (original_monomer, {sequence: monomer})
bnagaev@249 517 if both of them are given, it is an error
bnagaev@249 518 original (Sequence type):
bnagaev@249 519 if given, this filters only columns represented by original sequence
bnagaev@249 520 """
bnagaev@249 521 if sequence and sequences:
bnagaev@249 522 raise Exception("Wrong usage. read help")
bnagaev@249 523 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
bnagaev@249 524 alignment = self.body.items()
bnagaev@249 525 alignment.sort(key=lambda i: indexes[i[0]])
bnagaev@249 526 alignment = [monomers for seq, monomers in alignment]
bnagaev@249 527 for column in zip(*alignment):
bnagaev@249 528 if not original or column[indexes[original]]:
bnagaev@249 529 if sequence:
bnagaev@249 530 yield (column[indexes[original]], column[indexes[sequence]])
bnagaev@249 531 else:
bnagaev@249 532 yield (column[indexes[original]],
bnagaev@249 533 dict([(s, column[indexes[s]]) for s in sequences]))
bnagaev@249 534
bnagaev@249 535 def secstr(self, sequence, pdb_chain, gap='-'):
bnagaev@249 536 """ Returns string representing secondary structure """
bnagaev@249 537 return ''.join([
bnagaev@249 538 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
bnagaev@249 539 for m in self.body[sequence]])
bnagaev@249 540
bnagaev@249 541 class Block(object):
me@261 542 """ Block of alignment
me@261 543
me@261 544 Mandatory data:
me@261 545 * self.alignment -- alignment object, which the block belongs to
me@261 546 * self.sequences - set of sequence objects that contain monomers
me@261 547 and/or gaps, that constitute the block
me@261 548 * self.positions -- list of positions of the alignment.body that
me@261 549 are included in the block; position[i+1] is always to the right from position[i]
me@261 550
me@261 551 Don't change self.sequences -- it may be a link to other block.sequences
me@261 552
me@261 553 How to create a new block:
me@261 554 >>> import alignment
me@261 555 >>> import block
me@261 556 >>> proj = alignment.Alignment(open("test.fasta"))
me@261 557 >>> block1 = block.Block(proj)
me@261 558 """
me@261 559
me@261 560 def __init__(self, alignment, sequences=None, positions=None):
me@261 561 """ Builds new block from alignment
me@261 562
me@261 563 if sequences==None, all sequences are used
me@261 564 if positions==None, all positions are used
me@261 565 """
me@261 566 if sequences == None:
me@261 567 sequences = set(alignment.sequences) # copy
me@261 568 if positions == None:
me@261 569 positions = range(len(alignment))
me@261 570 self.alignment = alignment
me@261 571 self.sequences = sequences
me@261 572 self.positions = positions
me@261 573
me@261 574 def save_fasta(self, out_file, long_line=70, gap='-'):
me@261 575 """ Saves alignment to given file in fasta-format
me@261 576
me@261 577 No changes in the names, descriptions or order of the sequences
me@261 578 are made.
me@261 579 """
me@261 580 for sequence in self.sequences:
me@261 581 alignment_monomers = self.alignment.body[sequence]
me@261 582 block_monomers = [alignment_monomers[i] for i in self.positions]
me@261 583 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])
me@261 584 save_fasta(out_file, string, sequence.name, sequence.description, long_line)
me@261 585
me@261 586 def geometrical_cores(self, max_delta=config.delta,
me@261 587 timeout=config.timeout, minsize=config.minsize,
me@261 588 ac_new_atoms=config.ac_new_atoms,
me@261 589 ac_count=config.ac_count):
me@261 590 """ Returns length-sorted list of blocks, representing GCs
me@261 591
me@261 592 max_delta -- threshold of distance spreading
me@261 593 timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)
me@261 594 minsize -- min size of each core
me@261 595 ac_new_atoms -- min part or new atoms in new alternative core
me@261 596 current GC is compared with each of already selected GCs
me@261 597 if difference is less then ac_new_atoms, current GC is skipped
me@261 598 difference = part of new atoms in current core
me@261 599 ac_count -- max number of cores (including main core)
me@261 600 -1 means infinity
me@261 601 If more than one pdb chain for some sequence provided, consider all of them
me@261 602 cost is calculated as 1 / (delta + 1)
me@261 603 delta in [0, +inf) => cost in (0, 1]
me@261 604 """
me@261 605 nodes = self.positions
me@261 606 lines = {}
me@261 607 for i in self.positions:
me@261 608 for j in self.positions:
me@261 609 if i < j:
me@261 610 distances = []
me@261 611 for sequence in self.sequences:
me@261 612 for chain in sequence.pdb_chains:
me@261 613 m1 = self.alignment.body[sequence][i]
me@261 614 m2 = self.alignment.body[sequence][j]
me@261 615 if m1 and m2:
me@261 616 r1 = sequence.pdb_residues[chain][m1]
me@261 617 r2 = sequence.pdb_residues[chain][m2]
me@261 618 ca1 = r1['CA']
me@261 619 ca2 = r2['CA']
me@261 620 d = ca1 - ca2 # Bio.PDB feature
me@261 621 distances.append(d)
me@261 622 if len(distances) >= 2:
me@261 623 delta = max(distances) - min(distances)
me@261 624 if delta <= max_delta:
me@261 625 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)
me@261 626 graph = Graph(nodes, lines)
me@261 627 cliques = graph.cliques(timeout=timeout, minsize=minsize)
me@261 628 GCs = []
me@261 629 for clique in cliques:
me@261 630 for GC in GCs:
me@261 631 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):
me@261 632 break
me@261 633 else:
me@261 634 GCs.append(Block(self.alignment, self.sequences, clique))
me@261 635 if ac_count != -1 and len(GCs) >= ac_count:
me@261 636 break
me@261 637 return GCs
me@261 638
me@261 639 def xstring(self, x='X', gap='-'):
me@261 640 """ Returns string consisting of gap chars and chars x at self.positions
me@261 641
me@261 642 Length of returning string = length of alignment
me@261 643 """
me@261 644 monomers = [False] * len(self.alignment)
me@261 645 for i in self.positions:
me@261 646 monomers[i] = True
me@261 647 return ''.join([x if m else gap for m in monomers])
me@261 648
me@261 649 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):
me@261 650 """ Save xstring and name in fasta format """
me@261 651 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)
me@261 652
me@261 653 def monomers(self, sequence):
me@261 654 """ Iterates monomers of this sequence from this block """
me@261 655 alignment_sequence = self.alignment.body[sequence]
me@261 656 return (alignment_sequence[i] for i in self.positions)
me@261 657
me@261 658 def ca_atoms(self, sequence, pdb_chain):
me@261 659 """ Iterates Ca-atom of monomers of this sequence from this block """
me@261 660 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())
me@261 661
me@261 662 def sequences_chains(self):
me@261 663 """ Iterates pairs (sequence, chain) """
me@261 664 for sequence in self.alignment.sequences:
me@261 665 if sequence in self.sequences:
me@261 666 for chain in sequence.pdb_chains:
me@261 667 yield (sequence, chain)
me@261 668
me@261 669 def superimpose(self):
me@261 670 """ Superimpose all pdb_chains in this block """
me@261 671 sequences_chains = list(self.sequences_chains())
me@261 672 if len(sequences_chains) >= 1:
me@261 673 sup = Superimposer()
me@261 674 fixed_sequence, fixed_chain = sequences_chains.pop()
me@261 675 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)
me@261 676 for sequence, chain in sequences_chains:
me@261 677 moving_atoms = self.ca_atoms(sequence, chain)
me@261 678 sup.set_atoms(fixed_atoms, moving_atoms)
me@261 679 # Apply rotation/translation to the moving atoms
me@261 680 sup.apply(moving_atoms)
me@261 681
me@261 682 def pdb_save(self, out_file):
me@261 683 """ Save all sequences
me@261 684
me@261 685 Returns {(sequence, chain): CHAIN}
me@261 686 CHAIN is chain letter in new file
me@261 687 """
me@261 688 tmp_file = NamedTemporaryFile(delete=False)
me@261 689 tmp_file.close()
me@261 690
me@261 691 for sequence, chain in self.sequences_chains():
me@261 692 sequence.pdb_save(tmp_file.name, chain)
me@261 693 # TODO: read from tmp_file.name
me@261 694 # change CHAIN
me@261 695 # add to out_file
me@261 696
me@261 697 os.unlink(NamedTemporaryFile)
bnagaev@239 698
me@260 699 # vim: set ts=4 sts=4 sw=4 et: