allpy: d7ad71def6c8 allpy/base.py

allpy

view allpy/base.py @ 289:d7ad71def6c8

Experimenting with styles of docstrings in allpy.base.Alignment (to be recognized by Sphinx)

author	Daniil Alexeyevsky <me.dendik@gmail.com>
date	Thu, 16 Dec 2010 01:25:54 +0300
parents	9c68d8eab8f5
children	751048390c45

line source

1 import sys

2 import os

3 import os.path

4 from tempfile import NamedTemporaryFile

5 import urllib2

7 import config

8 import fasta

9 from graph import Graph

10 from Bio.PDB.DSSP import make_dssp_dict

11 import data.codes

13 class MonomerType(object):

14 """Class of monomer types.

16 Each MonomerType object represents a known monomer type, e.g. Valine,

17 and is referenced to by each instance of monomer in a given sequence.

19 - `name`: full name of monomer type

20 - `code1`: one-letter code

21 - `code3`: three-letter code

22 - `is_modified`: either of True or False

24 class atributes:

26 - `by_code1`: a mapping from one-letter code to MonomerType object

27 - `by_code3`: a mapping from three-letter code to MonomerType object

28 - `by_name`: a mapping from monomer name to MonomerType object

29 - `instance_type`: class of Monomer objects to use when creating new

30 objects; this must be redefined in descendent classes

32 All of the class attributes MUST be redefined when subclassing.

33 """

35 by_code1 = {}

36 by_code3 = {}

37 by_name = {}

38 instance_type = None

40 def __init__(self, name="", code1="", code3="", is_modified=False):

41 self.name = name.capitalize()

42 self.code1 = code1.upper()

43 self.code3 = code3.upper()

44 self.is_modified = bool(is_modified)

45 if not is_modified:

46 self.by_code1[self.code1] = self

47 self.by_code3[code3] = self

48 self.by_name[name] = self

49 # We duplicate distinguished long names into MonomerType itself,

50 # so that we can use MonomerType.from_code3 to create the relevant

51 # type of monomer.

52 MonomerType.by_code3[code3] = self

53 MonomerType.by_name[name] = self

55 @classmethod

56 def _initialize(cls, type_letter, codes=data.codes.codes):

57 """Create all relevant instances of MonomerType.

59 `type_letter` is either of:

61 - 'p' for protein

62 - 'd' for DNA

63 - 'r' for RNA

65 `codes` is a table of monomer codes

66 """

67 for type, code1, is_modified, code3, name in codes:

68 if type == type_letter:

69 cls(name, code1, code3, is_modified)

71 @classmethod

72 def from_code1(cls, code1):

73 """Return monomer type by one-letter code."""

74 return cls.by_code1[code1.upper()]

76 @classmethod

77 def from_code3(cls, code3):

78 """Return monomer type by three-letter code."""

79 return cls.by_code3[code3.upper()]

81 @classmethod

82 def from_name(cls, name):

83 """Return monomer type by name."""

84 return cls.by_name[name.capitalize()]

86 def instance(self):

87 """Create a new monomer of given type."""

88 return self.instance_type(self)

90 def __eq__(self, other):

91 if hasattr(other, "type"):

92 return self is other.type

93 return self is other

95 class Monomer(object):

96 """Monomer object.

98 attributes:

100 - `type`: type of monomer (a MonomerType object)

101

102 class attributes:

103

104 - `monomer_type`: either MonomerType or one of it's subclasses, it is used

105 when creating new monomers. It SHOULD be redefined when subclassing

106 Monomer.

107 """

108 monomer_type = MonomerType

109

110 def __init__(self, type):

111 self.type = type

112

113 @classmethod

114 def from_code1(cls, code1):

115 return cls(cls.monomer_type.by_code1[code1.upper()])

116

117 @classmethod

118 def from_code3(cls, code3):

119 return cls(cls.monomer_type.by_code3[code3.upper()])

120

121 @classmethod

122 def from_name(cls, name):

123 return cls(cls.monomer_type.by_name[name.capitalize()])

124

125 def __eq__(self, other):

126 if hasattr(other, "type"):

127 return self.type is other.type

128 return self.type is other

129

130 class Sequence(list):

131 """Sequence of Monomers.

132

133 This behaves like list of monomer objects. In addition to standard list

134 behaviour, Sequence has the following attributes:

135

136 * name -- str with the name of the sequence

137 * description -- str with description of the sequence

138 * source -- str denoting source of the sequence

139

140 Any of them may be empty (i.e. hold empty string)

141

142 Class attributes:

143

144 * monomer_type -- type of monomers in sequence, must be redefined when

145 subclassing

146 """

147

148 monomer_type = Monomer

149

150 name = ''

151 description = ''

152 source = ''

153

154 def __init__(self, sequence=[], name=None, description=None, source=None):

155 super(Sequence, self).__init__(sequence)

156 if hasattr(sequence, 'name'):

157 vars(self).update(vars(sequence))

158 if name:

159 self.name = name

160 if description:

161 self.description = description

162 if source:

163 self.source = source

164

165 def __str__(self):

166 """Returns sequence in one-letter code."""

167 return ''.join(monomer.code1 for monomer in self)

168

169 @classmethod

170 def from_string(cls, string, name='', description=''):

171 """Create sequences from string of one-letter codes."""

172 monomer = cls.monomer_type.from_code1

173 monomers = [monomer(letter) for letter in string]

174 return cls(monomers, name, description)

175

176 @classmethod

177 def from_fasta(cls, file):

178 """Read sequence from FASTA file.

179

180 File must contain exactly one sequence.

181 """

182 sequences = fasta.parse_file(file)

183 assert len(sequences) == 1

184 name, description = sequences.keys()[0]

185 return cls(sequences[header], name, description, file.name)

186

187 class Alignment(list):

188 """Alignment. Behaves like a list of Columns."""

189

190 sequence_type = Sequence

191 """Type of sequences in alignment. SHOULD be redefined when subclassing."""

192

193 sequences = None

194 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

195

196 def __init__(self):

197 """Initialize empty alignment."""

198 super(Alignment, self).__init__()

199 self.sequences = []

200

201 def add_gapped_line(self, line, name='', description='', source=''):

202 """Add row from a line of one-letter codes and gaps."""

203 Sequence = cls.sequence_type

204 not_gap = lambda (i, char): char != "-"

205 no_gaps = line.replace("-", "")

206 sequence = Sequence(no_gaps, name, description, source)

207 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):

208 self[j][seq] = sequence[i]

209 self.sequences.append(sequence)

210

211 @classmethod

212 def from_fasta(cls, file):

213 """Create new alignment from FASTA file."""

214 self = cls()

215 for ((name, description), body) in fasta.parse_file(file):

216 self.add_gapped_line(body, name, description)

217 return self

218

219 def length(self):

220 """Return width, ie length of each sequence with gaps."""

221 return max([len(line) for line in self.body.values()])

222

223 def height(self):

224 """ The number of sequences in alignment (it's thickness). """

225 return len(self.body)

226

227 def identity(self):

228 """ Calculate the identity of alignment positions for colouring.

229

230 For every (row, column) in alignment the percentage of the exactly

231 same residue in the same column in the alignment is calculated.

232 The data structure is just like the Alignment.body, but istead of

233 monomers it contains float percentages.

234 """

235 # Oh, God, that's awful! Absolutely not understandable.

236 # First, calculate percentages of amino acids in every column

237 contribution = 1.0 / len(self.sequences)

238 all_columns = []

239 for position in range(len(self)):

240 column_percentage = {}

241 for seq in self.body:

242 if self.body[seq][position] is not None:

243 aa = self.body[seq][position].code

244 else:

245 aa = None

246 if aa in allpy.data.amino_acids:

247 if aa in column_percentage.keys():

248 column_percentage[aa] += contribution

249 else:

250 column_percentage[aa] = contribution

251 all_columns.append(column_percentage)

252 # Second, map these percentages onto the alignment

253 self.identity_percentages = {}

254 for seq in self.sequences:

255 self.identity_percentages[seq] = []

256 for seq in self.identity_percentages:

257 line = self.identity_percentages[seq]

258 for position in range(len(self)):

259 if self.body[seq][position] is not None:

260 aa = self.body[seq][position].code

261 else:

262 aa = None

263 line.append(all_columns[position].get(aa))

264 return self.identity_percentages

265

266 @staticmethod

267 def from_sequences(*sequences):

268 """ Constructs new alignment from sequences

269

270 Add None's to right end to make equal lengthes of alignment sequences

271 """

272 alignment = Alignment()

273 alignment.sequences = sequences

274 max_length = max(len(sequence) for sequence in sequences)

275 for sequence in sequences:

276 gaps_count = max_length - len(sequence)

277 alignment.body[sequence] = sequence.monomers + [None] * gaps_count

278 return alignment

279

280 def save_fasta(self, out_file, long_line=70, gap='-'):

281 """ Saves alignment to given file

282

283 Splits long lines to substrings of length=long_line

284 To prevent this, set long_line=None

285 """

286 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)

287

288 def muscle_align(self):

289 """ Simple align ths alignment using sequences (muscle)

290

291 uses old Monomers and Sequences objects

292 """

293 tmp_file = NamedTemporaryFile(delete=False)

294 self.save_fasta(tmp_file)

295 tmp_file.close()

296 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})

297 sequences, body = Alignment.from_fasta(open(tmp_file.name))

298 for sequence in self.sequences:

299 try:

300 new_sequence = [i for i in sequences if sequence==i][0]

301 except:

302 raise Exception("Align: Cann't find sequence %s in muscle output" % \

303 sequence.name)

304 old_monomers = iter(sequence.monomers)

305 self.body[sequence] = []

306 for monomer in body[new_sequence]:

307 if not monomer:

308 self.body[sequence].append(monomer)

309 else:

310 old_monomer = old_monomers.next()

311 if monomer != old_monomer:

312 raise Exception("Align: alignment errors")

313 self.body[sequence].append(old_monomer)

314 os.unlink(tmp_file.name)

315

316 def column(self, sequence=None, sequences=None, original=None):

317 """ returns list of columns of alignment

318

319 sequence or sequences:

320 * if sequence is given, then column is (original_monomer, monomer)

321 * if sequences is given, then column is (original_monomer, {sequence: monomer})

322 * if both of them are given, it is an error

323

324 original (Sequence type):

325 * if given, this filters only columns represented by original sequence

326 """

327 if sequence and sequences:

328 raise Exception("Wrong usage. read help")

329 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

330 alignment = self.body.items()

331 alignment.sort(key=lambda i: indexes[i[0]])

332 alignment = [monomers for seq, monomers in alignment]

333 for column in zip(*alignment):

334 if not original or column[indexes[original]]:

335 if sequence:

336 yield (column[indexes[original]], column[indexes[sequence]])

337 else:

338 yield (column[indexes[original]],

339 dict([(s, column[indexes[s]]) for s in sequences]))

340

341 def secstr(self, sequence, pdb_chain, gap='-'):

342 """ Returns string representing secondary structure """

343 return ''.join([

344 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)

345 for m in self.body[sequence]])

346

347 class Block(object):

348 """ Block of alignment

349

350 Mandatory data:

351

352 * self.alignment -- alignment object, which the block belongs to

353 * self.sequences - set of sequence objects that contain monomers

354 and/or gaps, that constitute the block

355 * self.positions -- list of positions of the alignment.body that

356 are included in the block; position[i+1] is always to the right from position[i]

357

358 Don't change self.sequences -- it may be a link to other block.sequences

359

360 How to create a new block:

361

362 >>> import alignment

363 >>> import block

364 >>> proj = alignment.Alignment(open("test.fasta"))

365 >>> block1 = block.Block(proj)

366 """

367

368 def __init__(self, alignment, sequences=None, positions=None):

369 """ Builds new block from alignment

370

371 if sequences==None, all sequences are used

372 if positions==None, all positions are used

373 """

374 if sequences == None:

375 sequences = set(alignment.sequences) # copy

376 if positions == None:

377 positions = range(len(alignment))

378 self.alignment = alignment

379 self.sequences = sequences

380 self.positions = positions

381

382 def save_fasta(self, out_file, long_line=70, gap='-'):

383 """ Saves alignment to given file in fasta-format

384

385 No changes in the names, descriptions or order of the sequences

386 are made.

387 """

388 for sequence in self.sequences:

389 alignment_monomers = self.alignment.body[sequence]

390 block_monomers = [alignment_monomers[i] for i in self.positions]

391 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])

392 save_fasta(out_file, string, sequence.name, sequence.description, long_line)

393

394 def geometrical_cores(self, max_delta=config.delta,

395 timeout=config.timeout, minsize=config.minsize,

396 ac_new_atoms=config.ac_new_atoms,

397 ac_count=config.ac_count):

398 """ Returns length-sorted list of blocks, representing GCs

399

400 * max_delta -- threshold of distance spreading

401 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)

402 * minsize -- min size of each core

403 * ac_new_atoms -- min part or new atoms in new alternative core

404 current GC is compared with each of already selected GCs if

405 difference is less then ac_new_atoms, current GC is skipped

406 difference = part of new atoms in current core

407 * ac_count -- max number of cores (including main core)

408 -1 means infinity

409

410 If more than one pdb chain for some sequence provided, consider all of them

411 cost is calculated as 1 / (delta + 1)

412

413 delta in [0, +inf) => cost in (0, 1]

414 """

415 nodes = self.positions

416 lines = {}

417 for i in self.positions:

418 for j in self.positions:

419 if i < j:

420 distances = []

421 for sequence in self.sequences:

422 for chain in sequence.pdb_chains:

423 m1 = self.alignment.body[sequence][i]

424 m2 = self.alignment.body[sequence][j]

425 if m1 and m2:

426 r1 = sequence.pdb_residues[chain][m1]

427 r2 = sequence.pdb_residues[chain][m2]

428 ca1 = r1['CA']

429 ca2 = r2['CA']

430 d = ca1 - ca2 # Bio.PDB feature

431 distances.append(d)

432 if len(distances) >= 2:

433 delta = max(distances) - min(distances)

434 if delta <= max_delta:

435 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)

436 graph = Graph(nodes, lines)

437 cliques = graph.cliques(timeout=timeout, minsize=minsize)

438 GCs = []

439 for clique in cliques:

440 for GC in GCs:

441 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):

442 break

443 else:

444 GCs.append(Block(self.alignment, self.sequences, clique))

445 if ac_count != -1 and len(GCs) >= ac_count:

446 break

447 return GCs

448

449 def xstring(self, x='X', gap='-'):

450 """ Returns string consisting of gap chars and chars x at self.positions

451

452 Length of returning string = length of alignment

453 """

454 monomers = [False] * len(self.alignment)

455 for i in self.positions:

456 monomers[i] = True

457 return ''.join([x if m else gap for m in monomers])

458

459 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):

460 """ Save xstring and name in fasta format """

461 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)

462

463 def monomers(self, sequence):

464 """ Iterates monomers of this sequence from this block """

465 alignment_sequence = self.alignment.body[sequence]

466 return (alignment_sequence[i] for i in self.positions)

467

468 def ca_atoms(self, sequence, pdb_chain):

469 """ Iterates Ca-atom of monomers of this sequence from this block """

470 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())

471

472 def sequences_chains(self):

473 """ Iterates pairs (sequence, chain) """

474 for sequence in self.alignment.sequences:

475 if sequence in self.sequences:

476 for chain in sequence.pdb_chains:

477 yield (sequence, chain)

478

479 def superimpose(self):

480 """ Superimpose all pdb_chains in this block """

481 sequences_chains = list(self.sequences_chains())

482 if len(sequences_chains) >= 1:

483 sup = Superimposer()

484 fixed_sequence, fixed_chain = sequences_chains.pop()

485 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)

486 for sequence, chain in sequences_chains:

487 moving_atoms = self.ca_atoms(sequence, chain)

488 sup.set_atoms(fixed_atoms, moving_atoms)

489 # Apply rotation/translation to the moving atoms

490 sup.apply(moving_atoms)

491

492 def pdb_save(self, out_file):

493 """ Save all sequences

494

495 Returns {(sequence, chain): CHAIN}

496 CHAIN is chain letter in new file

497 """

498 tmp_file = NamedTemporaryFile(delete=False)

499 tmp_file.close()

500

501 for sequence, chain in self.sequences_chains():

502 sequence.pdb_save(tmp_file.name, chain)

503 # TODO: read from tmp_file.name

504 # change CHAIN

505 # add to out_file

506

507 os.unlink(NamedTemporaryFile)

508

509 # vim: set ts=4 sts=4 sw=4 et: