allpy: bee4d155f526 allpy/base.py

allpy

view allpy/base.py @ 284:bee4d155f526

Added base.Sequence.from_fasta

author	Daniil Alexeyevsky <me.dendik@gmail.com>
date	Wed, 15 Dec 2010 23:44:20 +0300
parents	59320c160dae
children	cf6cdc3b7ec5

line source

1 import sys

2 import os

3 import os.path

4 from tempfile import NamedTemporaryFile

5 import urllib2

7 import config

8 import fasta

9 from graph import Graph

10 from Bio.PDB.DSSP import make_dssp_dict

11 import data.codes

13 class MonomerType(object):

14 """Class of monomer types.

16 Each MonomerType object represents a known monomer type, e.g. Valine,

17 and is referenced to by each instance of monomer in a given sequence.

19 - `name`: full name of monomer type

20 - `code1`: one-letter code

21 - `code3`: three-letter code

22 - `is_modified`: either of True or False

24 class atributes:

26 - `by_code1`: a mapping from one-letter code to MonomerType object

27 - `by_code3`: a mapping from three-letter code to MonomerType object

28 - `by_name`: a mapping from monomer name to MonomerType object

29 - `instance_type`: class of Monomer objects to use when creating new

30 objects; this must be redefined in descendent classes

32 All of the class attributes MUST be redefined when subclassing.

33 """

35 by_code1 = {}

36 by_code3 = {}

37 by_name = {}

38 instance_type = None

40 def __init__(self, name="", code1="", code3="", is_modified=False):

41 self.name = name.capitalize()

42 self.code1 = code1.upper()

43 self.code3 = code3.upper()

44 self.is_modified = bool(is_modified)

45 if not is_modified:

46 self.by_code1[self.code1] = self

47 self.by_code3[code3] = self

48 self.by_name[name] = self

49 # We duplicate distinguished long names into MonomerType itself,

50 # so that we can use MonomerType.from_code3 to create the relevant

51 # type of monomer.

52 MonomerType.by_code3[code3] = self

53 MonomerType.by_name[name] = self

55 @classmethod

56 def _initialize(cls, type_letter, codes=data.codes.codes):

57 """Create all relevant instances of MonomerType.

59 `type_letter` is either of:

61 - 'p' for protein

62 - 'd' for DNA

63 - 'r' for RNA

65 `codes` is a table of monomer codes

66 """

67 for type, code1, is_modified, code3, name in codes:

68 if type == type_letter:

69 cls(name, code1, code3, is_modified)

71 @classmethod

72 def from_code1(cls, code1):

73 """Return monomer type by one-letter code."""

74 return cls.by_code1[code1.upper()]

76 @classmethod

77 def from_code3(cls, code3):

78 """Return monomer type by three-letter code."""

79 return cls.by_code3[code3.upper()]

81 @classmethod

82 def from_name(cls, name):

83 """Return monomer type by name."""

84 return cls.by_name[name.capitalize()]

86 def instance(self):

87 """Create a new monomer of given type."""

88 return self.instance_type(self)

90 def __eq__(self, other):

91 if hasattr(other, "type"):

92 return self is other.type

93 return self is other

95 class Monomer(object):

96 """Monomer object.

98 attributes:

100 - `type`: type of monomer (a MonomerType object)

101

102 class attributes:

103

104 - `monomer_type`: either MonomerType or one of it's subclasses, it is used

105 when creating new monomers. It SHOULD be redefined when subclassing

106 Monomer.

107 """

108 monomer_type = MonomerType

109

110 def __init__(self, type):

111 self.type = type

112

113 @classmethod

114 def from_code1(cls, code1):

115 return cls(cls.monomer_type.by_code1[code1.upper()])

116

117 @classmethod

118 def from_code3(cls, code3):

119 return cls(cls.monomer_type.by_code3[code3.upper()])

120

121 @classmethod

122 def from_name(cls, name):

123 return cls(cls.monomer_type.by_name[name.capitalize()])

124

125 def __eq__(self, other):

126 if hasattr(other, "type"):

127 return self.type is other.type

128 return self.type is other

129

130 class Sequence(list):

131 """Sequence of Monomers.

132

133 This behaves like list of monomer objects. In addition to standard list

134 behaviour, Sequence has the following attributes:

135

136 * name -- str with the name of the sequence

137 * description -- str with description of the sequence

138 * source -- str denoting source of the sequence

139

140 Any of them may be empty (i.e. hold empty string)

141

142 Class attributes:

143

144 * monomer_type -- type of monomers in sequence, must be redefined when

145 subclassing

146 """

147

148 monomer_type = Monomer

149

150 name = ''

151 description = ''

152 source = ''

153

154 def __init__(self, sequence=[], name=None, description=None, source=None):

155 super(Sequence, self).__init__(sequence)

156 if hasattr(sequence, 'name'):

157 vars(self).update(vars(sequence))

158 if name:

159 self.name = name

160 if description:

161 self.description = description

162 if source:

163 self.source = source

164

165 def __str__(self):

166 """Returns sequence in one-letter code."""

167 return ''.join(monomer.code1 for monomer in self)

168

169 @classmethod

170 def from_string(cls, string, name='', description=''):

171 """Create sequences from string of one-letter codes."""

172 monomer = cls.monomer_type.from_code1

173 monomers = [monomer(letter) for letter in string]

174 return cls(monomers, name, description)

175

176 @classmethod

177 def from_fasta(cls, file):

178 """Read sequence from FASTA file.

179

180 File must contain exactly one sequence.

181 """

182 sequences = fasta.parse_file(file)

183 assert len(sequences) == 1

184 header = sequences.keys()[0]

185 name, _, description = header.partition(" ")

186 return cls(sequences[header], name, description, file.name)

187

188 class Alignment(dict):

189 """Alignment.

190

191 Behaves like a list of Columns.

192 """

193 # _sequences -- list of Sequence objects. Sequences don't contain gaps

194 # - see sequence.py module

195

196 def __init__(self, *args):

197 """overloaded constructor

198

199 Alignment()

200 new empty Alignment

201

202 Alignment(sequences, body)

203 new Alignment with sequences and body initialized from arguments

204

205 Alignment(fasta_file)

206 new Alignment, read body and sequences from fasta file

207 """

208 if len(args)>1:#overloaded constructor

209 self.sequences=args[0]

210 self.body=args[1]

211 elif len(args)==0:

212 self.sequences=[]

213 self.body={}

214 else:

215 self.sequences, self.body = Alignment.from_fasta(args[0])

216

217 def length(self):

218 """ Returns width, ie length of each sequence with gaps """

219 return max([len(line) for line in self.body.values()])

220

221 def height(self):

222 """ The number of sequences in alignment (it's thickness). """

223 return len(self.body)

224

225 def identity(self):

226 """ Calculate the identity of alignment positions for colouring.

227

228 For every (row, column) in alignment the percentage of the exactly

229 same residue in the same column in the alignment is calculated.

230 The data structure is just like the Alignment.body, but istead of

231 monomers it contains float percentages.

232 """

233 # Oh, God, that's awful! Absolutely not understandable.

234 # First, calculate percentages of amino acids in every column

235 contribution = 1.0 / len(self.sequences)

236 all_columns = []

237 for position in range(len(self)):

238 column_percentage = {}

239 for seq in self.body:

240 if self.body[seq][position] is not None:

241 aa = self.body[seq][position].code

242 else:

243 aa = None

244 if aa in allpy.data.amino_acids:

245 if aa in column_percentage.keys():

246 column_percentage[aa] += contribution

247 else:

248 column_percentage[aa] = contribution

249 all_columns.append(column_percentage)

250 # Second, map these percentages onto the alignment

251 self.identity_percentages = {}

252 for seq in self.sequences:

253 self.identity_percentages[seq] = []

254 for seq in self.identity_percentages:

255 line = self.identity_percentages[seq]

256 for position in range(len(self)):

257 if self.body[seq][position] is not None:

258 aa = self.body[seq][position].code

259 else:

260 aa = None

261 line.append(all_columns[position].get(aa))

262 return self.identity_percentages

263

264 @classmethod

265 def from_fasta(file):

266 """ Import data from fasta file

267

268 >>> import alignment

269 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))

270 """

271 import re

272

273 sequences = []

274 body = {}

275

276 raw_sequences = file.read().split(">")

277 if len(raw_sequences) <= 1:

278 raise Exception("Wrong format of fasta-file %s" % file.name)

279

280 raw_sequences = raw_sequences[1:] #ignore everything before the first >

281 for raw in raw_sequences:

282 parsed_raw_sequence = raw.split("\n")

283 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]

284 name_and_description = parsed_raw_sequence[0]

285 name_and_description = name_and_description.split(" ",1)

286 if len(name_and_description) == 2:

287 name, description = name_and_description

288 elif len(name_and_description) == 1:

289 #if there is description

290 name = name_and_description[0]

291 description = ''

292 else:

293 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \

294 {'name': name, 'file': file.name})

295

296 if len(parsed_raw_sequence) <= 1:

297 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \

298 {'name': name, 'file': file.name})

299 string = ""

300 for piece in parsed_raw_sequence[1:]:

301 piece_without_whitespace_chars = re.sub("\s", "", piece)

302 string += piece_without_whitespace_chars

303 monomers = [] #convert into Monomer objects

304 body_list = [] #create the respective list in body dict

305 for current_monomer in string:

306 if current_monomer not in ["-", ".", "~"]:

307 monomers.append(cls.monomer_type.from_code1(current_monomer))

308 body_list.append(monomers[-1])

309 else:

310 body_list.append(None)

311 s = sequence.Sequence(monomers, name, description)

312 sequences.append(s)

313 body[s] = body_list

314 return sequences, body

315

316 @staticmethod

317 def from_sequences(*sequences):

318 """ Constructs new alignment from sequences

319

320 Add None's to right end to make equal lengthes of alignment sequences

321 """

322 alignment = Alignment()

323 alignment.sequences = sequences

324 max_length = max(len(sequence) for sequence in sequences)

325 for sequence in sequences:

326 gaps_count = max_length - len(sequence)

327 alignment.body[sequence] = sequence.monomers + [None] * gaps_count

328 return alignment

329

330 def save_fasta(self, out_file, long_line=70, gap='-'):

331 """ Saves alignment to given file

332

333 Splits long lines to substrings of length=long_line

334 To prevent this, set long_line=None

335 """

336 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)

337

338 def muscle_align(self):

339 """ Simple align ths alignment using sequences (muscle)

340

341 uses old Monomers and Sequences objects

342 """

343 tmp_file = NamedTemporaryFile(delete=False)

344 self.save_fasta(tmp_file)

345 tmp_file.close()

346 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})

347 sequences, body = Alignment.from_fasta(open(tmp_file.name))

348 for sequence in self.sequences:

349 try:

350 new_sequence = [i for i in sequences if sequence==i][0]

351 except:

352 raise Exception("Align: Cann't find sequence %s in muscle output" % \

353 sequence.name)

354 old_monomers = iter(sequence.monomers)

355 self.body[sequence] = []

356 for monomer in body[new_sequence]:

357 if not monomer:

358 self.body[sequence].append(monomer)

359 else:

360 old_monomer = old_monomers.next()

361 if monomer != old_monomer:

362 raise Exception("Align: alignment errors")

363 self.body[sequence].append(old_monomer)

364 os.unlink(tmp_file.name)

365

366 def column(self, sequence=None, sequences=None, original=None):

367 """ returns list of columns of alignment

368

369 sequence or sequences:

370 * if sequence is given, then column is (original_monomer, monomer)

371 * if sequences is given, then column is (original_monomer, {sequence: monomer})

372 * if both of them are given, it is an error

373

374 original (Sequence type):

375 * if given, this filters only columns represented by original sequence

376 """

377 if sequence and sequences:

378 raise Exception("Wrong usage. read help")

379 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

380 alignment = self.body.items()

381 alignment.sort(key=lambda i: indexes[i[0]])

382 alignment = [monomers for seq, monomers in alignment]

383 for column in zip(*alignment):

384 if not original or column[indexes[original]]:

385 if sequence:

386 yield (column[indexes[original]], column[indexes[sequence]])

387 else:

388 yield (column[indexes[original]],

389 dict([(s, column[indexes[s]]) for s in sequences]))

390

391 def secstr(self, sequence, pdb_chain, gap='-'):

392 """ Returns string representing secondary structure """

393 return ''.join([

394 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)

395 for m in self.body[sequence]])

396

397 class Block(object):

398 """ Block of alignment

399

400 Mandatory data:

401

402 * self.alignment -- alignment object, which the block belongs to

403 * self.sequences - set of sequence objects that contain monomers

404 and/or gaps, that constitute the block

405 * self.positions -- list of positions of the alignment.body that

406 are included in the block; position[i+1] is always to the right from position[i]

407

408 Don't change self.sequences -- it may be a link to other block.sequences

409

410 How to create a new block:

411

412 >>> import alignment

413 >>> import block

414 >>> proj = alignment.Alignment(open("test.fasta"))

415 >>> block1 = block.Block(proj)

416 """

417

418 def __init__(self, alignment, sequences=None, positions=None):

419 """ Builds new block from alignment

420

421 if sequences==None, all sequences are used

422 if positions==None, all positions are used

423 """

424 if sequences == None:

425 sequences = set(alignment.sequences) # copy

426 if positions == None:

427 positions = range(len(alignment))

428 self.alignment = alignment

429 self.sequences = sequences

430 self.positions = positions

431

432 def save_fasta(self, out_file, long_line=70, gap='-'):

433 """ Saves alignment to given file in fasta-format

434

435 No changes in the names, descriptions or order of the sequences

436 are made.

437 """

438 for sequence in self.sequences:

439 alignment_monomers = self.alignment.body[sequence]

440 block_monomers = [alignment_monomers[i] for i in self.positions]

441 string = ''.join([m.type.code1 if m else '-' for m in block_monomers])

442 save_fasta(out_file, string, sequence.name, sequence.description, long_line)

443

444 def geometrical_cores(self, max_delta=config.delta,

445 timeout=config.timeout, minsize=config.minsize,

446 ac_new_atoms=config.ac_new_atoms,

447 ac_count=config.ac_count):

448 """ Returns length-sorted list of blocks, representing GCs

449

450 * max_delta -- threshold of distance spreading

451 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm)

452 * minsize -- min size of each core

453 * ac_new_atoms -- min part or new atoms in new alternative core

454 current GC is compared with each of already selected GCs if

455 difference is less then ac_new_atoms, current GC is skipped

456 difference = part of new atoms in current core

457 * ac_count -- max number of cores (including main core)

458 -1 means infinity

459

460 If more than one pdb chain for some sequence provided, consider all of them

461 cost is calculated as 1 / (delta + 1)

462

463 delta in [0, +inf) => cost in (0, 1]

464 """

465 nodes = self.positions

466 lines = {}

467 for i in self.positions:

468 for j in self.positions:

469 if i < j:

470 distances = []

471 for sequence in self.sequences:

472 for chain in sequence.pdb_chains:

473 m1 = self.alignment.body[sequence][i]

474 m2 = self.alignment.body[sequence][j]

475 if m1 and m2:

476 r1 = sequence.pdb_residues[chain][m1]

477 r2 = sequence.pdb_residues[chain][m2]

478 ca1 = r1['CA']

479 ca2 = r2['CA']

480 d = ca1 - ca2 # Bio.PDB feature

481 distances.append(d)

482 if len(distances) >= 2:

483 delta = max(distances) - min(distances)

484 if delta <= max_delta:

485 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta)

486 graph = Graph(nodes, lines)

487 cliques = graph.cliques(timeout=timeout, minsize=minsize)

488 GCs = []

489 for clique in cliques:

490 for GC in GCs:

491 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique):

492 break

493 else:

494 GCs.append(Block(self.alignment, self.sequences, clique))

495 if ac_count != -1 and len(GCs) >= ac_count:

496 break

497 return GCs

498

499 def xstring(self, x='X', gap='-'):

500 """ Returns string consisting of gap chars and chars x at self.positions

501

502 Length of returning string = length of alignment

503 """

504 monomers = [False] * len(self.alignment)

505 for i in self.positions:

506 monomers[i] = True

507 return ''.join([x if m else gap for m in monomers])

508

509 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70):

510 """ Save xstring and name in fasta format """

511 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line)

512

513 def monomers(self, sequence):

514 """ Iterates monomers of this sequence from this block """

515 alignment_sequence = self.alignment.body[sequence]

516 return (alignment_sequence[i] for i in self.positions)

517

518 def ca_atoms(self, sequence, pdb_chain):

519 """ Iterates Ca-atom of monomers of this sequence from this block """

520 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers())

521

522 def sequences_chains(self):

523 """ Iterates pairs (sequence, chain) """

524 for sequence in self.alignment.sequences:

525 if sequence in self.sequences:

526 for chain in sequence.pdb_chains:

527 yield (sequence, chain)

528

529 def superimpose(self):

530 """ Superimpose all pdb_chains in this block """

531 sequences_chains = list(self.sequences_chains())

532 if len(sequences_chains) >= 1:

533 sup = Superimposer()

534 fixed_sequence, fixed_chain = sequences_chains.pop()

535 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain)

536 for sequence, chain in sequences_chains:

537 moving_atoms = self.ca_atoms(sequence, chain)

538 sup.set_atoms(fixed_atoms, moving_atoms)

539 # Apply rotation/translation to the moving atoms

540 sup.apply(moving_atoms)

541

542 def pdb_save(self, out_file):

543 """ Save all sequences

544

545 Returns {(sequence, chain): CHAIN}

546 CHAIN is chain letter in new file

547 """

548 tmp_file = NamedTemporaryFile(delete=False)

549 tmp_file.close()

550

551 for sequence, chain in self.sequences_chains():

552 sequence.pdb_save(tmp_file.name, chain)

553 # TODO: read from tmp_file.name

554 # change CHAIN

555 # add to out_file

556

557 os.unlink(NamedTemporaryFile)

558

559 # vim: set ts=4 sts=4 sw=4 et: