allpy: 0d6c1227ede8 allpy/base.py

allpy

view allpy/base.py @ 794:0d6c1227ede8

Automated merge with ssh://kodomo/allpy

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Wed, 13 Jul 2011 11:22:59 +0400
parents	fbf79117dd18
children	91e73fb1ac79

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

11 default_gaps = set((".", "-", "~"))

12 """Set of characters to recoginze as gaps when parsing alignment."""

14 class Monomer(object):

15 """Monomer object."""

17 type = None

18 """Either of 'dna', 'rna', 'protein'."""

20 types = base

21 """Mapping of related types. SHOULD be redefined in subclasses."""

23 by_code1 = {}

24 """A mapping from 1-letter code to Monomer subclass."""

26 by_code3 = {}

27 """A mapping from 3-letter code to Monomer subclass."""

29 by_name = {}

30 """A mapping from full monomer name to Monomer subclass."""

32 @classmethod

33 def _subclass(cls, name='', code1='', code3='', is_modified=False):

34 """Create new subclass of Monomer for given monomer type."""

35 class TheMonomer(cls):

36 pass

37 name = name.strip().capitalize()

38 code1 = code1.upper()

39 code3 = code3.upper()

40 module = vars(data.monomers)[cls.type]

41 TheMonomer.__name__ = re.sub(r"\W", "_", name)

42 TheMonomer.__module__ = module.__name__

43 TheMonomer.name = name

44 TheMonomer.code1 = code1

45 TheMonomer.code3 = code3

46 TheMonomer.is_modified = is_modified

47 # Save the class in data.monomers so that it can be pickled

48 # Some names are not unique, we append underscores to them

49 # in order to fix it.

50 while TheMonomer.__name__ in vars(module):

51 TheMonomer.__name__ += "_"

52 vars(module)[TheMonomer.__name__] = TheMonomer

53 if not is_modified:

54 cls.by_code1[code1] = TheMonomer

55 if code3 not in cls.by_code3 or not is_modified:

56 cls.by_code3[code3] = TheMonomer

57 cls.by_name[name] = TheMonomer

58 # We duplicate distinguished long names into Monomer itself, so that we

59 # can use Monomer.from_code3 to create the relevant type of monomer.

60 if code3 not in Monomer.by_code3 or not is_modified:

61 Monomer.by_code3[code3] = TheMonomer

62 Monomer.by_name[name] = TheMonomer

64 @classmethod

65 def _initialize(cls, codes=None):

66 """Create all relevant subclasses of Monomer."""

67 for code1, is_modified, code3, name in codes:

68 cls._subclass(name, code1, code3, is_modified)

70 @classmethod

71 def from_code1(cls, code1):

72 """Create new monomer from 1-letter code."""

73 monomer = cls.by_code1[code1.upper()]()

74 monomer.input_code1 = code1

75 return monomer

77 @classmethod

78 def from_code3(cls, code3):

79 """Create new monomer from 3-letter code."""

80 return cls.by_code3[code3.upper()]()

82 @classmethod

83 def from_name(cls, name):

84 """Create new monomer from full name."""

85 return cls.by_name[name.strip().capitalize()]()

87 def __repr__(self):

88 return "<Monomer %s>" % str(self.code1)

90 def __str__(self):

91 """Returns one-letter code"""

92 return self.code1

94 def __eq__(self, other):

95 """Monomers within same monomer type are compared by code1."""

96 if not other:

97 return False

98 assert self.type == other.type

99 return self.code1 == other.code1

100

101 def __ne__(self, other):

102 return not (self == other)

103

104 class Sequence(list):

105 """Sequence of Monomers.

106

107 This behaves like list of monomer objects. In addition to standard list

108 behaviour, Sequence has the following attributes:

109

110 * name -- str with the name of the sequence

111 * description -- str with description of the sequence

112 * source -- str denoting source of the sequence

113

114 Any of them may be empty (i.e. hold empty string)

115 """

116

117 types = base

118 """Mapping of related types. SHOULD be redefined in subclasses."""

119

120 name = ''

121 description = ''

122 source = ''

123

124 def __init__(self, *args):

125 self.markups = {}

126 list.__init__(self, *args)

127

128 @classmethod

129 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

130 """Create sequence from a list of monomer objecst."""

131 result = cls(monomers)

132 if name:

133 result.name = name

134 if description:

135 result.description = description

136 if source:

137 result.source = source

138 return result

139

140 @classmethod

141 def from_string(cls, string, name='', description='', source=''):

142 """Create sequences from string of one-letter codes."""

143 monomer = cls.types.Monomer.from_code1

144 monomers = [monomer(letter) for letter in string]

145 return cls.from_monomers(monomers, name, description, source)

146

147 def __repr__(self):

148 if self.name:

149 return '<Sequence %s>' % str(self.name)

150 else:

151 return '<Sequence %s>' % str(self)

152

153 def __str__(self):

154 """Returns sequence of one-letter codes."""

155 return ''.join(monomer.code1 for monomer in self)

156

157 def __hash__(self):

158 """Hash sequence by identity."""

159 return id(self)

160

161 class Alignment(object):

162 """Alignment. It is a list of Columns."""

163

164 types = base

165 """Mapping of related types. SHOULD be redefined in subclasses."""

166

167 sequences = None

168 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

169

170 def __init__(self):

171 """Initialize empty alignment."""

172 self.sequences = []

173 self.columns = []

174 self.markups = {}

175

176 # Alignment grow & IO methods

177 # ==============================

178

179 def append_sequence(self, sequence):

180 """Add sequence to alignment. Return self.

181

182 If sequence is too short, pad it with gaps on the right.

183 """

184 self.sequences.append(sequence)

185 self._pad_to_width(len(sequence))

186 for column, monomer in zip(self.columns, sequence):

187 column[sequence] = monomer

188 return self

189

190 def append_row_from_string(self, string,

191 name='', description='', source='', gaps=default_gaps):

192 """Add row from a string of one-letter codes and gaps. Return self."""

193 Sequence = self.types.Sequence

194 without_gaps = util.remove_each(string, gaps)

195 sequence = Sequence.from_string(without_gaps, name, description, source)

196 self._pad_to_width(len(string))

197 non_gap_columns = [column

198 for column, char in zip(self.columns, string)

199 if char not in gaps

200 ]

201 for monomer, column in zip(sequence, non_gap_columns):

202 column[sequence] = monomer

203 self.sequences.append(sequence)

204 return self

205

206 def append_row_with_gaps(self, row, sequence):

207 """Add row from row_as_list representation and sequence. Return self."""

208 self.sequences.append(sequence)

209 self._pad_to_width(len(row))

210 for column, monomer in zip(self.columns, row):

211 if monomer:

212 column[sequence] = monomer

213 return self

214

215 def _pad_to_width(self, n):

216 """Pad alignment with empty columns on the right to width n."""

217 for i in range(len(self.columns), n):

218 self.columns.append(Column())

219

220 def append_file(self, file, format='fasta', gaps=default_gaps):

221 """Append sequences from file to alignment. Return self.

222

223 If sequences in file have gaps (detected as characters belonging to

224 `gaps` set), treat them accordingly.

225 """

226 fileio.File(file, format, gaps=gaps).read_alignment(self)

227 return self

228

229 def to_file(self, file, format='fasta', gap='-'):

230 """Write alignment in FASTA file as sequences with gaps."""

231 fileio.File(file, format, gaps=gap).write_alignment(self)

232 return self

233

234 # Data access methods for alignment

235 # =================================

236

237 def rows(self):

238 """Return list of rows (temporary objects) in alignment.

239

240 Each row is a dictionary of { column : monomer }.

241

242 For gap positions there is no key for the column in row.

243

244 Each row has attribute `sequence` pointing to the sequence the row is

245 describing.

246

247 Modifications of row have no effect on the alignment.

248 """

249 # For now, the function returns a list rather than iterator.

250 # It is yet to see, whether memory performance here becomes critical,

251 # or is random access useful.

252 rows = []

253 for sequence in self.sequences:

254 row = util.UserDict()

255 row.sequence = sequence

256 for column in self.columns:

257 if sequence in column:

258 row[column] = column[sequence]

259 rows.append(row)

260 return rows

261

262 def rows_as_lists(self):

263 """Return list of rows (temporary objects) in alignment.

264

265 Each row here is a list of either monomer or None (for gaps).

266

267 Each row has attribute `sequence` pointing to the sequence of row.

268

269 Modifications of row have no effect on the alignment.

270 """

271 rows = []

272 for sequence in self.sequences:

273 row = util.UserList()

274 row.sequence = sequence

275 for column in self.columns:

276 row.append(column.get(sequence))

277 rows.append(row)

278 return rows

279

280 def rows_as_strings(self, gap='-'):

281 """Return list of string representation of rows in alignment.

282

283 Each row has attribute `sequence` pointing to the sequence of row.

284

285 `gap` is the symbol to use for gap.

286 """

287 rows = []

288 for sequence in self.sequences:

289 string = ""

290 for column in self.columns:

291 if sequence in column:

292 string += column[sequence].code1

293 else:

294 string += gap

295 string = util.UserString(string)

296 string.sequence = sequence

297 rows.append(string)

298 return rows

299

300 def row_as_list(self, sequence):

301 """Return representaion of row as list with `Monomers` and `None`s."""

302 return [column.get(sequence) for column in self.columns]

303

304 def row_as_string(self, sequence, gap='-'):

305 """Return string representaion of row in alignment.

306

307 String will have gaps represented by `gap` symbol (defaults to '-').

308 """

309 def char(monomer):

310 if monomer:

311 return monomer.code1

312 return gap

313 row = self.row_as_list(sequence)

314 return "".join(map(char, row))

315

316 def columns_as_lists(self):

317 """Return list of columns (temorary objects) in alignment.

318

319 Each column here is a list of either monomer or None (for gaps).

320

321 Items of column are sorted in the same way as alignment.sequences.

322

323 Modifications of column have no effect on the alignment.

324 """

325 columns = []

326 for column in self.columns:

327 col = util.UserList()

328 col.column = column

329 for sequence in self.sequences:

330 col.append(column.get(sequence))

331 columns.append(col)

332 return columns

333

334 # Alignment / Block editing methods

335 # =================================

336

337 def flush(self, whence='left'):

338 """Remove all gaps from alignment and flush results to one side.

339

340 `whence` must be one of 'left', 'right' or 'center'

341 """

342 if whence == 'left':

343 from processors import Left as Flush

344 elif whence == 'right':

345 from processors import Right as Flush

346 elif whence == 'center':

347 from processors import Center as Flush

348 else:

349 raise AssertionError, "Whence must be left, right or center"

350 self.realign(Flush())

351

352 def remove_gap_columns(self):

353 """Remove all empty columns."""

354 for n, column in reversed(list(enumerate(self.columns))):

355 if column == {}:

356 self.columns[n:n+1] = []

357

358 def _wipe_row(self, sequence):

359 """Turn all row positions into gaps (but keep sequences intact)."""

360 for column in self.columns:

361 if sequence in column:

362 del column[sequence]

363

364 def _merge(self, dst, new, merge):

365 """Replace contents of `dst` with those of `new`.

366

367 Replace contents of elements using function `merge(dst_el, new_le)`.

368 """

369 for el, new_el in zip(dst, new):

370 merge(el, new_el)

371 dst[len(dst):] = new[len(dst):]

372 del dst[len(new):]

373

374 def _replace_sequence_contents(self, new, copy_descriptions):

375 """Replace contents of sequences with those of `new` alignment."""

376 # XXX: we manually copy sequence contents here

377 # XXX: we only copy, overlapping parts and link to the rest

378 def merge_monomers(dst, new):

379 dst.__class__ = new.__class__

380 def merge_sequences(dst, new):

381 if copy_descriptions:

382 vars(dst).update(vars(new))

383 self._merge(dst, new, merge_monomers)

384 self._merge(self.sequences, new.sequences, merge_sequences)

385

386 def _replace_column_contents(self, new):

387 """Replace column contents with those of `new` alignment.

388

389 In other words: copy gap patterns from `new` to `self`.

390

391 `self.sequences` and `new.sequences` should have the same contents.

392 """

393 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

394 sequence = row.sequence

395 monomers = filter(None, row)

396 assert len(monomers) == len(filter(None, new_row))

397 self._wipe_row(sequence)

398 non_gap_columns = [column

399 for column, monomer in zip(self.columns, new_row)

400 if monomer

401 ]

402 for monomer, column in zip(monomers, non_gap_columns):

403 column[sequence] = monomer

404

405 def _replace_contents(self, new, copy_descriptions, copy_contents):

406 """Replace alignment contents with those of other alignment."""

407 if copy_contents:

408 self._replace_sequence_contents(new, copy_descriptions)

409 self._replace_column_contents(new)

410

411 def process(self, function, copy_descriptions=True, copy_contents=True):

412 """Apply function to the alignment (or block); inject results back.

413

414 - `function(block)` must return block with same line order.

415 - if `copy_descriptions` is False, ignore new sequence names.

416 - if `copy_contents` is False, don't copy sequence contents too.

417

418 `function` (object) may have attributes `copy_descriptions` and

419 `copy_contents`, which override the same named arguments.

420 """

421 new = function(self)

422 if hasattr(function, 'copy_descriptions'):

423 copy_descriptions = function.copy_descriptions

424 if hasattr(function, 'copy_contents'):

425 copy_contents = function.copy_contents

426 self._replace_contents(new, copy_descriptions, copy_contents)

427

428 def realign(self, function):

429 """Realign self.

430

431 I.e.: apply function to self to produce a new alignment, then update

432 self to have the same gap patterns as the new alignment.

433

434 This is the same as process(function, False, False)

435 """

436 new = function(self)

437 self._replace_column_contents(new)

438

439 class Column(dict):

440 """Column of alignment.

441

442 Column is a dict of { sequence : monomer }.

443

444 For sequences that have gaps in current row, given key is not present in

445 the column.

446 """

447

448 types = base

449 """Mapping of related types. SHOULD be redefined in subclasses."""

450

451 def __hash__(self):

452 """Return hash by identity."""

453 return id(self)

454

455 class Block(Alignment):

456 """Block of alignment.

457

458 Block is an intersection of several rows & columns. (The collections of

459 rows and columns are represented as ordered lists, to retain display order

460 of Alignment or add ability to tweak it). Most of blocks look like

461 rectangular part of alignment if you shuffle alignment rows the right way.

462 """

463

464 alignment = None

465 """Alignment the block belongs to."""

466

467 sequences = ()

468 """List of sequences in block."""

469

470 columns = ()

471 """List of columns in block."""

472

473 @classmethod

474 def from_alignment(cls, alignment, sequences=None, columns=None):

475 """Build new block from alignment.

476

477 If sequences are not given, the block uses all sequences in alignment.

478

479 If columns are not given, the block uses all columns in alignment.

480

481 In both cases we use exactly the list used in alignment, thus, if new

482 sequences or columns are added to alignment, the block tracks this too.

483 """

484 if sequences is None:

485 sequences = alignment.sequences

486 if columns is None:

487 columns = alignment.columns

488 block = cls()

489 block.alignment = alignment

490 block.sequences = sequences

491 block.columns = columns

492 return block

493

494 class Markup(object):

495 """Base class for sequence and alignment markups.

496

497 We shall call either sequence or alignment a container. And we shall call

498 either monomers or columns elements respectively.

499

500 Markup behaves like a dictionary of [element] -> value.

501

502 Every container has a dictionary of [name] -> markup. It is Markup's

503 responsibility to add itself to this dictionary and to avoid collisions

504 while doing it.

505 """

506

507 name = None

508 """Name of markup elements"""

509

510 def _register(self, container, name):

511 """Register self within container.

512

513 Assure the name is not taken before. If name is not given, look in the

514 class. Make sure we have some name at all.

515 """

516 if name:

517 self.name = name

518 assert self.name is not None

519 assert self.name not in container.markups

520 container.markups[self.name] = self

521

522 def refresh(self):

523 """Recalculate markup values (if they are generated automatically)."""

524 pass

525

526 @classmethod

527 def from_record(cls, container, record, name=None):

528 """Restore markup from `record`. (Used for loading from file).

529

530 `record` is a dict of all metadata and data related to one markup. All

531 keys and values in `record` are strings, markup must parse them itself.

532

533 Markup values should be stored in `record['markup']`, which is a list

534 of items separated with either `record['separator']` or a comma.

535 """

536 return cls(container, name)

537

538 def to_record(self):

539 """Save markup to `record`, for saving to file.

540

541 For description of `record` see docstring for `from_record` method.

542 """

543 return {}

544

545 def sorted_keys(self):

546 """Return list of elements in the container in proper order."""

547 raise NotImplementedError()

548

549 def sorted_values(self):

550 """Return list of markup values in container."""

551 raise NotImplementedError()

552

553 class SequenceMarkup(Markup):

554 """Markup for sequence.

555

556 Behaves like a dictionary of [monomer] -> value. Value may be anything

557 or something specific, depending on subclass.

558

559 Actual values are stored in monomers themselves as attributes.

560 """

561

562 def __init__(self, sequence, name=None):

563 self.sequence = sequence

564 self._register(sequence, name)

565 self.refresh()

566

567 def sorted_keys(self):

568 """Return list of monomers."""

569 return self.sequence

570

571 def sorted_values(self):

572 """Return list of markup values, if every monomer is marked up."""

573 return (self[monomer] for monomer in self.sequence)

574

575 def get(self, key, value=None):

576 """Part of Mapping collection interface."""

577 if key not in self:

578 return value

579 return self[key]

580

581 def __contains__(self, monomer):

582 """Part of Mapping collection interface."""

583 return hasattr(monomer, self.name)

584

585 def __getitem__(self, monomer):

586 """Part of Mapping collection interface."""

587 return getattr(monomer, self.name)

588

589 def __setitem__(self, monomer, value):

590 """Part of Mapping collection interface."""

591 return setattr(monomer, self.name, value)

592

593 class AlignmentMarkup(dict, Markup):

594 """Markupf for alignment.

595

596 Is a dictionary of [column] -> value. Value may be anything or something

597 specific, depending on subclass.

598 """

599

600 def __init__(self, alignment, name=None):

601 self.alignment = alignment

602 self._register(alignment, name)

603 self.refresh()

604

605 def sorted_keys(self):

606 """Return a list of columns."""

607 return self.alignment.columns

608

609 def sorted_values(self):

610 """Return a list of makrup values, if every column is marked up."""

611 return (self[column] for column in self.alignment.columns)

612

613 # vim: set ts=4 sts=4 sw=4 et: