allpy: 7ebba94eece2 allpy/base.py

allpy

view allpy/base.py @ 900:7ebba94eece2

MarkupIOMixin: added attribute quotes to allow quotation around markup serialized string; fixed implementation of SequenceCaseMarkup based on that; [closes #125]

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Tue, 11 Oct 2011 17:21:44 +0400
parents	d550aec04455
children	61f28f17f027 116f5bfc39b8

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

default_gaps = ("-", ".", "~")
"""Set of characters to recognize as gaps when parsing alignment."""

class Monomer(object):
    """Monomer object.

    Concrete monomers are subclasses generated by `_subclass`; they are
    looked up through the `by_code1`, `by_code3` and `by_name` registries.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of `cls` for the given monomer and register it.

        - `name` is stripped and capitalized before use
        - `code1` and `code3` are upper-cased before use
        - `is_modified` is stored on the new class and controls which
          registries the class is entered into

        The new class is also stored as an attribute of the module found
        under `cls.type` in `data.monomers`. Nothing is returned.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled.
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        # Modified monomers never take over a 1-letter code.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        # NOTE(review): indentation was lost in the extracted source; the
        # by_name registrations are assumed to share the condition of the
        # by_code3 registrations right above them -- confirm against the
        # repository.
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
            cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
            Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (or None) nothing is created; previously the
        default value itself was iterated, which raised TypeError.
        """
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        The lookup is done on the upper-cased code; the code exactly as
        given is kept in the `input_code1` attribute of the new monomer.
        Raises KeyError for an unknown code.
        """
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError for an unknown code.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before the lookup.
        Raises KeyError for an unknown name.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None, used for gaps) compares unequal.
        Comparing monomers of different types trips an assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

103

class MarkupContainerMixin(object):
    """Markup bookkeeping shared by sequences and alignments.

    The host class calls `_init` from its constructor. `add_markup` reads
    the host's `kind` attribute when no `markup_class` is given.
    """

    def _init(self):
        """Create the empty `markups` registry; call from host `__init__`."""
        self.markups = dict()

    def add_markup(self, name, markup_class=None, use_existing=False, **kws):
        """Create a markup object, register it on self and return it.

        - `name` is the key of the markup in the `self.markups` dictionary
        - optional `markup_class` is the class of the markup to create
        - if optional `use_existing` is true, an already registered markup
          of that name is not an error: it is returned unchanged
        - remaining keyword arguments are handed to the markup constructor

        User markups need both `name` and `markup_class`; for the standard
        automatic markups `name` alone is enough, the class is then looked
        up in `markups.by_name`.
        """
        # The import lives here rather than in the module header: `markups`
        # builds on `base` for inheritance, so importing it at module level
        # would create an import loop.
        import markups
        if markup_class is None:
            registry_key = (self.kind + "_" + "markup", name)
            markup_class = markups.by_name[registry_key]
        if use_existing and name in self.markups:
            found = self.markups[name]
            assert found.__class__ is markup_class
            return found
        assert name not in self.markups
        created = markup_class(self, name, caller='container', **kws)
        self.markups[name] = created
        return created

    def remove_markup(self, name):
        """Let the named markup clean up after itself, then unregister it."""
        doomed = self.markups[name]
        doomed.remove()
        del self.markups[name]

144

class Sequence(list, MarkupContainerMixin):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    description = ''
    source = ''

    def __init__(self, *args):
        """Accept the same arguments as `list` and set up markups."""
        list.__init__(self, *args)
        MarkupContainerMixin._init(self)

    @classmethod
    def from_monomers(cls, monomers=None, name=None, description=None,
                      source=None):
        """Create sequence from a list of monomer objects.

        `monomers` may be omitted (or None) for an empty sequence. `name`,
        `description` and `source` are set on the result only when they are
        true values; otherwise the class-level empty strings stay in effect.
        """
        # None replaces the former mutable `[]` default; the result is the
        # same empty sequence.
        result = cls(monomers if monomers is not None else ())
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes.

        Every character is turned into a monomer with
        `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)

204

class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        The sequence is placed without gaps starting at the first column.
        Columns are added on the right if the sequence is longer than the
        alignment; a shorter sequence has no entry in the trailing columns.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` produce no monomer; every other
        character becomes a monomer of a new `self.types.Sequence`.
        """
        sequence_class = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = sequence_class.from_string(
            without_gaps, name, description, source
        )
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        False items of `row` (e.g. None) are treated as gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        for i in range(len(self.columns), n):
            self.columns.append(Column())

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to `file` in the given `format`. Return self.

        `format` defaults to 'fasta'; `gap` is handed to `fileio.File` as
        its `gaps` argument.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Join once instead of growing the string column by column.
            text = "".join(
                column[sequence].code1 if sequence in column else gap
                for column in self.columns
            )
            string = util.UserString(text)
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Each item has attribute `column` pointing to the real column.

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other value
        raises AssertionError.

        Deprecated: emits DeprecationWarning; use `realign` instead.
        """
        # The file never defines or imports a `deprecated` helper, so the
        # former bare call raised NameError; warn through the standard
        # library instead.
        import warnings
        warnings.warn(
            "aln.flush('left') is deprecated in favor of aln.realign(Left())",
            DeprecationWarning, stacklevel=2
        )
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk backwards so that deletions do not shift pending indices.
        for n, column in reversed(list(enumerate(self.columns))):
            if not column:
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.
        Elements of `new` beyond the length of `dst` are appended as they
        are; elements of `dst` beyond the length of `new` are dropped.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of the new
        sequences are copied onto the existing ones as well.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # List comprehensions rather than len(filter(...)), which only
            # works where filter() returns a list.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment.

        Sequence contents are replaced only if `copy_contents` is true; the
        gap pattern is always replaced.
        """
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.

        Deprecated: emits DeprecationWarning; use `realign` instead.
        """
        # See flush(): `deprecated` is not defined anywhere in this file.
        import warnings
        warnings.warn(
            "aln.process() is deprecated and may cause damage to your"
            " alignment! It will be removed from 1.4.1 realease."
            " Use aln.realign() instead.",
            DeprecationWarning, stacklevel=2
        )
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False)
        """
        new = function(self)
        self._replace_column_contents(new)

493

class Column(dict):
    """Single column of an alignment.

    A column is a dict of { sequence : monomer }.

    A sequence that has a gap at this position has no key in the column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by identity, so it can serve as a dict key."""
        return id(self)

509

class Block(Alignment):
    """Block of alignment.

    A block is the intersection of a set of rows with a set of columns.
    Both sets are kept as ordered lists, so the display order of the
    alignment is retained and can be tweaked. Most blocks look like a
    rectangular part of the alignment once its rows are shuffled suitably.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        Omitted `sequences` mean all sequences of the alignment; omitted
        `columns` mean all columns of the alignment.

        In either case the very list object of the alignment is used, so
        sequences or columns added to the alignment later show up in the
        block too.
        """
        chosen_sequences = alignment.sequences if sequences is None else sequences
        chosen_columns = alignment.columns if columns is None else columns
        block = cls()
        block.alignment = alignment
        block.sequences = chosen_sequences
        block.columns = chosen_columns
        return block

548

class Markup(object):
    """Base class for sequence and alignment markups.

    A container is either a sequence or an alignment; its elements are
    monomers or columns respectively.

    Markup behaves like a dictionary of [element] -> value.

    Every container has a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements."""

    save = True
    """If set to false, fileio should not save this markup."""

    def __init__(self, container, name, **kwargs):
        """Markup takes mandatory container and name and optional kwargs.

        Markups are not meant to be created by the user; Sequence or
        Alignment create them and identify themselves by passing
        `caller='container'`. Anything else trips an assertion.
        """
        self.name = name
        caller = kwargs.get('caller')
        assert caller == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recalculate markup values (if they are generated automatically)."""

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!"""

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore markup from `record`. (Used for loading from file).

        `record` is a dict of all metadata and data related to one markup. All
        keys and values in `record` are strings, markup must parse them itself.

        Markup values should be stored in `record['markup']`, which is a list
        of items separated with either `record['separator']` or a comma.

        This base implementation does not read `record`; it only adds an
        empty markup of this class to `container`.
        """
        return container.add_markup(name, markup_class=cls)

    def to_record(self):
        """Save markup to `record`, for saving to file.

        For description of `record` see docstring for `from_record` method.
        """
        return dict()

    def sorted_keys(self):
        """Return list of elements in the container in proper order."""
        raise NotImplementedError()

    def sorted_values(self, **kw):
        """Yield markup values in the order of `sorted_keys()`.

        Possible arguments:

        - `map` -- a function, applied to each existing value
        - `default` -- a value to yield for non-existing values

        If `default` is not specified, the function fails on markups that do
        not have all of the values set.
        """
        has_default = 'default' in kw
        fallback = kw.get('default')
        convert = kw.get('map', lambda value: value)
        for element in self.sorted_keys():
            if element in self or not has_default:
                yield convert(self[element])
            else:
                yield fallback

628

class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes named
    after the markup (`self.name`).
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        """Remember the sequence, then initialize as a regular Markup."""
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!

        Deletes the markup value from every monomer of the sequence that
        carries one.
        """
        # The monomers live in `self.sequence`; the `self.monomers` used here
        # before is never assigned in this class, so remove() always raised
        # AttributeError. Monomers without a value are skipped, because
        # deleting a missing attribute would raise as well.
        for monomer in self.sequence:
            if monomer in self:
                del self[monomer]

    def sorted_keys(self):
        """Return list of monomers."""
        return self.sequence

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)

674

class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    It is a dictionary of [column] -> value. Value may be anything or
    something specific, depending on subclass.
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        """Remember the alignment, then initialize as a regular Markup."""
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the alignment's list of columns."""
        owner = self.alignment
        return owner.columns

691

692 # vim: set ts=4 sts=4 sw=4 et: