allpy: fd531580b9af allpy/base.py

allpy

view allpy/base.py @ 724:fd531580b9af

Merge between Burkov and others.

author	Boris Burkov <BurkovBA@gmail.com>
date	Fri, 08 Jul 2011 16:20:20 +0400
parents	b71f9c1d1509 83a40cd34923
children	0a7917b4ca71

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

11 default_gaps = set((".", "-", "~"))

12 """Set of characters to recoginze as gaps when parsing alignment."""

14 class Monomer(object):

15 """Monomer object."""

17 type = None

18 """Either of 'dna', 'rna', 'protein'."""

20 types = base

21 """Mapping of related types. SHOULD be redefined in subclasses."""

23 by_code1 = {}

24 """A mapping from 1-letter code to Monomer subclass."""

26 by_code3 = {}

27 """A mapping from 3-letter code to Monomer subclass."""

29 by_name = {}

30 """A mapping from full monomer name to Monomer subclass."""

32 @classmethod

33 def _subclass(cls, name='', code1='', code3='', is_modified=False):

34 """Create new subclass of Monomer for given monomer type."""

35 class TheMonomer(cls):

36 pass

37 name = name.strip().capitalize()

38 code1 = code1.upper()

39 code3 = code3.upper()

40 module = vars(data.monomers)[cls.type]

41 TheMonomer.__name__ = re.sub(r"\W", "_", name)

42 TheMonomer.__module__ = module.__name__

43 TheMonomer.name = name

44 TheMonomer.code1 = code1

45 TheMonomer.code3 = code3

46 TheMonomer.is_modified = is_modified

47 # Save the class in data.monomers so that it can be pickled

48 # Some names are not unique, we append underscores to them

49 # in order to fix it.

50 while TheMonomer.__name__ in vars(module):

51 TheMonomer.__name__ += "_"

52 vars(module)[TheMonomer.__name__] = TheMonomer

53 if not is_modified:

54 cls.by_code1[code1] = TheMonomer

55 cls.by_code3[code3] = TheMonomer

56 cls.by_name[name] = TheMonomer

57 # We duplicate distinguished long names into Monomer itself, so that we

58 # can use Monomer.from_code3 to create the relevant type of monomer.

59 Monomer.by_code3[code3] = TheMonomer

60 Monomer.by_name[name] = TheMonomer

62 @classmethod

63 def _initialize(cls, codes=None):

64 """Create all relevant subclasses of Monomer."""

65 for code1, is_modified, code3, name in codes:

66 cls._subclass(name, code1, code3, is_modified)

68 @classmethod

69 def from_code1(cls, code1):

70 """Create new monomer from 1-letter code."""

71 return cls.by_code1[code1.upper()]()

73 @classmethod

74 def from_code3(cls, code3):

75 """Create new monomer from 3-letter code."""

76 return cls.by_code3[code3.upper()]()

78 @classmethod

79 def from_name(cls, name):

80 """Create new monomer from full name."""

81 return cls.by_name[name.strip().capitalize()]()

83 def __repr__(self):

84 return "<Monomer %s>" % str(self.code1)

86 def __str__(self):

87 """Returns one-letter code"""

88 return self.code1

90 def __eq__(self, other):

91 """Monomers within same monomer type are compared by code1."""

92 if not other:

93 return False

94 assert self.type == other.type

95 return self.code1 == other.code1

97 def __ne__(self, other):

98 return not (self == other)

100 class Sequence(list):

101 """Sequence of Monomers.

102

103 This behaves like list of monomer objects. In addition to standard list

104 behaviour, Sequence has the following attributes:

105

106 * name -- str with the name of the sequence

107 * description -- str with description of the sequence

108 * source -- str denoting source of the sequence

109

110 Any of them may be empty (i.e. hold empty string)

111 """

112

113 types = base

114 """Mapping of related types. SHOULD be redefined in subclasses."""

115

116 name = ''

117 description = ''

118 source = ''

119

120 def __init__(self, *args):

121 self.markups = {}

122 list.__init__(self, *args)

123

124 @classmethod

125 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

126 """Create sequence from a list of monomer objecst."""

127 result = cls(monomers)

128 if name:

129 result.name = name

130 if description:

131 result.description = description

132 if source:

133 result.source = source

134 return result

135

136 @classmethod

137 def from_string(cls, string, name='', description='', source=''):

138 """Create sequences from string of one-letter codes."""

139 monomer = cls.types.Monomer.from_code1

140 monomers = [monomer(letter) for letter in string]

141 return cls.from_monomers(monomers, name, description, source)

142

143 def __repr__(self):

144 if self.name:

145 return '<Sequence %s>' % str(self.name)

146 else:

147 return '<Sequence %s>' % str(self)

148

149

150 def __str__(self):

151 """Returns sequence of one-letter codes."""

152 return ''.join(monomer.code1 for monomer in self)

153

154 def __hash__(self):

155 """Hash sequence by identity."""

156 return id(self)

157

158

159 class Alignment(object):

160 """Alignment. It is a list of Columns."""

161

162 types = base

163 """Mapping of related types. SHOULD be redefined in subclasses."""

164

165 sequences = None

166 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

167

168 def __init__(self):

169 """Initialize empty alignment."""

170 self.sequences = []

171 self.columns = []

172 self.markups = {}

173

174 # Alignment grow & IO methods

175 # ==============================

176

177 def append_sequence(self, sequence):

178 """Add sequence to alignment. Return self.

179

180 If sequence is too short, pad it with gaps on the right.

181 """

182 self.sequences.append(sequence)

183 self._pad_to_width(len(sequence))

184 for column, monomer in zip(self.columns, sequence):

185 column[sequence] = monomer

186 return self

187

188 def append_row_from_string(self, string,

189 name='', description='', source='', gaps=default_gaps):

190 """Add row from a string of one-letter codes and gaps. Return self."""

191 Sequence = self.types.Sequence

192 without_gaps = util.remove_each(string, gaps)

193 sequence = Sequence.from_string(without_gaps, name, description, source)

194 self._pad_to_width(len(string))

195 non_gap_columns = [column

196 for column, char in zip(self.columns, string)

197 if char not in gaps

198 ]

199 for monomer, column in zip(sequence, non_gap_columns):

200 column[sequence] = monomer

201 self.sequences.append(sequence)

202 return self

203

204 def append_row_with_gaps(self, row, sequence):

205 """Add row from row_as_list representation and sequence. Return self."""

206 self.sequences.append(sequence)

207 self._pad_to_width(len(row))

208 for column, monomer in zip(self.columns, row):

209 if monomer:

210 column[sequence] = monomer

211 return self

212

213 def _pad_to_width(self, n):

214 """Pad alignment with empty columns on the right to width n."""

215 for i in range(len(self.columns), n):

216 self.columns.append(Column())

217

218 def append_file(self, file, format='fasta', gaps=default_gaps):

219 """Append sequences from file to alignment. Return self.

220

221 If sequences in file have gaps (detected as characters belonging to

222 `gaps` set), treat them accordingly.

223 """

224 fileio.File(file, format, gaps=gaps).read_alignment(self)

225 return self

226

227 def to_file(self, file, format='fasta', gap='-'):

228 """Write alignment in FASTA file as sequences with gaps."""

229 fileio.File(file, format, gaps=gap).write_alignment(self)

230 return self

231

232 # Data access methods for alignment

233 # =================================

234

235 def rows(self):

236 """Return list of rows (temporary objects) in alignment.

237

238 Each row is a dictionary of { column : monomer }.

239

240 For gap positions there is no key for the column in row.

241

242 Each row has attribute `sequence` pointing to the sequence the row is

243 describing.

244

245 Modifications of row have no effect on the alignment.

246 """

247 # For now, the function returns a list rather than iterator.

248 # It is yet to see, whether memory performance here becomes critical,

249 # or is random access useful.

250 rows = []

251 for sequence in self.sequences:

252 row = util.UserDict()

253 row.sequence = sequence

254 for column in self.columns:

255 if sequence in column:

256 row[column] = column[sequence]

257 rows.append(row)

258 return rows

259

260 def rows_as_lists(self):

261 """Return list of rows (temporary objects) in alignment.

262

263 Each row here is a list of either monomer or None (for gaps).

264

265 Each row has attribute `sequence` pointing to the sequence of row.

266

267 Modifications of row have no effect on the alignment.

268 """

269 rows = []

270 for sequence in self.sequences:

271 row = util.UserList()

272 row.sequence = sequence

273 for column in self.columns:

274 row.append(column.get(sequence))

275 rows.append(row)

276 return rows

277

278 def rows_as_strings(self, gap='-'):

279 """Return list of string representation of rows in alignment.

280

281 Each row has attribute `sequence` pointing to the sequence of row.

282

283 `gap` is the symbol to use for gap.

284 """

285 rows = []

286 for sequence in self.sequences:

287 string = ""

288 for column in self.columns:

289 if sequence in column:

290 string += column[sequence].code1

291 else:

292 string += gap

293 string = util.UserString(string)

294 string.sequence = sequence

295 rows.append(string)

296 return rows

297

298 def row_as_list(self, sequence):

299 """Return representaion of row as list with `Monomers` and `None`s."""

300 return [column.get(sequence) for column in self.columns]

301

302 def row_as_string(self, sequence, gap='-'):

303 """Return string representaion of row in alignment.

304

305 String will have gaps represented by `gap` symbol (defaults to '-').

306 """

307 def char(monomer):

308 if monomer:

309 return monomer.code1

310 return gap

311 row = self.row_as_list(sequence)

312 return "".join(map(char, row))

313

314 def columns_as_lists(self):

315 """Return list of columns (temorary objects) in alignment.

316

317 Each column here is a list of either monomer or None (for gaps).

318

319 Items of column are sorted in the same way as alignment.sequences.

320

321 Modifications of column have no effect on the alignment.

322 """

323 columns = []

324 for column in self.columns:

325 col = util.UserList()

326 col.column = column

327 for sequence in self.sequences:

328 col.append(column.get(sequence))

329 columns.append(col)

330 return columns

331

332 # Alignment / Block editing methods

333 # =================================

334

335 def flush(self, whence='left'):

336 """Remove all gaps from alignment and flush results to one side.

337

338 `whence` must be one of 'left', 'right' or 'center'

339 """

340 if whence == 'left':

341 from processors import Left as Flush

342 elif whence == 'right':

343 from processors import Right as Flush

344 elif whence == 'center':

345 from processors import Center as Flush

346 else:

347 raise AssertionError, "Whence must be left, right or center"

348 self.realign(Flush())

349

350 def remove_gap_columns(self):

351 """Remove all empty columns."""

352 for n, column in reversed(list(enumerate(self.columns))):

353 if column == {}:

354 self.columns[n:n+1] = []

355

356 def _wipe_row(self, sequence):

357 """Turn all row positions into gaps (but keep sequences intact)."""

358 for column in self.columns:

359 if sequence in column:

360 del column[sequence]

361

362 def _merge(self, dst, new, merge):

363 """Replace contents of `dst` with those of `new`.

364

365 Replace contents of elements using function `merge(dst_el, new_le)`.

366 """

367 for el, new_el in zip(dst, new):

368 merge(el, new_el)

369 dst[len(dst):] = new[len(dst):]

370 del dst[len(new):]

371

372 def _replace_sequence_contents(self, new, copy_descriptions):

373 """Replace contents of sequences with those of `new` alignment."""

374 # XXX: we manually copy sequence contents here

375 # XXX: we only copy, overlapping parts and link to the rest

376 def merge_monomers(dst, new):

377 dst.__class__ = new.__class__

378 def merge_sequences(dst, new):

379 if copy_descriptions:

380 vars(dst).update(vars(new))

381 self._merge(dst, new, merge_monomers)

382 self._merge(self.sequences, new.sequences, merge_sequences)

383

384 def _replace_column_contents(self, new):

385 """Replace column contents with those of `new` alignment.

386

387 In other words: copy gap patterns from `new` to `self`.

388

389 `self.sequences` and `new.sequences` should have the same contents.

390 """

391 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

392 sequence = row.sequence

393 monomers = filter(None, row)

394 assert len(monomers) == len(filter(None, new_row))

395 self._wipe_row(sequence)

396 non_gap_columns = [column

397 for column, monomer in zip(self.columns, new_row)

398 if monomer

399 ]

400 for monomer, column in zip(monomers, non_gap_columns):

401 column[sequence] = monomer

402

403 def _replace_contents(self, new, copy_descriptions, copy_contents):

404 """Replace alignment contents with those of other alignment."""

405 if copy_contents:

406 self._replace_sequence_contents(new, copy_descriptions)

407 self._replace_column_contents(new)

408

409 def process(self, function, copy_descriptions=True, copy_contents=True):

410 """Apply function to the alignment (or block); inject results back.

411

412 - `function(block)` must return block with same line order.

413 - if `copy_descriptions` is False, ignore new sequence names.

414 - if `copy_contents` is False, don't copy sequence contents too.

415

416 `function` (object) may have attributes `copy_descriptions` and

417 `copy_contents`, which override the same named arguments.

418 """

419 new = function(self)

420 if hasattr(function, 'copy_descriptions'):

421 copy_descriptions = function.copy_descriptions

422 if hasattr(function, 'copy_contents'):

423 copy_contents = function.copy_contents

424 self._replace_contents(new, copy_descriptions, copy_contents)

425

426 def realign(self, function):

427 """Realign self.

428

429 I.e.: apply function to self to produce a new alignment, then update

430 self to have the same gap patterns as the new alignment.

431

432 This is the same as process(function, False, False)

433 """

434 new = function(self)

435 self._replace_column_contents(new)

436

437 class Column(dict):

438 """Column of alignment.

439

440 Column is a dict of { sequence : monomer }.

441

442 For sequences that have gaps in current row, given key is not present in

443 the column.

444 """

445

446 types = base

447 """Mapping of related types. SHOULD be redefined in subclasses."""

448

449 def __init__(self, alignment):

450 self.alignment = alignment

451 super(Column, self).__init__()

452

453 def __hash__(self):

454 """Return hash by identity."""

455 return id(self)

456

457 def MyIndex(self):

458 for index, column in enumerate(self.alignment.columns):

459 if column is self: return index

460 raise ValueException

461

462 def __repr__(self):

463 #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES

464 return "<Column %s>"%(str(self.MyIndex()))

469 class Block(Alignment):

470 """Block of alignment.

471

472 Block is an intersection of several rows & columns. (The collections of

473 rows and columns are represented as ordered lists, to retain display order

474 of Alignment or add ability to tweak it). Most of blocks look like

475 rectangular part of alignment if you shuffle alignment rows the right way.

476 """

477

478 alignment = None

479 """Alignment the block belongs to."""

480

481 sequences = ()

482 """List of sequences in block."""

483

484 columns = ()

485 """List of columns in block."""

486

487 @classmethod

488 def from_alignment(cls, alignment, sequences=None, columns=None):

489 """Build new block from alignment.

490

491 If sequences are not given, the block uses all sequences in alignment.

492

493 If columns are not given, the block uses all columns in alignment.

494

495 In both cases we use exactly the list used in alignment, thus, if new

496 sequences or columns are added to alignment, the block tracks this too.

497 """

498 if sequences is None:

499 sequences = alignment.sequences

500 if columns is None:

501 columns = alignment.columns

502 block = cls()

503 block.alignment = alignment

504 block.sequences = sequences

505 block.columns = columns

506 return block

507

508 class Markup(object):

509 """Base class for sequence and alignment markups.

510

511 We shall call either sequence or alignment a container. And we shall call

512 either monomers or columns elements respectively.

513

514 Markup behaves like a dictionary of [element] -> value.

515

516 Every container has a dictionary of [name] -> markup. It is Markup's

517 responsibility to add itself to this dictionary and to avoid collisions

518 while doing it.

519 """

520

521 name = None

522 """Name of markup elements"""

523

524 def _register(self, container, name):

525 """Register self within container.

526

527 Assure the name is not taken before. If name is not given, look in the

528 class. Make sure we have some name at all.

529 """

530 if name:

531 self.name = name

532 assert self.name is not None

533 assert self.name not in container.markups

534 container.markups[self.name] = self

535

536 def refresh(self):

537 """Recalculate markup values (if they are generated automatically)."""

538 pass

539

540 @classmethod

541 def from_record(cls, container, record, name=None):

542 """Restore markup from `record`. (Used for loading from file).

543

544 `record` is a dict of all metadata and data related to one markup. All

545 keys and values in `record` are strings, markup must parse them itself.

546

547 Markup values should be stored in `record['markup']`, which is a list

548 of items separated with either `record['separator']` or a comma.

549 """

550 return cls(container, name)

551

552 def to_record(self):

553 """Save markup to `record`, for saving to file.

554

555 For description of `record` see docstring for `from_record` method.

556 """

557 return {}

558

559 def sorted_keys(self):

560 """Return list of elements in the container in proper order."""

561 raise NotImplementedError()

562

563 def sorted_values(self):

564 """Return list of markup values in container."""

565 raise NotImplementedError()

566

567 class SequenceMarkup(Markup):

568 """Markup for sequence.

569

570 Behaves like a dictionary of [monomer] -> value. Value may be anything

571 or something specific, depending on subclass.

572

573 Actual values are stored in monomers themselves as attributes.

574 """

575

576 def __init__(self, sequence, name=None):

577 self.sequence = sequence

578 self._register(sequence, name)

579 self.refresh()

580

581 def sorted_keys(self):

582 """Return list of monomers."""

583 return self.sequence

584

585 def sorted_values(self):

586 """Return list of markup values, if every monomer is marked up."""

587 return (self[monomer] for monomer in self.sequence)

588

589 def get(self, key, value=None):

590 """Part of Mapping collection interface."""

591 if key not in self:

592 return value

593 return self[key]

594

595 def __contains__(self, monomer):

596 """Part of Mapping collection interface."""

597 return hasattr(monomer, self.name)

598

599 def __getitem__(self, monomer):

600 """Part of Mapping collection interface."""

601 return getattr(monomer, self.name)

602

603 def __setitem__(self, monomer, value):

604 """Part of Mapping collection interface."""

605 return setattr(monomer, self.name, value)

606

607 class AlignmentMarkup(dict, Markup):

608 """Markupf for alignment.

609

610 Is a dictionary of [column] -> value. Value may be anything or something

611 specific, depending on subclass.

612 """

613

614 def __init__(self, alignment, name=None):

615 self.alignment = alignment

616 self._register(alignment, name)

617 self.refresh()

618

619 def sorted_keys(self):

620 """Return a list of columns."""

621 return self.alignment.columns

622

623 def sorted_values(self):

624 """Return a list of makrup values, if every column is marked up."""

625 return (self[column] for column in self.alignment.columns)

626

627 # vim: set ts=4 sts=4 sw=4 et: