allpy: 450cbb7bcf51 allpy/base.py

allpy

view allpy/base.py @ 691:450cbb7bcf51

base: Extracted common part of different base classes of markups to a common ancestor class

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Tue, 05 Jul 2011 19:10:47 +0400
parents	df624c729ab5
children	80e6db5daf37

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

default_gaps = set((".", "-", "~"))
"""Set of characters to recognize as gaps when parsing alignment."""

class Monomer(object):
    """Monomer object.

    Concrete monomers are subclasses generated by `_subclass` and recorded in
    the `by_code1`, `by_code3` and `by_name` lookup tables. Instances are
    created with `from_code1`, `from_code3` and `from_name`.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        `name` is stripped and capitalized, `code1` and `code3` are
        upper-cased; the normalized values become class attributes of the new
        subclass together with `is_modified`.

        The subclass is stored in the object found in `data.monomers` under
        the name `cls.type`, and registered in the lookup tables of `cls` and
        of `Monomer`. Nothing is returned.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        # Any non-word character of the name is replaced to get an identifier.
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        # NOTE(review): only the 1-letter registration is taken to depend on
        # `is_modified`; the nesting of the following lines should be
        # confirmed against the repository copy of this file.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (or None) nothing is created.
        """
        # `codes or ()` keeps the documented default usable: iterating over
        # None would raise TypeError.
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        The lookup is case-insensitive; the code exactly as given is kept in
        the `input_code1` attribute of the result. Raises KeyError for an
        unknown code.
        """
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError for an unknown code.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup. Raises KeyError
        for an unknown name.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None, used for gaps) is never equal to a
        monomer. Comparing monomers of different `type` is a programming
        error and trips the assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        """Inverse of `__eq__`."""
        return not (self == other)

101

class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)

    Every instance also gets its own `markups` dict.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    name = ''
    description = ''
    source = ''

    def __init__(self, *args):
        """Initialize like a list, with an empty `markups` dict."""
        self.markups = {}
        list.__init__(self, *args)

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None, source=None):
        """Create sequence from a list of monomer objects.

        `name`, `description` and `source` are only assigned when they are
        true values; otherwise the class-level defaults (empty strings) stay
        in effect.
        """
        # The default for `monomers` is an immutable empty tuple rather than
        # a shared mutable list; the resulting sequence is empty either way.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequences from string of one-letter codes.

        Each character of `string` is turned into a monomer with
        `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        """Show the name if there is one, else the one-letter codes."""
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity.

        Plain lists are unhashable; hashing by `id` lets sequences be used as
        dictionary keys.
        """
        return id(self)

158

class Alignment(object):
    """Alignment. It is a list of Columns.

    An alignment holds an ordered list of `sequences`, an ordered list of
    `columns` (each a mapping {sequence: monomer}; a missing key means a gap)
    and a `markups` dict.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        self.markups = {}

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        The monomers are placed into the leftmost columns, one per column;
        the alignment is widened when the sequence is longer than it, and
        columns to the right of a shorter sequence are left as gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; all other characters are
        converted into a new sequence with `self.types.Sequence.from_string`.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        # Columns at the positions of non-gap characters, in order; they pair
        # up one-to-one with the monomers of the gapless sequence.
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` is a list with monomers and false values (e.g. None) for gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        for i in range(len(self.columns), n):
            self.columns.append(Column())

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.

        `file.name` is recorded as the source of every appended sequence.
        """
        io = fileio.File(file, format)
        for name, description, body in io.read_strings():
            self.append_row_from_string(body, name, description, file.name, gaps)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file as sequences with gaps.

        `format` is handed to `fileio.File` (defaults to 'fasta'); `gap` is
        the symbol written for gap positions.
        """
        # `gap` has to be forwarded explicitly, otherwise rows are always
        # rendered with the default gap symbol.
        strings = [(s, s.sequence.name, s.sequence.description)
            for s in self.rows_as_strings(gap)]
        fileio.File(file, format).write_strings(strings)

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            chars = []
            for column in self.columns:
                if sequence in column:
                    chars.append(column[sequence].code1)
                else:
                    chars.append(gap)
            string = util.UserString("".join(chars))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each list has attribute `column` pointing to the column it describes.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other value
        raises AssertionError.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk from the right so that deletions do not shift the indices
        # that are still to be visited.
        for n in reversed(range(len(self.columns))):
            if not self.columns[n]:
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.

        Elements present in both lists are merged in place; if `new` is
        longer, its extra elements are appended to `dst` as they are; if
        `new` is shorter, the tail of `dst` is dropped.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of the new
        sequences are copied onto the existing ones as well.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            # A monomer is turned into another one by switching its class.
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            # Only gaps may move; the number of monomers must be preserved.
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment.

        Sequence contents are only replaced when `copy_contents` is true;
        gap patterns are always copied.
        """
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False)
        """
        new = function(self)
        self._replace_column_contents(new)

440

class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }.

    A sequence that has a gap at this position simply has no key in the
    column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by identity (plain dicts are not hashable)."""
        return id(self)

456

class Block(Alignment):
    """Block of alignment.

    A block is the intersection of some rows and some columns of an
    alignment. Both collections are kept as ordered lists, so that the
    display order of the alignment is retained (or may be tweaked). Most
    blocks look like a rectangular part of the alignment once its rows are
    shuffled the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        Omitted `sequences` default to all sequences of the alignment;
        omitted `columns` default to all of its columns.

        The defaults are the very list objects held by the alignment, so a
        block built this way also sees sequences or columns added to the
        alignment later.
        """
        chosen_sequences = alignment.sequences if sequences is None else sequences
        chosen_columns = alignment.columns if columns is None else columns
        block = cls()
        block.alignment = alignment
        block.sequences = chosen_sequences
        block.columns = chosen_columns
        return block

495

class Markup(object):
    """Common ancestor of markup classes.

    Provides registration of a markup in the `markups` dict of its container
    and a no-op `refresh` hook.
    """

    name = None
    """Name of markup elements"""

    def _register(self, container, name):
        """Register the markup in `container.markups` under its name.

        If `name` is a true value it replaces `self.name`; otherwise the
        existing `self.name` (e.g. a class-level one) is used.

        Raises AssertionError if the markup ends up without a name or if the
        container already has a markup with the same name.
        """
        if name:
            self.name = name
        # Explicit raises rather than `assert` statements: the checks must
        # not disappear when Python runs with -O, or a duplicate name would
        # silently overwrite an existing markup.
        if self.name is None:
            raise AssertionError("Markup must have a name")
        if self.name in container.markups:
            raise AssertionError(
                "Markup %r is already registered" % (self.name,))
        container.markups[self.name] = self

    def refresh(self):
        """Hook for subclasses; does nothing by default."""
        pass

510

class SequenceMarkup(Markup):
    """Markup of a sequence.

    Values are kept on the monomers themselves, in the attribute named after
    the markup; the markup object gives dict-like access to them.
    """

    def __init__(self, sequence, name=None):
        """Attach the markup to `sequence`, register it and refresh it."""
        self.sequence = sequence
        self._register(sequence, name)
        self.refresh()

    def __contains__(self, monomer):
        """Tell whether `monomer` belongs to the marked-up sequence."""
        return monomer in self.sequence

    def __getitem__(self, monomer):
        """Return the markup value stored on `monomer`."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Store the markup `value` on `monomer`."""
        setattr(monomer, self.name, value)

526

class AlignmentMarkup(dict, Markup):
    """Markup of an alignment; the markup object itself is a dict."""

    def __init__(self, alignment, name=None):
        """Attach the markup to `alignment`, register it and refresh it."""
        self.alignment = alignment
        self._register(alignment, name)
        self.refresh()

533

534 # vim: set ts=4 sts=4 sw=4 et: