allpy: 95185a216b97 allpy/base.py

allpy

view allpy/base.py @ 650:95185a216b97

NEWS entry for new methods of Alignment by BurkovBA

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Fri, 10 Jun 2011 16:07:13 +0400
parents	b35116e13f35 88c246f20918
children	5c38ec697295

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

11 default_gaps = set((".", "-", "~"))

12 """Set of characters to recoginze as gaps when parsing alignment."""

14 class Monomer(object):

15 """Monomer object."""

17 type = None

18 """Either of 'dna', 'rna', 'protein'."""

20 types = base

21 """Mapping of related types. SHOULD be redefined in subclasses."""

23 by_code1 = {}

24 """A mapping from 1-letter code to Monomer subclass."""

26 by_code3 = {}

27 """A mapping from 3-letter code to Monomer subclass."""

29 by_name = {}

30 """A mapping from full monomer name to Monomer subclass."""

32 @classmethod

33 def _subclass(cls, name='', code1='', code3='', is_modified=False):

34 """Create new subclass of Monomer for given monomer type."""

35 class TheMonomer(cls):

36 pass

37 name = name.strip().capitalize()

38 code1 = code1.upper()

39 code3 = code3.upper()

40 module = vars(data.monomers)[cls.type]

41 TheMonomer.__name__ = re.sub(r"\W", "_", name)

42 TheMonomer.__module__ = module.__name__

43 TheMonomer.name = name

44 TheMonomer.code1 = code1

45 TheMonomer.code3 = code3

46 TheMonomer.is_modified = is_modified

47 # Save the class in data.monomers so that it can be pickled

48 # Some names are not unique, we append underscores to them

49 # in order to fix it.

50 while TheMonomer.__name__ in vars(module):

51 TheMonomer.__name__ += "_"

52 vars(module)[TheMonomer.__name__] = TheMonomer

53 if not is_modified:

54 cls.by_code1[code1] = TheMonomer

55 cls.by_code3[code3] = TheMonomer

56 cls.by_name[name] = TheMonomer

57 # We duplicate distinguished long names into Monomer itself, so that we

58 # can use Monomer.from_code3 to create the relevant type of monomer.

59 Monomer.by_code3[code3] = TheMonomer

60 Monomer.by_name[name] = TheMonomer

62 @classmethod

63 def _initialize(cls, codes=None):

64 """Create all relevant subclasses of Monomer."""

65 for code1, is_modified, code3, name in codes:

66 cls._subclass(name, code1, code3, is_modified)

68 @classmethod

69 def from_code1(cls, code1):

70 """Create new monomer from 1-letter code."""

71 return cls.by_code1[code1.upper()]()

73 @classmethod

74 def from_code3(cls, code3):

75 """Create new monomer from 3-letter code."""

76 return cls.by_code3[code3.upper()]()

78 @classmethod

79 def from_name(cls, name):

80 """Create new monomer from full name."""

81 return cls.by_name[name.strip().capitalize()]()

83 def __repr__(self):

84 return "<Monomer %s>" % str(self.code1)

86 def __str__(self):

87 """Returns one-letter code"""

88 return self.code1

90 def __eq__(self, other):

91 """Monomers within same monomer type are compared by code1."""

92 if not other:

93 return False

94 assert self.type == other.type

95 return self.code1 == other.code1

97 def __ne__(self, other):

98 return not (self == other)

100 class Sequence(list):

101 """Sequence of Monomers.

102

103 This behaves like list of monomer objects. In addition to standard list

104 behaviour, Sequence has the following attributes:

105

106 * name -- str with the name of the sequence

107 * description -- str with description of the sequence

108 * source -- str denoting source of the sequence

109

110 Any of them may be empty (i.e. hold empty string)

111 """

112

113 types = base

114 """Mapping of related types. SHOULD be redefined in subclasses."""

115

116 name = ''

117 description = ''

118 source = ''

119

120 @classmethod

121 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

122 """Create sequence from a list of monomer objecst."""

123 result = cls(monomers)

124 if name:

125 result.name = name

126 if description:

127 result.description = description

128 if source:

129 result.source = source

130 return result

131

132 @classmethod

133 def from_string(cls, string, name='', description='', source=''):

134 """Create sequences from string of one-letter codes."""

135 monomer = cls.types.Monomer.from_code1

136 monomers = [monomer(letter) for letter in string]

137 return cls.from_monomers(monomers, name, description, source)

138

139 def __repr__(self):

140 if self.name:

141 return '<Sequence %s>' % str(self.name)

142 else:

143 return '<Sequence %s>' % str(self)

144

145

146 def __str__(self):

147 """Returns sequence of one-letter codes."""

148 return ''.join(monomer.code1 for monomer in self)

149

150 def __hash__(self):

151 """Hash sequence by identity."""

152 return id(self)

153

154 class Alignment(object):

155 """Alignment. It is a list of Columns."""

156

157 types = base

158 """Mapping of related types. SHOULD be redefined in subclasses."""

159

160 sequences = None

161 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

162

163 def __init__(self):

164 """Initialize empty alignment."""

165 self.sequences = []

166 self.columns = []

167

168 # Alignment grow & IO methods

169 # ==============================

170

171 def append_sequence(self, sequence):

172 """Add sequence to alignment. Return self.

173

174 If sequence is too short, pad it with gaps on the right.

175 """

176 self.sequences.append(sequence)

177 self._pad_to_width(len(sequence))

178 for column, monomer in zip(self.columns, sequence):

179 column[sequence] = monomer

180 return self

181

182 def append_row_from_string(self, string,

183 name='', description='', source='', gaps=default_gaps):

184 """Add row from a string of one-letter codes and gaps. Return self."""

185 Sequence = self.types.Sequence

186 without_gaps = util.remove_each(string, gaps)

187 sequence = Sequence.from_string(without_gaps, name, description, source)

188 self._pad_to_width(len(string))

189 non_gap_columns = [column

190 for column, char in zip(self.columns, string)

191 if char not in gaps

192 ]

193 for monomer, column in zip(sequence, non_gap_columns):

194 column[sequence] = monomer

195 self.sequences.append(sequence)

196 return self

197

198 def append_row_with_gaps(self, row, sequence):

199 """Add row from row_as_list representation and sequence. Return self."""

200 self.sequences.append(sequence)

201 self._pad_to_width(len(row))

202 for column, monomer in zip(self.columns, row):

203 if monomer:

204 column[sequence] = monomer

205 return self

206

207 def _pad_to_width(self, n):

208 """Pad alignment with empty columns on the right to width n."""

209 for i in range(len(self.columns), n):

210 self.columns.append(Column())

211

212 def append_file(self, file, format='fasta', gaps=default_gaps):

213 """Append sequences from file to alignment. Return self.

214

215 If sequences in file have gaps (detected as characters belonging to

216 `gaps` set), treat them accordingly.

217 """

218 sequences = []

219 io = fileio.File(file, format)

220 for name, description, body in io.read_strings():

221 self.append_row_from_string(body, name, description, file.name, gaps)

222 return self

223

224 def to_file(self, file, format='fasta', gap='-'):

225 """Write alignment in FASTA file as sequences with gaps."""

226 strings = [(s, s.sequence.name, s.sequence.description)

227 for s in self.rows_as_strings()]

228 fileio.File(file, format).write_strings(strings)

229

230 # Data access methods for alignment

231 # =================================

232

233 def row_as_list(self, sequence):

234 """Creates and returns temporary list of monomers and Nones"""

235 output=[]

236 for column in self.columns:

237 if sequence in column:

238 output.append(column[sequence])

239 else:

240 output.append(None)

241 return output

242

243 def row_as_string(self, sequence):

244 """Creates string of one-letter monomers' codes and gaps ("-")"""

245 def char(monomer):

246 if monomer:

247 return monomer.code1

248 return "-"

249 row = self.row_as_list(sequence)

250 list_of_letters = map(char, row)

251 output=""

252 for letter in list_of_letters: output+=letter

253 return output

254

255 def rows(self):

256 """Return list of rows (temporary objects) in alignment.

257

258 Each row is a dictionary of { column : monomer }.

259

260 For gap positions there is no key for the column in row.

261

262 Each row has attribute `sequence` pointing to the sequence the row is

263 describing.

264

265 Modifications of row have no effect on the alignment.

266 """

267 # For now, the function returns a list rather than iterator.

268 # It is yet to see, whether memory performance here becomes critical,

269 # or is random access useful.

270 rows = []

271 for sequence in self.sequences:

272 row = util.UserDict()

273 row.sequence = sequence

274 for column in self.columns:

275 if sequence in column:

276 row[column] = column[sequence]

277 rows.append(row)

278 return rows

279

280 def rows_as_lists(self):

281 """Return list of rows (temporary objects) in alignment.

282

283 Each row here is a list of either monomer or None (for gaps).

284

285 Each row has attribute `sequence` pointing to the sequence of row.

286

287 Modifications of row have no effect on the alignment.

288 """

289 rows = []

290 for sequence in self.sequences:

291 row = util.UserList()

292 row.sequence = sequence

293 for column in self.columns:

294 row.append(column.get(sequence))

295 rows.append(row)

296 return rows

297

298 def rows_as_strings(self, gap='-'):

299 """Return list of string representation of rows in alignment.

300

301 Each row has attribute `sequence` pointing to the sequence of row.

302

303 `gap` is the symbol to use for gap.

304 """

305 rows = []

306 for sequence in self.sequences:

307 string = ""

308 for column in self.columns:

309 if sequence in column:

310 string += column[sequence].code1

311 else:

312 string += gap

313 string = util.UserString(string)

314 string.sequence = sequence

315 rows.append(string)

316 return rows

317

318 def columns_as_lists(self):

319 """Return list of columns (temorary objects) in alignment.

320

321 Each column here is a list of either monomer or None (for gaps).

322

323 Items of column are sorted in the same way as alignment.sequences.

324

325 Modifications of column have no effect on the alignment.

326 """

327 columns = []

328 for column in self.columns:

329 col = []

330 for sequence in self.sequences:

331 col.append(column.get(sequence))

332 columns.append(col)

333 return columns

334

335 # Alignment / Block editing methods

336 # =================================

337

338 def flush(self, whence='left'):

339 """Remove all gaps from alignment and flush results to one side.

340

341 `whence` must be one of 'left', 'right' or 'center'

342 """

343 if whence == 'left':

344 from processors import Left as Flush

345 elif whence == 'right':

346 from processors import Right as Flush

347 elif whence == 'center':

348 from processors import Center as Flush

349 else:

350 raise AssertionError, "Whence must be left, right or center"

351 self.realign(Flush())

352

353 def remove_gap_columns(self):

354 """Remove all empty columns."""

355 for n, column in reversed(list(enumerate(self.columns))):

356 if column == {}:

357 self.columns[n:n+1] = []

358

359 def _wipe_row(self, sequence):

360 """Turn all row positions into gaps (but keep sequences intact)."""

361 for column in self.columns:

362 if sequence in column:

363 del column[sequence]

364

365 def _merge(self, dst, new, merge):

366 """Replace contents of `dst` with those of `new`.

367

368 Replace contents of elements using function `merge(dst_el, new_le)`.

369 """

370 for el, new_el in zip(dst, new):

371 merge(el, new_el)

372 dst[len(dst):] = new[len(dst):]

373 del dst[len(new):]

374

375 def _replace_sequence_contents(self, new, copy_descriptions):

376 """Replace contents of sequences with those of `new` alignment."""

377 # XXX: we manually copy sequence contents here

378 # XXX: we only copy, overlapping parts and link to the rest

379 def merge_monomers(dst, new):

380 dst.__class__ = new.__class__

381 def merge_sequences(dst, new):

382 if copy_descriptions:

383 vars(dst).update(vars(new))

384 self._merge(dst, new, merge_monomers)

385 self._merge(self.sequences, new.sequences, merge_sequences)

386

387 def _replace_column_contents(self, new):

388 """Replace column contents with those of `new` alignment.

389

390 In other words: copy gap patterns from `new` to `self`.

391

392 `self.sequences` and `new.sequences` should have the same contents.

393 """

394 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

395 sequence = row.sequence

396 monomers = filter(None, row)

397 assert len(monomers) == len(filter(None, new_row))

398 self._wipe_row(sequence)

399 non_gap_columns = [column

400 for column, monomer in zip(self.columns, new_row)

401 if monomer

402 ]

403 for monomer, column in zip(monomers, non_gap_columns):

404 column[sequence] = monomer

405

406 def _replace_contents(self, new, copy_descriptions, copy_contents):

407 """Replace alignment contents with those of other alignment."""

408 if copy_contents:

409 self._replace_sequence_contents(new, copy_descriptions)

410 self._replace_column_contents(new)

411

412 def process(self, function, copy_descriptions=True, copy_contents=True):

413 """Apply function to the alignment (or block); inject results back.

414

415 - `function(block)` must return block with same line order.

416 - if `copy_descriptions` is False, ignore new sequence names.

417 - if `copy_contents` is False, don't copy sequence contents too.

418

419 `function` (object) may have attributes `copy_descriptions` and

420 `copy_contents`, which override the same named arguments.

421 """

422 new = function(self)

423 if hasattr(function, 'copy_descriptions'):

424 copy_descriptions = function.copy_descriptions

425 if hasattr(function, 'copy_contents'):

426 copy_contents = function.copy_contents

427 self._replace_contents(new, copy_descriptions, copy_contents)

428

429 def realign(self, function):

430 """Realign self.

431

432 I.e.: apply function to self to produce a new alignment, then update

433 self to have the same gap patterns as the new alignment.

434

435 This is the same as process(function, False, False)

436 """

437 new = function(self)

438 self._replace_column_contents(new)

439

440 class Column(dict):

441 """Column of alignment.

442

443 Column is a dict of { sequence : monomer }.

444

445 For sequences that have gaps in current row, given key is not present in

446 the column.

447 """

448

449 types = base

450 """Mapping of related types. SHOULD be redefined in subclasses."""

451

452 def __hash__(self):

453 """Return hash by identity."""

454 return id(self)

455

456 class Block(Alignment):

457 """Block of alignment.

458

459 Block is an intersection of several rows & columns. (The collections of

460 rows and columns are represented as ordered lists, to retain display order

461 of Alignment or add ability to tweak it). Most of blocks look like

462 rectangular part of alignment if you shuffle alignment rows the right way.

463 """

464

465 alignment = None

466 """Alignment the block belongs to."""

467

468 sequences = ()

469 """List of sequences in block."""

470

471 columns = ()

472 """List of columns in block."""

473

474 @classmethod

475 def from_alignment(cls, alignment, sequences=None, columns=None):

476 """Build new block from alignment.

477

478 If sequences are not given, the block uses all sequences in alignment.

479

480 If columns are not given, the block uses all columns in alignment.

481

482 In both cases we use exactly the list used in alignment, thus, if new

483 sequences or columns are added to alignment, the block tracks this too.

484 """

485 if sequences is None:

486 sequences = alignment.sequences

487 if columns is None:

488 columns = alignment.columns

489 block = cls()

490 block.alignment = alignment

491 block.sequences = sequences

492 block.columns = columns

493 return block

494

495 # vim: set ts=4 sts=4 sw=4 et: