allpy: 364232e42888 allpy/base.py

allpy

view allpy/base.py @ 537:364232e42888

Reimplemented flush(direction) as processor

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Mon, 28 Feb 2011 21:17:41 +0300
parents	5dfb9b9761d5
children	737b52785e5e

line source

1 import sys

2 import re

4 import util

5 import fileio

7 # import this very module as means of having all related classes in one place

8 import base

10 default_gaps = set((".", "-", "~"))

11 """Set of characters to recoginze as gaps when parsing alignment."""

13 class Monomer(object):

14 """Monomer object."""

16 type = None

17 """Either of 'dna', 'rna', 'protein'."""

19 types = base

20 """Mapping of related types. SHOULD be redefined in subclasses."""

22 by_code1 = {}

23 """A mapping from 1-letter code to Monomer subclass."""

25 by_code3 = {}

26 """A mapping from 3-letter code to Monomer subclass."""

28 by_name = {}

29 """A mapping from full monomer name to Monomer subclass."""

31 @classmethod

32 def _subclass(cls, name='', code1='', code3='', is_modified=False):

33 """Create new subclass of Monomer for given monomer type."""

34 class TheMonomer(cls):

35 pass

36 name = name.strip().capitalize()

37 code1 = code1.upper()

38 code3 = code3.upper()

39 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)

40 TheMonomer.name = name

41 TheMonomer.code1 = code1

42 TheMonomer.code3 = code3

43 TheMonomer.is_modified = is_modified

44 if not is_modified:

45 cls.by_code1[code1] = TheMonomer

46 cls.by_code3[code3] = TheMonomer

47 cls.by_name[name] = TheMonomer

48 # We duplicate distinguished long names into Monomer itself, so that we

49 # can use Monomer.from_code3 to create the relevant type of monomer.

50 Monomer.by_code3[code3] = TheMonomer

51 Monomer.by_name[name] = TheMonomer

53 @classmethod

54 def _initialize(cls, codes=None):

55 """Create all relevant subclasses of Monomer."""

56 for code1, is_modified, code3, name in codes:

57 cls._subclass(name, code1, code3, is_modified)

59 @classmethod

60 def from_code1(cls, code1):

61 """Create new monomer from 1-letter code."""

62 return cls.by_code1[code1.upper()]()

64 @classmethod

65 def from_code3(cls, code3):

66 """Create new monomer from 3-letter code."""

67 return cls.by_code3[code3.upper()]()

69 @classmethod

70 def from_name(cls, name):

71 """Create new monomer from full name."""

72 return cls.by_name[name.strip().capitalize()]()

74 def __repr__(self):

75 return '<Monomer %s>' % self.code3

77 def __str__(self):

78 """Returns one-letter code"""

79 return self.code1

81 def __eq__(self, other):

82 """Monomers within same monomer type are compared by code1."""

83 assert self.type == other.type

84 return self.code1 == other.code1

86 def __ne__(self, other):

87 return not (self == other)

89 class Sequence(list):

90 """Sequence of Monomers.

92 This behaves like list of monomer objects. In addition to standard list

93 behaviour, Sequence has the following attributes:

95 * name -- str with the name of the sequence

96 * description -- str with description of the sequence

97 * source -- str denoting source of the sequence

99 Any of them may be empty (i.e. hold empty string)

100 """

101

102 types = base

103 """Mapping of related types. SHOULD be redefined in subclasses."""

104

105 name = ''

106 description = ''

107 source = ''

108

109 @classmethod

110 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

111 """Create sequence from a list of monomer objecst."""

112 result = cls(monomers)

113 if name:

114 result.name = name

115 if description:

116 result.description = description

117 if source:

118 result.source = source

119 return result

120

121 @classmethod

122 def from_string(cls, string, name='', description='', source=''):

123 """Create sequences from string of one-letter codes."""

124 monomer = cls.types.Monomer.from_code1

125 monomers = [monomer(letter) for letter in string]

126 return cls.from_monomers(monomers, name, description, source)

127

128 def __repr__(self):

129 return '<Sequence %s>' % str(self)

130

131 def __str__(self):

132 """Returns sequence of one-letter codes."""

133 return ''.join(monomer.code1 for monomer in self)

134

135 def __hash__(self):

136 """Hash sequence by identity."""

137 return id(self)

138

139 class Alignment(object):

140 """Alignment. It is a list of Columns."""

141

142 types = base

143 """Mapping of related types. SHOULD be redefined in subclasses."""

144

145 sequences = None

146 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

147

148 def __init__(self):

149 """Initialize empty alignment."""

150 self.sequences = []

151 self.columns = []

152

153 # Alignment grow & IO methods

154 # ==============================

155

156 def append_sequence(self, sequence):

157 """Add sequence to alignment. Return self.

158

159 If sequence is too short, pad it with gaps on the right.

160 """

161 self.sequences.append(sequence)

162 self._pad_to_width(len(sequence))

163 for column, monomer in zip(self.columns, sequence):

164 column[sequence] = monomer

165 return self

166

167 def append_row_from_string(self, string,

168 name='', description='', source='', gaps=default_gaps):

169 """Add row from a string of one-letter codes and gaps. Return self."""

170 Sequence = self.types.Sequence

171 without_gaps = util.remove_each(string, gaps)

172 sequence = Sequence.from_string(without_gaps, name, description, source)

173 self._pad_to_width(len(string))

174 non_gap_columns = [column

175 for column, char in zip(self.columns, string)

176 if char not in gaps

177 ]

178 for monomer, column in zip(sequence, non_gap_columns):

179 column[sequence] = monomer

180 self.sequences.append(sequence)

181 return self

182

183 def append_row_with_gaps(self, row, sequence):

184 """Add row from row_as_list representation and sequence. Return self."""

185 self.sequences.append(sequence)

186 self._pad_to_width(len(row))

187 for column, monomer in zip(self.columns, row):

188 if monomer:

189 column[sequence] = monomer

190 return self

191

192 def _pad_to_width(self, n):

193 """Pad alignment with empty columns on the right to width n."""

194 for i in range(len(self.columns), n):

195 self.columns.append(Column())

196

197 def append_file(self, file, format='fasta', gaps=default_gaps):

198 """Append sequences from file to alignment. Return self.

199

200 If sequences in file have gaps (detected as characters belonging to

201 `gaps` set), treat them accordingly.

202 """

203 sequences = []

204 if format == 'fasta':

205 sequences = fileio.FastaIo(file).get_all_strings()

206 elif format == 'msf':

207 sequences = fileio.MsfIo(file).get_all_strings()

208 else:

209 raise Exception("We don't support other formats yet")

210 for (name, description, body) in sequences:

211 self.append_row_from_string(body, name, description, file.name, gaps)

212 return self

213

214 def to_file(self, file, format='fasta', gap='-'):

215 """Write alignment in FASTA file as sequences with gaps."""

216 def char(monomer):

217 if monomer:

218 return monomer.code1

219 return gap

220 if format == 'fasta':

221 io = fileio.FastaIo(file)

222 elif format == 'msf':

223 io = fileio.MsfIo(file)

224 else:

225 raise Exception("We don't support other formats yet")

226 for row in self.rows_as_lists():

227 seq = row.sequence

228 line = "".join(map(char, row))

229 io.save_string(line, seq.name, seq.description)

230

231 # Data access methods for alignment

232 # =================================

233

234 def rows(self):

235 """Return list of rows (temporary objects) in alignment.

236

237 Each row is a dictionary of { column : monomer }.

238

239 For gap positions there is no key for the column in row.

240

241 Each row has attribute `sequence` pointing to the sequence the row is

242 describing.

243

244 Modifications of row have no effect on the alignment.

245 """

246 # For now, the function returns a list rather than iterator.

247 # It is yet to see, whether memory performance here becomes critical,

248 # or is random access useful.

249 rows = []

250 for sequence in self.sequences:

251 row = util.UserDict()

252 row.sequence = sequence

253 for column in self.columns:

254 if sequence in column:

255 row[column] = column[sequence]

256 rows.append(row)

257 return rows

258

259 def rows_as_lists(self):

260 """Return list of rows (temporary objects) in alignment.

261

262 Each row here is a list of either monomer or None (for gaps).

263

264 Each row has attribute `sequence` pointing to the sequence of row.

265

266 Modifications of row have no effect on the alignment.

267 """

268 rows = []

269 for sequence in self.sequences:

270 row = util.UserList()

271 row.sequence = sequence

272 for column in self.columns:

273 row.append(column.get(sequence))

274 rows.append(row)

275 return rows

276

277 def columns_as_lists(self):

278 """Return list of columns (temorary objects) in alignment.

279

280 Each column here is a list of either monomer or None (for gaps).

281

282 Items of column are sorted in the same way as alignment.sequences.

283

284 Modifications of column have no effect on the alignment.

285 """

286 columns = []

287 for column in self.columns:

288 col = []

289 for sequence in self.sequences:

290 col.append(column.get(sequence))

291 columns.append(col)

292 return columns

293

294 # Alignment / Block editing methods

295 # =================================

296

297 def flush(self, whence='left'):

298 """Remove all gaps from alignment and flush results to one side.

299

300 `whence` must be one of 'left', 'right' or 'center'

301 """

302 if whence == 'left':

303 from processors import Left as Flush

304 elif whence == 'right':

305 from processors import Right as Flush

306 elif whence == 'center':

307 from processors import Center as Flush

308 else:

309 raise AssertionError, "Whence must be left, right or center"

310 self.realign(Flush())

311

312 def remove_gap_columns(self):

313 """Remove all empty columns."""

314 for n, column in reversed(enumerate(self.columns)):

315 if column == {}:

316 self.columns[n:n+1] = []

317

318 def _wipe_row(self, sequence):

319 """Turn all row positions into gaps (but keep sequences intact)."""

320 for column in self.columns:

321 if sequence in column:

322 del column[sequence]

323

324 def _merge(self, dst, new, merge):

325 """Replace contents of `dst` with those of `new`.

326

327 Replace contents of elements using function `merge(dst_el, new_le)`.

328 """

329 for el, new_el in zip(dst, new):

330 merge(el, new_el)

331 dst[len(dst):] = new[len(dst):]

332 del dst[len(new):]

333

334 def _replace_sequence_contents(self, new, copy_descriptions):

335 """Replace contents of sequences with those of `new` alignment."""

336 # XXX: we manually copy sequence contents here

337 # XXX: we only copy, overlapping parts and link to the rest

338 def merge_monomers(dst, new):

339 dst.__class__ = new.__class__

340 def merge_sequences(dst, new):

341 if copy_descriptions:

342 vars(dst).update(vars(new))

343 self._merge(dst, new, merge_monomers)

344 self._merge(self.sequences, new.sequences, merge_sequences)

345

346 def _replace_column_contents(self, new):

347 """Replace column contents with those of `new` alignment.

348

349 In other words: copy gap patterns from `new` to `self`.

350

351 `self.sequences` and `new.sequences` should have the same contents.

352 """

353 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

354 sequence = row.sequence

355 monomers = filter(None, row)

356 assert len(monomers) == len(filter(None, new_row))

357 self._wipe_row(sequence)

358 non_gap_columns = [column

359 for column, monomer in zip(self.columns, new_row)

360 if monomer

361 ]

362 for monomer, column in zip(monomers, non_gap_columns):

363 column[sequence] = monomer

364

365 def _replace_contents(self, new, copy_descriptions, copy_contents):

366 """Replace alignment contents with those of other alignment."""

367 if copy_contents:

368 self._replace_sequence_contents(new, copy_descriptions)

369 self._replace_column_contents(new)

370

371 def process(self, function, copy_descriptions=True, copy_contents=True):

372 """Apply function to the alignment (or block); inject results back.

373

374 - `function(block)` must return block with same line order.

375 - if `copy_descriptions` is False, ignore new sequence names.

376 - if `copy_contents` is False, don't copy sequence contents too.

377

378 `function` (object) may have attributes `copy_descriptions` and

379 `copy_contents`, which override the same named arguments.

380 """

381 new = function(self)

382 if hasattr(function, 'copy_descriptions'):

383 copy_descriptions = function.copy_descriptions

384 if hasattr(function, 'copy_contents'):

385 copy_contents = function.copy_contents

386 self._replace_contents(new, copy_descriptions, copy_contents)

387

388 def realign(self, function):

389 """Realign self.

390

391 I.e.: apply function to self to produce a new alignment, then update

392 self to have the same gap patterns as the new alignment.

393

394 This is the same as process(function, False, False)

395 """

396 new = function(self)

397 self._replace_column_contents(new)

398

399 class Column(dict):

400 """Column of alignment.

401

402 Column is a dict of { sequence : monomer }.

403

404 For sequences that have gaps in current row, given key is not present in

405 the column.

406 """

407

408 types = base

409 """Mapping of related types. SHOULD be redefined in subclasses."""

410

411 def __hash__(self):

412 """Return hash by identity."""

413 return id(self)

414

415 class Block(Alignment):

416 """Block of alignment.

417

418 Block is an intersection of several rows & columns. (The collections of

419 rows and columns are represented as ordered lists, to retain display order

420 of Alignment or add ability to tweak it). Most of blocks look like

421 rectangular part of alignment if you shuffle alignment rows the right way.

422 """

423

424 alignment = None

425 """Alignment the block belongs to."""

426

427 sequences = ()

428 """List of sequences in block."""

429

430 columns = ()

431 """List of columns in block."""

432

433 @classmethod

434 def from_alignment(cls, alignment, sequences=None, columns=None):

435 """Build new block from alignment.

436

437 If sequences are not given, the block uses all sequences in alignment.

438

439 If columns are not given, the block uses all columns in alignment.

440

441 In both cases we use exactly the list used in alignment, thus, if new

442 sequences or columns are added to alignment, the block tracks this too.

443 """

444 if sequences is None:

445 sequences = alignment.sequences

446 if columns is None:

447 columns = alignment.columns

448 block = cls()

449 block.alignment = alignment

450 block.sequences = sequences

451 block.columns = columns

452 return block

453

454 # vim: set ts=4 sts=4 sw=4 et: