allpy: 1a8ddc6e2eee allpy/base.py

allpy

view allpy/base.py @ 536:1a8ddc6e2eee

mkcodes.py, codes.py: remove entries with 1-letter code longer than 1 letter

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Mon, 28 Feb 2011 19:24:28 +0300
parents	23af8cf15344
children	364232e42888

line source

1 import sys

2 import re

4 import util

5 import fileio

7 # import this very module as means of having all related classes in one place

8 import base

10 default_gaps = set((".", "-", "~"))

11 """Set of characters to recoginze as gaps when parsing alignment."""

13 class Monomer(object):

14 """Monomer object."""

16 type = None

17 """Either of 'dna', 'rna', 'protein'."""

19 types = base

20 """Mapping of related types. SHOULD be redefined in subclasses."""

22 by_code1 = {}

23 """A mapping from 1-letter code to Monomer subclass."""

25 by_code3 = {}

26 """A mapping from 3-letter code to Monomer subclass."""

28 by_name = {}

29 """A mapping from full monomer name to Monomer subclass."""

31 @classmethod

32 def _subclass(cls, name='', code1='', code3='', is_modified=False):

33 """Create new subclass of Monomer for given monomer type."""

34 class TheMonomer(cls):

35 pass

36 name = name.strip().capitalize()

37 code1 = code1.upper()

38 code3 = code3.upper()

39 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)

40 TheMonomer.name = name

41 TheMonomer.code1 = code1

42 TheMonomer.code3 = code3

43 TheMonomer.is_modified = is_modified

44 if not is_modified:

45 cls.by_code1[code1] = TheMonomer

46 cls.by_code3[code3] = TheMonomer

47 cls.by_name[name] = TheMonomer

48 # We duplicate distinguished long names into Monomer itself, so that we

49 # can use Monomer.from_code3 to create the relevant type of monomer.

50 Monomer.by_code3[code3] = TheMonomer

51 Monomer.by_name[name] = TheMonomer

53 @classmethod

54 def _initialize(cls, codes=None):

55 """Create all relevant subclasses of Monomer."""

56 for code1, is_modified, code3, name in codes:

57 cls._subclass(name, code1, code3, is_modified)

59 @classmethod

60 def from_code1(cls, code1):

61 """Create new monomer from 1-letter code."""

62 return cls.by_code1[code1.upper()]()

64 @classmethod

65 def from_code3(cls, code3):

66 """Create new monomer from 3-letter code."""

67 return cls.by_code3[code3.upper()]()

69 @classmethod

70 def from_name(cls, name):

71 """Create new monomer from full name."""

72 return cls.by_name[name.strip().capitalize()]()

74 def __repr__(self):

75 return '<Monomer %s>' % self.code3

77 def __str__(self):

78 """Returns one-letter code"""

79 return self.code1

81 def __eq__(self, other):

82 """Monomers within same monomer type are compared by code1."""

83 assert self.type == other.type

84 return self.code1 == other.code1

86 def __ne__(self, other):

87 return not (self == other)

89 class Sequence(list):

90 """Sequence of Monomers.

92 This behaves like list of monomer objects. In addition to standard list

93 behaviour, Sequence has the following attributes:

95 * name -- str with the name of the sequence

96 * description -- str with description of the sequence

97 * source -- str denoting source of the sequence

99 Any of them may be empty (i.e. hold empty string)

100 """

101

102 types = base

103 """Mapping of related types. SHOULD be redefined in subclasses."""

104

105 name = ''

106 description = ''

107 source = ''

108

109 @classmethod

110 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

111 """Create sequence from a list of monomer objecst."""

112 result = cls(monomers)

113 if name:

114 result.name = name

115 if description:

116 result.description = description

117 if source:

118 result.source = source

119 return result

120

121 @classmethod

122 def from_string(cls, string, name='', description='', source=''):

123 """Create sequences from string of one-letter codes."""

124 monomer = cls.types.Monomer.from_code1

125 monomers = [monomer(letter) for letter in string]

126 return cls.from_monomers(monomers, name, description, source)

127

128 def __repr__(self):

129 return '<Sequence %s>' % str(self)

130

131 def __str__(self):

132 """Returns sequence of one-letter codes."""

133 return ''.join(monomer.code1 for monomer in self)

134

135 def __hash__(self):

136 """Hash sequence by identity."""

137 return id(self)

138

139 class Alignment(object):

140 """Alignment. It is a list of Columns."""

141

142 types = base

143 """Mapping of related types. SHOULD be redefined in subclasses."""

144

145 sequences = None

146 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

147

148 def __init__(self):

149 """Initialize empty alignment."""

150 self.sequences = []

151 self.columns = []

152

153 # Alignment grow & IO methods

154 # ==============================

155

156 def append_sequence(self, sequence):

157 """Add sequence to alignment. Return self.

158

159 If sequence is too short, pad it with gaps on the right.

160 """

161 self.sequences.append(sequence)

162 self._pad_to_width(len(sequence))

163 for column, monomer in zip(self.columns, sequence):

164 column[sequence] = monomer

165 return self

166

167 def append_row_from_string(self, string,

168 name='', description='', source='', gaps=default_gaps):

169 """Add row from a string of one-letter codes and gaps. Return self."""

170 Sequence = self.types.Sequence

171 without_gaps = util.remove_each(string, gaps)

172 sequence = Sequence.from_string(without_gaps, name, description, source)

173 self._pad_to_width(len(string))

174 non_gap_columns = [column

175 for column, char in zip(self.columns, string)

176 if char not in gaps

177 ]

178 for monomer, column in zip(sequence, non_gap_columns):

179 column[sequence] = monomer

180 self.sequences.append(sequence)

181 return self

182

183 def _pad_to_width(self, n):

184 """Pad alignment with empty columns on the right to width n."""

185 for i in range(len(self.columns), n):

186 self.columns.append(Column())

187

188 def append_file(self, file, format='fasta', gaps=default_gaps):

189 """Append sequences from file to alignment. Return self.

190

191 If sequences in file have gaps (detected as characters belonging to

192 `gaps` set), treat them accordingly.

193 """

194 sequences = []

195 if format == 'fasta':

196 sequences = fileio.FastaIo(file).get_all_strings()

197 elif format == 'msf':

198 sequences = fileio.MsfIo(file).get_all_strings()

199 else:

200 raise Exception("We don't support other formats yet")

201 for (name, description, body) in sequences:

202 self.append_row_from_string(body, name, description, file.name, gaps)

203 return self

204

205 def to_file(self, file, format='fasta', gap='-'):

206 """Write alignment in FASTA file as sequences with gaps."""

207 def char(monomer):

208 if monomer:

209 return monomer.code1

210 return gap

211 if format == 'fasta':

212 io = fileio.FastaIo(file)

213 elif format == 'msf':

214 io = fileio.MsfIo(file)

215 else:

216 raise Exception("We don't support other formats yet")

217 for row in self.rows_as_lists():

218 seq = row.sequence

219 line = "".join(map(char, row))

220 io.save_string(line, seq.name, seq.description)

221

222 # Data access methods for alignment

223 # =================================

224

225 def rows(self):

226 """Return list of rows (temporary objects) in alignment.

227

228 Each row is a dictionary of { column : monomer }.

229

230 For gap positions there is no key for the column in row.

231

232 Each row has attribute `sequence` pointing to the sequence the row is

233 describing.

234

235 Modifications of row have no effect on the alignment.

236 """

237 # For now, the function returns a list rather than iterator.

238 # It is yet to see, whether memory performance here becomes critical,

239 # or is random access useful.

240 rows = []

241 for sequence in self.sequences:

242 row = util.UserDict()

243 row.sequence = sequence

244 for column in self.columns:

245 if sequence in column:

246 row[column] = column[sequence]

247 rows.append(row)

248 return rows

249

250 def rows_as_lists(self):

251 """Return list of rows (temporary objects) in alignment.

252

253 Each row here is a list of either monomer or None (for gaps).

254

255 Each row has attribute `sequence` pointing to the sequence of row.

256

257 Modifications of row have no effect on the alignment.

258 """

259 rows = []

260 for sequence in self.sequences:

261 row = util.UserList()

262 row.sequence = sequence

263 for column in self.columns:

264 row.append(column.get(sequence))

265 rows.append(row)

266 return rows

267

268 def columns_as_lists(self):

269 """Return list of columns (temorary objects) in alignment.

270

271 Each column here is a list of either monomer or None (for gaps).

272

273 Items of column are sorted in the same way as alignment.sequences.

274

275 Modifications of column have no effect on the alignment.

276 """

277 columns = []

278 for column in self.columns:

279 col = []

280 for sequence in self.sequences:

281 col.append(column.get(sequence))

282 columns.append(col)

283 return columns

284

285 # Alignment / Block editing methods

286 # =================================

287

288 def _flush_row(self, row, whence='left'):

289 """Helper for `flush`: flush to one side all monomers in one row."""

290 row = filter(None, row)

291 padding = [None] * len(self.columns)

292 if whence == 'left':

293 return row + padding

294 if whence == 'right':

295 return padding + row

296 if whence == 'center':

297 pad_len = (len(self.columns) - len(row)) // 2

298 # vvv fix padding for case when length is odd: better have more

299 pad_len += len(self.columns) - 2 * pad_len

300 padding = [None] * pad_len

301 return padding + row + padding

302 assert True, "whence must be either 'left' or 'right' or 'center'"

303

304 def flush(self, whence='left'):

305 """Remove all gaps from alignment and flush results to one side.

306

307 `whence` must be one of 'left', 'right' or 'center'

308 """

309 for row in self.rows_as_lists():

310 sequence = row.sequence

311 row = self._flush_row(row, whence)

312 for monomer, column in zip(row, self.columns):

313 if monomer:

314 column[sequence] = monomer

315 elif sequence in column:

316 del column[sequence]

317

318 def remove_gap_columns(self):

319 """Remove all empty columns."""

320 for n, column in reversed(enumerate(self.columns)):

321 if column == {}:

322 self.columns[n:n+1] = []

323

324 def _wipe_row(self, sequence):

325 """Turn all row positions into gaps (but keep sequences intact)."""

326 for column in self.columns:

327 if sequence in column:

328 del column[sequence]

329

330 def _merge(self, dst, new, merge):

331 """Replace contents of `dst` with those of `new`.

332

333 Replace contents of elements using function `merge(dst_el, new_le)`.

334 """

335 for el, new_el in zip(dst, new):

336 merge(el, new_el)

337 dst[len(dst):] = new[len(dst):]

338 del dst[len(new):]

339

340 def _replace_sequence_contents(self, new, copy_descriptions):

341 """Replace contents of sequences with those of `new` alignment."""

342 # XXX: we manually copy sequence contents here

343 # XXX: we only copy, overlapping parts and link to the rest

344 def merge_monomers(dst, new):

345 dst.__class__ = new.__class__

346 def merge_sequences(dst, new):

347 if copy_descriptions:

348 vars(dst).update(vars(new))

349 self._merge(dst, new, merge_monomers)

350 self._merge(self.sequences, new.sequences, merge_sequences)

351

352 def _replace_column_contents(self, new):

353 """Replace column contents with those of `new` alignment.

354

355 In other words: copy gap patterns from `new` to `self`.

356

357 `self.sequences` and `new.sequences` should have the same contents.

358 """

359 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

360 sequence = row.sequence

361 monomers = filter(None, row)

362 assert len(monomers) == len(filter(None, new_row))

363 self._wipe_row(sequence)

364 non_gap_columns = [column

365 for column, monomer in zip(self.columns, new_row)

366 if monomer

367 ]

368 for monomer, column in zip(monomers, non_gap_columns):

369 column[sequence] = monomer

370

371 def _replace_contents(self, new, copy_descriptions, copy_contents):

372 """Replace alignment contents with those of other alignment."""

373 if copy_contents:

374 self._replace_sequence_contents(new, copy_descriptions)

375 self._replace_column_contents(new)

376

377 def process(self, function, copy_descriptions=True, copy_contents=True):

378 """Apply function to the alignment (or block); inject results back.

379

380 - `function(block)` must return block with same line order.

381 - if `copy_descriptions` is False, ignore new sequence names.

382 - if `copy_contents` is False, don't copy sequence contents too.

383

384 `function` (object) may have attributes `copy_descriptions` and

385 `copy_contents`, which override the same named arguments.

386 """

387 new = function(self)

388 if hasattr(function, 'copy_descriptions'):

389 copy_descriptions = function.copy_descriptions

390 if hasattr(function, 'copy_contents'):

391 copy_contents = function.copy_contents

392 self._replace_contents(new, copy_descriptions, copy_contents)

393

394 class Column(dict):

395 """Column of alignment.

396

397 Column is a dict of { sequence : monomer }.

398

399 For sequences that have gaps in current row, given key is not present in

400 the column.

401 """

402

403 types = base

404 """Mapping of related types. SHOULD be redefined in subclasses."""

405

406 def __hash__(self):

407 """Return hash by identity."""

408 return id(self)

409

410 class Block(Alignment):

411 """Block of alignment.

412

413 Block is an intersection of several rows & columns. (The collections of

414 rows and columns are represented as ordered lists, to retain display order

415 of Alignment or add ability to tweak it). Most of blocks look like

416 rectangular part of alignment if you shuffle alignment rows the right way.

417 """

418

419 alignment = None

420 """Alignment the block belongs to."""

421

422 sequences = ()

423 """List of sequences in block."""

424

425 columns = ()

426 """List of columns in block."""

427

428 @classmethod

429 def from_alignment(cls, alignment, sequences=None, columns=None):

430 """Build new block from alignment.

431

432 If sequences are not given, the block uses all sequences in alignment.

433

434 If columns are not given, the block uses all columns in alignment.

435

436 In both cases we use exactly the list used in alignment, thus, if new

437 sequences or columns are added to alignment, the block tracks this too.

438 """

439 if sequences is None:

440 sequences = alignment.sequences

441 if columns is None:

442 columns = alignment.columns

443 block = cls()

444 block.alignment = alignment

445 block.sequences = sequences

446 block.columns = columns

447 return block

448

449 # vim: set ts=4 sts=4 sw=4 et: