Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/aae821828b03/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:05:14 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 260:aae821828b03

Moved contents of allpy._monomer to allpy.base
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Tue, 14 Dec 2010 20:51:53 +0300
parents 117b03ee9e8f
children d60628e29b24
rev   line source
bnagaev@249 1 from fasta import save_fasta
me@260 2 import data.codes
me@260 3
me@260 4 class MonomerType(object):
me@260 5 """Class of monomer types.
me@260 6
me@260 7 Each MonomerType object represents a known monomer type, e.g. Valine,
me@260 8 and is referenced to by each instance of monomer in a given sequence.
me@260 9
me@260 10 - `name`: full name of monomer type
me@260 11 - `code1`: one-letter code
me@260 12 - `code3`: three-letter code
me@260 13 - `is_modified`: either of True or False
me@260 14
me@260 15 class atributes:
me@260 16
me@260 17 - `by_code1`: a mapping from one-letter code to MonomerType object
me@260 18 - `by_code3`: a mapping from three-letter code to MonomerType object
me@260 19 - `by_name`: a mapping from monomer name to MonomerType object
me@260 20 - `instance_type`: class of Monomer objects to use when creating new
me@260 21 objects; this must be redefined in descendent classes
me@260 22
me@260 23 All of the class attributes MUST be redefined when subclassing.
me@260 24 """
me@260 25
me@260 26 by_code1 = {}
me@260 27 by_code3 = {}
me@260 28 by_name = {}
me@260 29 instance_type = None
me@260 30
me@260 31 def __init__(self, name="", code1="", code3="", is_modified=False):
me@260 32 self.name = name.capitalize()
me@260 33 self.code1 = code1.upper()
me@260 34 self.code3 = code3.upper()
me@260 35 self.is_modified = bool(is_modified)
me@260 36 if not is_modified:
me@260 37 self.by_code1[self.code1] = self
me@260 38 self.by_code3[code3] = self
me@260 39 self.by_name[name] = self
me@260 40 # We duplicate distinguished long names into MonomerType itself,
me@260 41 # so that we can use MonomerType.from_code3 to create the relevant
me@260 42 # type of monomer.
me@260 43 MonomerType.by_code3[code3] = self
me@260 44 MonomerType.by_name[name] = self
me@260 45
me@260 46 @classmethod
me@260 47 def _initialize(cls, type_letter, codes=data.codes.codes):
me@260 48 """Create all relevant instances of MonomerType.
me@260 49
me@260 50 `type_letter` is either of:
me@260 51
me@260 52 - 'p' for protein
me@260 53 - 'd' for DNA
me@260 54 - 'r' for RNA
me@260 55
me@260 56 `codes` is a table of monomer codes
me@260 57 """
me@260 58 for type, code1, is_modified, code3, name in codes:
me@260 59 if type == type_letter:
me@260 60 cls(name, code1, code3, is_modified)
me@260 61
me@260 62 @classmethod
me@260 63 def from_code1(cls, code1):
me@260 64 """Return monomer type by one-letter code."""
me@260 65 return cls.by_code1[code1.upper()]
me@260 66
me@260 67 @classmethod
me@260 68 def from_code3(cls, code3):
me@260 69 """Return monomer type by three-letter code."""
me@260 70 return cls.by_code3[code3.upper()]
me@260 71
me@260 72 @classmethod
me@260 73 def from_name(cls, name):
me@260 74 """Return monomer type by name."""
me@260 75 return cls.by_name[name.capitalize()]
me@260 76
me@260 77 def instance(self):
me@260 78 """Create a new monomer of given type."""
me@260 79 return self.instance_type(self)
me@260 80
me@260 81 def __eq__(self, other):
me@260 82 if hasattr(other, "type"):
me@260 83 return self is other.type
me@260 84 return self is other
me@260 85
me@260 86 class Monomer(object):
me@260 87 """Monomer object.
me@260 88
me@260 89 attributes:
me@260 90
me@260 91 - `type`: type of monomer (a MonomerType object)
me@260 92
me@260 93 class attribute `monomer_type` is MonomerType or either of it's subclasses,
me@260 94 it is used when creating new monomers. It MUST be redefined when subclassing Monomer.
me@260 95 """
me@260 96 monomer_type = MonomerType
me@260 97
me@260 98 def __init__(self, type):
me@260 99 self.type = type
me@260 100
me@260 101 @classmethod
me@260 102 def from_code1(cls, code1):
me@260 103 return cls(cls.monomer_type.by_code1[code1.upper()])
me@260 104
me@260 105 @classmethod
me@260 106 def from_code3(cls, code3):
me@260 107 return cls(cls.monomer_type.by_code3[code3.upper()])
me@260 108
me@260 109 @classmethod
me@260 110 def from_name(cls, name):
me@260 111 return cls(cls.monomer_type.by_name[name.capitalize()])
me@260 112
me@260 113 def __eq__(self, other):
me@260 114 if hasattr(other, "type"):
me@260 115 return self.type is other.type
me@260 116 return self.type is other
bnagaev@239 117
bnagaev@239 118 class Sequence(list):
bnagaev@243 119 """ Sequence of Monomers
bnagaev@243 120
bnagaev@243 121 list of monomer objects
bnagaev@243 122
bnagaev@243 123 Mandatory data:
bnagaev@243 124 * name -- str with the name of sequence
bnagaev@243 125 * description -- str with description of the sequence
bnagaev@243 126 """
bnagaev@243 127 pass
bnagaev@243 128
bnagaev@249 129 class Alignment(dict):
bnagaev@249 130 """ Alignment
bnagaev@249 131
bnagaev@249 132 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
bnagaev@249 133 keys are the Sequence objects, values are the lists, which
bnagaev@249 134 contain monomers of those sequences or None for gaps in the
bnagaev@249 135 corresponding sequence of alignment
bnagaev@249 136 """
bnagaev@249 137 # _sequences -- list of Sequence objects. Sequences don't contain gaps
bnagaev@249 138 # - see sequence.py module
bnagaev@249 139
bnagaev@249 140 def __init__(self, *args):
bnagaev@249 141 """overloaded constructor
bnagaev@249 142
bnagaev@249 143 Alignment() -> new empty Alignment
bnagaev@249 144 Alignment(sequences, body) -> new Alignment with sequences and
bnagaev@249 145 body initialized from arguments
bnagaev@249 146 Alignment(fasta_file) -> new Alignment, read body and sequences
bnagaev@249 147 from fasta file
bnagaev@249 148
bnagaev@249 149 """
bnagaev@249 150 if len(args)>1:#overloaded constructor
bnagaev@249 151 self.sequences=args[0]
bnagaev@249 152 self.body=args[1]
bnagaev@249 153 elif len(args)==0:
bnagaev@249 154 self.sequences=[]
bnagaev@249 155 self.body={}
bnagaev@249 156 else:
bnagaev@249 157 self.sequences, self.body = Alignment.from_fasta(args[0])
bnagaev@249 158
bnagaev@249 159 def length(self):
bnagaev@249 160 """ Returns width, ie length of each sequence with gaps """
bnagaev@249 161 return max([len(line) for line in self.body.values()])
bnagaev@249 162
bnagaev@249 163 def height(self):
bnagaev@249 164 """ The number of sequences in alignment (it's thickness). """
bnagaev@249 165 return len(self.body)
bnagaev@249 166
bnagaev@249 167 def identity(self):
bnagaev@249 168 """ Calculate the identity of alignment positions for colouring.
bnagaev@249 169
bnagaev@249 170 For every (row, column) in alignment the percentage of the exactly
bnagaev@249 171 same residue in the same column in the alignment is calculated.
bnagaev@249 172 The data structure is just like the Alignment.body, but istead of
bnagaev@249 173 monomers it contains float percentages.
bnagaev@249 174 """
bnagaev@249 175 # Oh, God, that's awful! Absolutely not understandable.
bnagaev@249 176 # First, calculate percentages of amino acids in every column
bnagaev@249 177 contribution = 1.0 / len(self.sequences)
bnagaev@249 178 all_columns = []
bnagaev@249 179 for position in range(len(self)):
bnagaev@249 180 column_percentage = {}
bnagaev@249 181 for seq in self.body:
bnagaev@249 182 if self.body[seq][position] is not None:
bnagaev@249 183 aa = self.body[seq][position].code
bnagaev@249 184 else:
bnagaev@249 185 aa = None
bnagaev@249 186 if aa in allpy.data.amino_acids:
bnagaev@249 187 if aa in column_percentage.keys():
bnagaev@249 188 column_percentage[aa] += contribution
bnagaev@249 189 else:
bnagaev@249 190 column_percentage[aa] = contribution
bnagaev@249 191 all_columns.append(column_percentage)
bnagaev@249 192 # Second, map these percentages onto the alignment
bnagaev@249 193 self.identity_percentages = {}
bnagaev@249 194 for seq in self.sequences:
bnagaev@249 195 self.identity_percentages[seq] = []
bnagaev@249 196 for seq in self.identity_percentages:
bnagaev@249 197 line = self.identity_percentages[seq]
bnagaev@249 198 for position in range(len(self)):
bnagaev@249 199 if self.body[seq][position] is not None:
bnagaev@249 200 aa = self.body[seq][position].code
bnagaev@249 201 else:
bnagaev@249 202 aa = None
bnagaev@249 203 line.append(all_columns[position].get(aa))
bnagaev@249 204 return self.identity_percentages
bnagaev@249 205
bnagaev@249 206 @staticmethod
bnagaev@249 207 def from_fasta(file, monomer_kind=AminoAcidType):
bnagaev@249 208 """ Import data from fasta file
bnagaev@249 209
bnagaev@249 210 monomer_kind is class, inherited from MonomerType
bnagaev@249 211
bnagaev@249 212 >>> import alignment
bnagaev@249 213 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
bnagaev@249 214 """
bnagaev@249 215 import re
bnagaev@249 216
bnagaev@249 217 sequences = []
bnagaev@249 218 body = {}
bnagaev@249 219
bnagaev@249 220 raw_sequences = file.read().split(">")
bnagaev@249 221 if len(raw_sequences) <= 1:
bnagaev@249 222 raise Exception("Wrong format of fasta-file %s" % file.name)
bnagaev@249 223
bnagaev@249 224 raw_sequences = raw_sequences[1:] #ignore everything before the first >
bnagaev@249 225 for raw in raw_sequences:
bnagaev@249 226 parsed_raw_sequence = raw.split("\n")
bnagaev@249 227 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
bnagaev@249 228 name_and_description = parsed_raw_sequence[0]
bnagaev@249 229 name_and_description = name_and_description.split(" ",1)
bnagaev@249 230 if len(name_and_description) == 2:
bnagaev@249 231 name, description = name_and_description
bnagaev@249 232 elif len(name_and_description) == 1:
bnagaev@249 233 #if there is description
bnagaev@249 234 name = name_and_description[0]
bnagaev@249 235 description = ''
bnagaev@249 236 else:
bnagaev@249 237 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 238 {'name': name, 'file': file.name})
bnagaev@249 239
bnagaev@249 240 if len(parsed_raw_sequence) <= 1:
bnagaev@249 241 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
bnagaev@249 242 {'name': name, 'file': file.name})
bnagaev@249 243 string = ""
bnagaev@249 244 for piece in parsed_raw_sequence[1:]:
bnagaev@249 245 piece_without_whitespace_chars = re.sub("\s", "", piece)
bnagaev@249 246 string += piece_without_whitespace_chars
bnagaev@249 247 monomers = [] #convert into Monomer objects
bnagaev@249 248 body_list = [] #create the respective list in body dict
bnagaev@249 249 for current_monomer in string:
bnagaev@249 250 if current_monomer not in ["-", ".", "~"]:
bnagaev@249 251 monomers.append(monomer_kind.from_code1(current_monomer).instance())
bnagaev@249 252 body_list.append(monomers[-1])
bnagaev@249 253 else:
bnagaev@249 254 body_list.append(None)
bnagaev@249 255 s = sequence.Sequence(monomers, name, description)
bnagaev@249 256 sequences.append(s)
bnagaev@249 257 body[s] = body_list
bnagaev@249 258 return sequences, body
bnagaev@249 259
bnagaev@249 260 @staticmethod
bnagaev@249 261 def from_sequences(*sequences):
bnagaev@249 262 """ Constructs new alignment from sequences
bnagaev@249 263
bnagaev@249 264 Add None's to right end to make equal lengthes of alignment sequences
bnagaev@249 265 """
bnagaev@249 266 alignment = Alignment()
bnagaev@249 267 alignment.sequences = sequences
bnagaev@249 268 max_length = max(len(sequence) for sequence in sequences)
bnagaev@249 269 for sequence in sequences:
bnagaev@249 270 gaps_count = max_length - len(sequence)
bnagaev@249 271 alignment.body[sequence] = sequence.monomers + [None] * gaps_count
bnagaev@249 272 return alignment
bnagaev@249 273
bnagaev@249 274 def save_fasta(self, out_file, long_line=70, gap='-'):
bnagaev@249 275 """ Saves alignment to given file
bnagaev@249 276
bnagaev@249 277 Splits long lines to substrings of length=long_line
bnagaev@249 278 To prevent this, set long_line=None
bnagaev@249 279 """
bnagaev@249 280 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
bnagaev@249 281
bnagaev@249 282 def muscle_align(self):
bnagaev@249 283 """ Simple align ths alignment using sequences (muscle)
bnagaev@249 284
bnagaev@249 285 uses old Monomers and Sequences objects
bnagaev@249 286 """
bnagaev@249 287 tmp_file = NamedTemporaryFile(delete=False)
bnagaev@249 288 self.save_fasta(tmp_file)
bnagaev@249 289 tmp_file.close()
bnagaev@249 290 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})
bnagaev@249 291 sequences, body = Alignment.from_fasta(open(tmp_file.name))
bnagaev@249 292 for sequence in self.sequences:
bnagaev@249 293 try:
bnagaev@249 294 new_sequence = [i for i in sequences if sequence==i][0]
bnagaev@249 295 except:
bnagaev@249 296 raise Exception("Align: Cann't find sequence %s in muscle output" % \
bnagaev@249 297 sequence.name)
bnagaev@249 298 old_monomers = iter(sequence.monomers)
bnagaev@249 299 self.body[sequence] = []
bnagaev@249 300 for monomer in body[new_sequence]:
bnagaev@249 301 if not monomer:
bnagaev@249 302 self.body[sequence].append(monomer)
bnagaev@249 303 else:
bnagaev@249 304 old_monomer = old_monomers.next()
bnagaev@249 305 if monomer != old_monomer:
bnagaev@249 306 raise Exception("Align: alignment errors")
bnagaev@249 307 self.body[sequence].append(old_monomer)
bnagaev@249 308 os.unlink(tmp_file.name)
bnagaev@249 309
bnagaev@249 310 def column(self, sequence=None, sequences=None, original=None):
bnagaev@249 311 """ returns list of columns of alignment
bnagaev@249 312
bnagaev@249 313 sequence or sequences:
bnagaev@249 314 if sequence is given, then column is (original_monomer, monomer)
bnagaev@249 315 if sequences is given, then column is (original_monomer, {sequence: monomer})
bnagaev@249 316 if both of them are given, it is an error
bnagaev@249 317 original (Sequence type):
bnagaev@249 318 if given, this filters only columns represented by original sequence
bnagaev@249 319 """
bnagaev@249 320 if sequence and sequences:
bnagaev@249 321 raise Exception("Wrong usage. read help")
bnagaev@249 322 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
bnagaev@249 323 alignment = self.body.items()
bnagaev@249 324 alignment.sort(key=lambda i: indexes[i[0]])
bnagaev@249 325 alignment = [monomers for seq, monomers in alignment]
bnagaev@249 326 for column in zip(*alignment):
bnagaev@249 327 if not original or column[indexes[original]]:
bnagaev@249 328 if sequence:
bnagaev@249 329 yield (column[indexes[original]], column[indexes[sequence]])
bnagaev@249 330 else:
bnagaev@249 331 yield (column[indexes[original]],
bnagaev@249 332 dict([(s, column[indexes[s]]) for s in sequences]))
bnagaev@249 333
bnagaev@249 334 def secstr(self, sequence, pdb_chain, gap='-'):
bnagaev@249 335 """ Returns string representing secondary structure """
bnagaev@249 336 return ''.join([
bnagaev@249 337 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)
bnagaev@249 338 for m in self.body[sequence]])
bnagaev@249 339
bnagaev@249 340 class Block(object):
bnagaev@239 341 """ """
bnagaev@239 342 pass
bnagaev@239 343
me@260 344 # vim: set ts=4 sts=4 sw=4 et: