allpy: 9369dbad919d allpy/base.py

allpy

view allpy/base.py @ 251:9369dbad919d

Incompatible changes to Monomer interfaces. This branch does not work! - (!!) only changed allpy._monomer, not uses - (!!) removed (temporarily) classes for specific monomer types (DNAMonomer, etc) - refurbished allpy.data.AAcodes to allpy.data.codes with much cleaner interface - refurbished allpy._monomer for simplicity and more friendly interface Now it will (someday) be possible to say: a = Monomer.from_name("alanine") b = protein.Monomer.from_code1("a") c = protein.MonomerType.from_code3("ala") d = dna.Monomer.from_code3("DA") but impossible to say: d = protein.Monomer.from_code3("DA")

author	Daniil Alexeyevsky <me.dendik@gmail.com>
date	Mon, 13 Dec 2010 20:12:11 +0300
parents	6507cd6808c7
children	aae821828b03

line source

1 import _monomer

2 from fasta import save_fasta

4 class Sequence(list):

5 """ Sequence of Monomers

7 list of monomer objects

9 Mandatory data:

10 * name -- str with the name of sequence

11 * description -- str with description of the sequence

12 """

13 pass

15 MonomerType = _monomer.MonomerType

16 Monomer = _monomer.Monomer

18 class Alignment(dict):

19 """ Alignment

21 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

22 keys are the Sequence objects, values are the lists, which

23 contain monomers of those sequences or None for gaps in the

24 corresponding sequence of alignment

25 """

26 # _sequences -- list of Sequence objects. Sequences don't contain gaps

27 # - see sequence.py module

29 def __init__(self, *args):

30 """overloaded constructor

32 Alignment() -> new empty Alignment

33 Alignment(sequences, body) -> new Alignment with sequences and

34 body initialized from arguments

35 Alignment(fasta_file) -> new Alignment, read body and sequences

36 from fasta file

38 """

39 if len(args)>1:#overloaded constructor

40 self.sequences=args[0]

41 self.body=args[1]

42 elif len(args)==0:

43 self.sequences=[]

44 self.body={}

45 else:

46 self.sequences, self.body = Alignment.from_fasta(args[0])

48 def length(self):

49 """ Returns width, ie length of each sequence with gaps """

50 return max([len(line) for line in self.body.values()])

52 def height(self):

53 """ The number of sequences in alignment (it's thickness). """

54 return len(self.body)

56 def identity(self):

57 """ Calculate the identity of alignment positions for colouring.

59 For every (row, column) in alignment the percentage of the exactly

60 same residue in the same column in the alignment is calculated.

61 The data structure is just like the Alignment.body, but istead of

62 monomers it contains float percentages.

63 """

64 # Oh, God, that's awful! Absolutely not understandable.

65 # First, calculate percentages of amino acids in every column

66 contribution = 1.0 / len(self.sequences)

67 all_columns = []

68 for position in range(len(self)):

69 column_percentage = {}

70 for seq in self.body:

71 if self.body[seq][position] is not None:

72 aa = self.body[seq][position].code

73 else:

74 aa = None

75 if aa in allpy.data.amino_acids:

76 if aa in column_percentage.keys():

77 column_percentage[aa] += contribution

78 else:

79 column_percentage[aa] = contribution

80 all_columns.append(column_percentage)

81 # Second, map these percentages onto the alignment

82 self.identity_percentages = {}

83 for seq in self.sequences:

84 self.identity_percentages[seq] = []

85 for seq in self.identity_percentages:

86 line = self.identity_percentages[seq]

87 for position in range(len(self)):

88 if self.body[seq][position] is not None:

89 aa = self.body[seq][position].code

90 else:

91 aa = None

92 line.append(all_columns[position].get(aa))

93 return self.identity_percentages

95 @staticmethod

96 def from_fasta(file, monomer_kind=AminoAcidType):

97 """ Import data from fasta file

99 monomer_kind is class, inherited from MonomerType

100

101 >>> import alignment

102 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))

103 """

104 import re

105

106 sequences = []

107 body = {}

108

109 raw_sequences = file.read().split(">")

110 if len(raw_sequences) <= 1:

111 raise Exception("Wrong format of fasta-file %s" % file.name)

112

113 raw_sequences = raw_sequences[1:] #ignore everything before the first >

114 for raw in raw_sequences:

115 parsed_raw_sequence = raw.split("\n")

116 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]

117 name_and_description = parsed_raw_sequence[0]

118 name_and_description = name_and_description.split(" ",1)

119 if len(name_and_description) == 2:

120 name, description = name_and_description

121 elif len(name_and_description) == 1:

122 #if there is description

123 name = name_and_description[0]

124 description = ''

125 else:

126 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \

127 {'name': name, 'file': file.name})

128

129 if len(parsed_raw_sequence) <= 1:

130 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \

131 {'name': name, 'file': file.name})

132 string = ""

133 for piece in parsed_raw_sequence[1:]:

134 piece_without_whitespace_chars = re.sub("\s", "", piece)

135 string += piece_without_whitespace_chars

136 monomers = [] #convert into Monomer objects

137 body_list = [] #create the respective list in body dict

138 for current_monomer in string:

139 if current_monomer not in ["-", ".", "~"]:

140 monomers.append(monomer_kind.from_code1(current_monomer).instance())

141 body_list.append(monomers[-1])

142 else:

143 body_list.append(None)

144 s = sequence.Sequence(monomers, name, description)

145 sequences.append(s)

146 body[s] = body_list

147 return sequences, body

148

149 @staticmethod

150 def from_sequences(*sequences):

151 """ Constructs new alignment from sequences

152

153 Add None's to right end to make equal lengthes of alignment sequences

154 """

155 alignment = Alignment()

156 alignment.sequences = sequences

157 max_length = max(len(sequence) for sequence in sequences)

158 for sequence in sequences:

159 gaps_count = max_length - len(sequence)

160 alignment.body[sequence] = sequence.monomers + [None] * gaps_count

161 return alignment

162

163 def save_fasta(self, out_file, long_line=70, gap='-'):

164 """ Saves alignment to given file

165

166 Splits long lines to substrings of length=long_line

167 To prevent this, set long_line=None

168 """

169 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)

170

171 def muscle_align(self):

172 """ Simple align ths alignment using sequences (muscle)

173

174 uses old Monomers and Sequences objects

175 """

176 tmp_file = NamedTemporaryFile(delete=False)

177 self.save_fasta(tmp_file)

178 tmp_file.close()

179 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})

180 sequences, body = Alignment.from_fasta(open(tmp_file.name))

181 for sequence in self.sequences:

182 try:

183 new_sequence = [i for i in sequences if sequence==i][0]

184 except:

185 raise Exception("Align: Cann't find sequence %s in muscle output" % \

186 sequence.name)

187 old_monomers = iter(sequence.monomers)

188 self.body[sequence] = []

189 for monomer in body[new_sequence]:

190 if not monomer:

191 self.body[sequence].append(monomer)

192 else:

193 old_monomer = old_monomers.next()

194 if monomer != old_monomer:

195 raise Exception("Align: alignment errors")

196 self.body[sequence].append(old_monomer)

197 os.unlink(tmp_file.name)

198

199 def column(self, sequence=None, sequences=None, original=None):

200 """ returns list of columns of alignment

201

202 sequence or sequences:

203 if sequence is given, then column is (original_monomer, monomer)

204 if sequences is given, then column is (original_monomer, {sequence: monomer})

205 if both of them are given, it is an error

206 original (Sequence type):

207 if given, this filters only columns represented by original sequence

208 """

209 if sequence and sequences:

210 raise Exception("Wrong usage. read help")

211 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

212 alignment = self.body.items()

213 alignment.sort(key=lambda i: indexes[i[0]])

214 alignment = [monomers for seq, monomers in alignment]

215 for column in zip(*alignment):

216 if not original or column[indexes[original]]:

217 if sequence:

218 yield (column[indexes[original]], column[indexes[sequence]])

219 else:

220 yield (column[indexes[original]],

221 dict([(s, column[indexes[s]]) for s in sequences]))

222

223 def secstr(self, sequence, pdb_chain, gap='-'):

224 """ Returns string representing secondary structure """

225 return ''.join([

226 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)

227 for m in self.body[sequence]])

228

229 class Block(object):

230 """ """

231 pass

232