allpy: 08494fb2c47a lib/alignment.py

allpy

view lib/alignment.py @ 190:08494fb2c47a

repeats: visitors for visiting tree items (almost) completed (tree from 10 Nov 2010 of https://docs1.google.com/document/d/1-7kCo7cdYTpHOtPXGMdSG9wxoXh6sIvrNyYyZuCKECs/edit?hl=en#)

author	boris (netbook) <bnagaev@gmail.com>
date	Thu, 18 Nov 2010 00:01:07 +0300
parents	e4f3a2022b08
children

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 from monomer import AminoAcidType

12 import allpy_data

13 from tempfile import NamedTemporaryFile

14 import os

15 import block

16 from fasta import save_fasta

18 class Alignment(object):

19 """ Alignment

21 Mandatory data:

22 * body -- dict

23 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

24 keys are the Sequence objects, values are the lists, which

25 contain monomers of those sequences or None for gaps in the

26 corresponding sequence of

27 alignment

29 """

30 # _sequences -- list of Sequence objects. Sequences don't contain gaps

31 # - see sequence.py module

33 def __init__(self, *args):

34 """overloaded constructor

36 Alignment() -> new empty Alignment

37 Alignment(sequences, body) -> new Alignment with sequences and

38 body initialized from arguments

39 Alignment(fasta_file) -> new Alignment, read body and sequences

40 from fasta file

42 """

43 if len(args)>1:#overloaded constructor

44 self.sequences=args[0]

45 self.body=args[1]

46 elif len(args)==0:

47 self.sequences=[]

48 self.body={}

49 else:

50 self.sequences, self.body = Alignment.from_fasta(args[0])

52 def length(self):

53 """ Returns width, ie length of each sequence with gaps """

54 return max([len(line) for line in self.body.values()])

56 def height(self):

57 """ The number of sequences in alignment (it's thickness). """

58 return len(self.body)

60 def identity(self):

61 """ Calculate the identity of alignment positions for colouring.

63 For every (row, column) in alignment the percentage of the exactly

64 same residue in the same column in the alignment is calculated.

65 The data structure is just like the Alignment.body, but istead of

66 monomers it contains float percentages.

67 """

68 # Oh, God, that's awful! Absolutely not understandable.

69 # First, calculate percentages of amino acids in every column

70 contribution = 1.0 / len(self.sequences)

71 all_columns = []

72 for position in range(len(self)):

73 column_percentage = {}

74 for seq in self.body:

75 if self.body[seq][position] is not None:

76 aa = self.body[seq][position].code

77 else:

78 aa = None

79 if aa in allpy_data.amino_acids:

80 if aa in column_percentage.keys():

81 column_percentage[aa] += contribution

82 else:

83 column_percentage[aa] = contribution

84 all_columns.append(column_percentage)

85 # Second, map these percentages onto the alignment

86 self.identity_percentages = {}

87 for seq in self.sequences:

88 self.identity_percentages[seq] = []

89 for seq in self.identity_percentages:

90 line = self.identity_percentages[seq]

91 for position in range(len(self)):

92 if self.body[seq][position] is not None:

93 aa = self.body[seq][position].code

94 else:

95 aa = None

96 line.append(all_columns[position].get(aa))

97 return self.identity_percentages

99 @staticmethod

100 def from_fasta(file, monomer_kind=AminoAcidType):

101 """ Import data from fasta file

102

103 monomer_kind is class, inherited from MonomerType

104

105 >>> import alignment

106 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))

107 """

108 import re

109

110 sequences = []

111 body = {}

112

113 raw_sequences = file.read().split(">")

114 if len(raw_sequences) <= 1:

115 raise Exception("Wrong format of fasta-file %s" % file.name)

116

117 raw_sequences = raw_sequences[1:] #ignore everything before the first >

118 for raw in raw_sequences:

119 parsed_raw_sequence = raw.split("\n")

120 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]

121 name_and_description = parsed_raw_sequence[0]

122 name_and_description = name_and_description.split(" ",1)

123 if len(name_and_description) == 2:

124 name, description = name_and_description

125 elif len(name_and_description) == 1:

126 #if there is description

127 name = name_and_description[0]

128 description = ''

129 else:

130 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \

131 {'name': name, 'file': file.name})

132

133 if len(parsed_raw_sequence) <= 1:

134 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \

135 {'name': name, 'file': file.name})

136 string = ""

137 for piece in parsed_raw_sequence[1:]:

138 piece_without_whitespace_chars = re.sub("\s", "", piece)

139 string += piece_without_whitespace_chars

140 monomers = [] #convert into Monomer objects

141 body_list = [] #create the respective list in body dict

142 for current_monomer in string:

143 if current_monomer not in ["-", ".", "~"]:

144 monomers.append(monomer_kind.from_code1(current_monomer).instance())

145 body_list.append(monomers[-1])

146 else:

147 body_list.append(None)

148 s = sequence.Sequence(monomers, name, description)

149 sequences.append(s)

150 body[s] = body_list

151 return sequences, body

152

153

154 @staticmethod

155 def from_sequences(*sequences):

156 """ Constructs new alignment from sequences

157

158 Add None's to right end to make equal lengthes of alignment sequences

159 """

160 alignment = Alignment()

161 alignment.sequences = sequences

162 max_length = max(len(sequence) for sequence in sequences)

163 for sequence in sequences:

164 gaps_count = max_length - len(sequence)

165 alignment.body[sequence] = sequence.monomers + [None] * gaps_count

166 return alignment

167

168 def save_fasta(self, out_file, long_line=70, gap='-'):

169 """ Saves alignment to given file

170

171 Splits long lines to substrings of length=long_line

172 To prevent this, set long_line=None

173 """

174 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap)

175

176 def muscle_align(self):

177 """ Simple align ths alignment using sequences (muscle)

178

179 uses old Monomers and Sequences objects

180 """

181 tmp_file = NamedTemporaryFile(delete=False)

182 self.save_fasta(tmp_file)

183 tmp_file.close()

184 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})

185 sequences, body = Alignment.from_fasta(open(tmp_file.name))

186 for sequence in self.sequences:

187 try:

188 new_sequence = [i for i in sequences if sequence==i][0]

189 except:

190 raise Exception("Align: Cann't find sequence %s in muscle output" % \

191 sequence.name)

192 old_monomers = iter(sequence.monomers)

193 self.body[sequence] = []

194 for monomer in body[new_sequence]:

195 if not monomer:

196 self.body[sequence].append(monomer)

197 else:

198 old_monomer = old_monomers.next()

199 if monomer != old_monomer:

200 raise Exception("Align: alignment errors")

201 self.body[sequence].append(old_monomer)

202 os.unlink(tmp_file.name)

203

204

205 def column(self, sequence=None, sequences=None, original=None):

206 """ returns list of columns of alignment

207

208 sequence or sequences:

209 if sequence is given, then column is (original_monomer, monomer)

210 if sequences is given, then column is (original_monomer, {sequence: monomer})

211 if both of them are given, it is an error

212 original (Sequence type):

213 if given, this filters only columns represented by original sequence

214 """

215 if sequence and sequences:

216 raise Exception("Wrong usage. read help")

217 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

218 alignment = self.body.items()

219 alignment.sort(key=lambda i: indexes[i[0]])

220 alignment = [monomers for seq, monomers in alignment]

221 for column in zip(*alignment):

222 if not original or column[indexes[original]]:

223 if sequence:

224 yield (column[indexes[original]], column[indexes[sequence]])

225 else:

226 yield (column[indexes[original]],

227 dict([(s, column[indexes[s]]) for s in sequences]))

228

229

230 def secstr(self, sequence, pdb_chain, gap='-'):

231 """ Returns string representing secondary structure """

232 return ''.join([

233 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap)

234 for m in self.body[sequence]])

235

236 def save_secstr(self, out_file, sequence, pdb_chain,

237 name, description='', gap='-', long_line=70):

238 """ Save secondary structure and name in fasta format """

239 save_fasta(out_file, self.secstr(sequence, pdb_chain, gap), name, description, long_line)

240