allpy: 675b402094be lib/project.py

allpy

view lib/project.py @ 151:675b402094be

day commit -- a lot of changes fasta.py: universal save_fasta() determine_long_line -- for determine length of fasta sequence string in user input everywhere: standart long_line=60 --> 70 blocK.sequences_chains: returns sequences in order as in project added monomer pdb_secstr to store secondary structure pdb adding: some improvements and fixes fix in from_pdb_chain: use all peptides, not only first Sequence.pdb_files added to store information about pdb file for each chain dssp bindings to get secondary structure /sec_str -- tool to map secondary structure on each sequence of alignment

author	boris (netbook) <bnagaev@gmail.com>
date	Tue, 26 Oct 2010 00:40:36 +0400
parents	f7dead025719
children	0c7f6117481b

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 Sequence = sequence.Sequence

12 from monomer import AminoAcidType

13 import allpy_data

14 from tempfile import NamedTemporaryFile

15 import os

16 import block

17 from fasta import save_fasta

19 Block = block.Block

21 class Project(object):

22 """ Alignment representing class

24 Mandatory data:

25 * sequences -- list of Sequence objects. Sequences don't contain gaps

26 - see sequence.py module

27 * alignment -- dict

28 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

29 keys are the Sequence objects, values are the lists, which

30 contain monomers of those sequences or None for gaps in the

31 corresponding sequence of

32 alignment

34 """

35 def __init__(self, *args):

36 """overloaded constructor

38 Project() -> new empty Project

39 Project(sequences, alignment) -> new Project with sequences and

40 alignment initialized from arguments

41 Project(fasta_file) -> new Project, read alignment and sequences

42 from fasta file

44 """

45 if len(args)>1:#overloaded constructor

46 self.sequences=args[0]

47 self.alignment=args[1]

48 elif len(args)==0:

49 self.sequences=[]

50 self.alignment={}

51 else:

52 self.sequences,self.alignment=Project.from_fasta(args[0])

54 def __len__(self):

55 """ Returns width, ie length of each sequence with gaps """

56 return max([len(line) for line in self.alignment.values()])

58 def thickness(self):

59 """ The number of sequences in alignment (it's thickness). """

60 return len(self.alignment)

62 def calc_identity(self):

63 """ Calculate the identity of alignment positions for colouring.

65 For every (row, column) in alignment the percentage of the exactly

66 same residue in the same column in the alignment is calculated.

67 The data structure is just like the Project.alignment, but istead of

68 monomers it contains float percentages.

69 """

70 # Oh, God, that's awful! Absolutely not understandable.

71 # First, calculate percentages of amino acids in every column

72 contribution = 1.0 / len(self.sequences)

73 all_columns = []

74 for position in range(len(self)):

75 column_percentage = {}

76 for seq in self.alignment:

77 if self.alignment[seq][position] is not None:

78 aa = self.alignment[seq][position].code

79 else:

80 aa = None

81 if aa in allpy_data.amino_acids:

82 if aa in column_percentage.keys():

83 column_percentage[aa] += contribution

84 else:

85 column_percentage[aa] = contribution

86 all_columns.append(column_percentage)

87 # Second, map these percentages onto the alignment

88 self.identity_percentages = {}

89 for seq in self.sequences:

90 self.identity_percentages[seq] = []

91 for seq in self.identity_percentages:

92 line = self.identity_percentages[seq]

93 for position in range(len(self)):

94 if self.alignment[seq][position] is not None:

95 aa = self.alignment[seq][position].code

96 else:

97 aa = None

98 line.append(all_columns[position].get(aa))

99 return self.identity_percentages

100

101 @staticmethod

102 def from_fasta(file, monomer_kind=AminoAcidType):

103 """ Import data from fasta file

104

105 monomer_kind is class, inherited from MonomerType

106

107 >>> import project

108 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))

109 """

110 import re

111

112 sequences = []

113 alignment = {}

114

115 raw_sequences = file.read().split(">")

116 if len(raw_sequences) <= 1:

117 raise Exception("Wrong format of fasta-file %s" % file.name)

118

119 raw_sequences = raw_sequences[1:] #ignore everything before the first >

120 for raw in raw_sequences:

121 parsed_raw_sequence = raw.split("\n")

122 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]

123 name_and_description = parsed_raw_sequence[0]

124 name_and_description = name_and_description.split(" ",1)

125 if len(name_and_description) == 2:

126 name, description = name_and_description

127 elif len(name_and_description) == 1:

128 #if there is description

129 name = name_and_description[0]

130 description = ''

131 else:

132 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \

133 {'name': name, 'file': file.name})

134

135 if len(parsed_raw_sequence) <= 1:

136 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \

137 {'name': name, 'file': file.name})

138 string = ""

139 for piece in parsed_raw_sequence[1:]:

140 piece_without_whitespace_chars = re.sub("\s", "", piece)

141 string += piece_without_whitespace_chars

142 monomers = [] #convert into Monomer objects

143 alignment_list = [] #create the respective list in alignment dict

144 for current_monomer in string:

145 if current_monomer not in ["-", ".", "~"]:

146 monomers.append(monomer_kind.from_code1(current_monomer).instance())

147 alignment_list.append(monomers[-1])

148 else:

149 alignment_list.append(None)

150 sequence = Sequence(monomers, name, description)

151 sequences.append(sequence)

152 alignment[sequence] = alignment_list

153 return sequences, alignment

154

155

156 @staticmethod

157 def from_sequences(*sequences):

158 """ Constructs new alignment from sequences

159

160 Add None's to right end to make equal lengthes of alignment sequences

161 """

162 project = Project()

163 project.sequences = sequences

164 max_length = max(len(sequence) for sequence in sequences)

165 for sequence in sequences:

166 gaps_count = max_length - len(sequence)

167 project.alignment[sequence] = sequence.monomers + [None] * gaps_count

168 return project

169

170 def save_fasta(self, out_file, long_line=70, gap='-'):

171 """ Saves alignment to given file

172

173 Splits long lines to substrings of length=long_line

174 To prevent this, set long_line=None

175 """

176 Block(self).save_fasta(out_file, long_line=long_line, gap=gap)

177

178 def muscle_align(self):

179 """ Simple align ths alignment using sequences (muscle)

180

181 uses old Monomers and Sequences objects

182 """

183 tmp_file = NamedTemporaryFile(delete=False)

184 self.save_fasta(tmp_file)

185 tmp_file.close()

186 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name})

187 sequences, alignment = Project.from_fasta(open(tmp_file.name))

188 for sequence in self.sequences:

189 try:

190 new_sequence = [i for i in sequences if sequence==i][0]

191 except:

192 raise Exception("Align: Cann't find sequence %s in muscle output" % \

193 sequence.name)

194 old_monomers = iter(sequence.monomers)

195 self.alignment[sequence] = []

196 for monomer in alignment[new_sequence]:

197 if not monomer:

198 self.alignment[sequence].append(monomer)

199 else:

200 old_monomer = old_monomers.next()

201 if monomer != old_monomer:

202 raise Exception("Align: alignment errors")

203 self.alignment[sequence].append(old_monomer)

204 os.unlink(tmp_file.name)

205

206

207 def column(self, sequence=None, sequences=None, original=None):

208 """ returns list of columns of alignment

209

210 sequence or sequences:

211 if sequence is given, then column is (original_monomer, monomer)

212 if sequences is given, then column is (original_monomer, {sequence: monomer})

213 if both of them are given, it is an error

214 original (Sequence type):

215 if given, this filters only columns represented by original sequence

216 """

217 if sequence and sequences:

218 raise Exception("Wrong usage. read help")

219 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

220 alignment = self.alignment.items()

221 alignment.sort(key=lambda i: indexes[i[0]])

222 alignment = [monomers for seq, monomers in alignment]

223 for column in zip(*alignment):

224 if not original or column[indexes[original]]:

225 if sequence:

226 yield (column[indexes[original]], column[indexes[sequence]])

227 else:

228 yield (column[indexes[original]],

229 dict([(s, column[indexes[s]]) for s in sequences]))

230

231 def pdb_auto_add(self, conformity_file=None):

232 """ Adds pdb information to each sequence

233

234 TODO: conformity_file

235 """

236 conformity = {}

237

238 for sequence in self.sequences:

239 try:

240 sequence.pdb_auto_add(conformity.get(sequence.name, None))

241 except Exception, t:

242 print "Cann't add pdb information about chain %s:" % sequence.name

243 print t

244

245 def secstr(self, secuence, pdb_chain, gap='-'):

246 """ Returns string representing secondary structure """

247 return ''.join([

248 (m.pdb_secstr[pdb_chain] if m and pdb_chain in m.pdb_secstr else gap)

249 for m in self.alignment[secuence]])

250

251 def save_secstr(self, out_file, secuence, pdb_chain,

252 name, description='', gap='-', long_line=70):

253 """ Save secondary structure and name in fasta format """

254 save_fasta(out_file, self.secstr(secuence, pdb_chain, gap), name, description, long_line)

255