allpy: 9b91b49b9ef0 lib/project.py

allpy

view lib/project.py @ 96:9b91b49b9ef0

lib::alignment::from_sequences

author	boris <bnagaev@gmail.com>
date	Wed, 20 Oct 2010 22:24:59 +0400
parents	a2f3926f5077
children	7dd461d17f96

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 from monomer import AminoAcidType

12 import allpy_data

14 class Project(object):

15 """

16 Mandatory data:

17 * sequences -- list of Sequence objects. Sequences don't contain gaps

18 - see sequence.py module

19 * alignment -- dict

20 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

21 keys are the Sequence objects, values are the lists, which

22 contain monomers of those sequences or None for gaps in the

23 corresponding sequence of

24 alignment

26 """

27 def __init__(self, *args):

28 """overloaded constructor

30 Project() -> new empty Project

31 Project(sequences, alignment) -> new Project with sequences and

32 alignment initialized from arguments

33 Project(fasta_file) -> new Project, read alignment and sequences

34 from fasta file

36 """

37 if len(args)>1:#overloaded constructor

38 self.sequences=args[0]

39 self.alignment=args[1]

40 elif len(args)==0:

41 self.sequences=[]

42 self.alignment={}

43 else:

44 self.sequences,self.alignment=Project.from_fasta(args[0])

46 def __len__(self):

47 return max([len(line) for line in self.alignment.values()])

49 def thickness(self):

50 """ The number of sequences in alignment (it's thickness).

51 """

52 return len(self.alignment)

54 def calc_identity(self):

55 """ Calculate the identity of alignment positions for colouring.

57 For every (row, column) in alignment the percentage of the exactly

58 same residue in the same column in the alignment is calculated.

59 The data structure is just like the Project.alignment, but istead of

60 monomers it contains float percentages.

61 """

62 # Oh, God, that's awful! Absolutely not understandable.

63 # First, calculate percentages of amino acids in every column

64 contribution = 1.0 / len(self.sequences)

65 all_columns = []

66 for position in range(len(self)):

67 column_percentage = {}

68 for seq in self.alignment:

69 if self.alignment[seq][position] is not None:

70 aa = self.alignment[seq][position].code

71 else:

72 aa = None

73 if aa in allpy_data.amino_acids:

74 if aa in column_percentage.keys():

75 column_percentage[aa] += contribution

76 else:

77 column_percentage[aa] = contribution

78 all_columns.append(column_percentage)

79 # Second, map these percentages onto the alignment

80 self.identity_percentages = {}

81 for seq in self.sequences:

82 self.identity_percentages[seq] = []

83 for seq in self.identity_percentages:

84 line = self.identity_percentages[seq]

85 for position in range(len(self)):

86 if self.alignment[seq][position] is not None:

87 aa = self.alignment[seq][position].code

88 else:

89 aa = None

90 line.append(all_columns[position].get(aa))

91 return self.identity_percentages

93 @staticmethod

94 def from_fasta(file, monomer_type=AminoAcidType):

95 """

96 >>> import project

97 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))

98 """

99 import re

100

101 sequences=[]

102 alignment={}

103

104 content=file.read()

105 raw_sequences=content.split(">")[1:]#ignore everything before the first >

106 for raw in raw_sequences:

107 parsed_raw_sequence = raw.split("\n")

108 for counter,piece in enumerate(parsed_raw_sequence):

109 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces

110 name_and_description = parsed_raw_sequence[0]

111 if len(name_and_description.split(" ",1))==2:

112 name, description = name_and_description.split(" ",1)

113 elif len(name_and_description.split(" ", 1)) == 1:#if there is description

114 name = name_and_description

115 else:

116 raise "Wrong name of sequence in fasta file"

117 string=""

118 for piece in parsed_raw_sequence[1:]:

119 piece_without_whitespace_chars=re.sub("\s","",piece)

120 string+=piece_without_whitespace_chars

121 monomers=[]#convert into Monomer objects

122 alignment_list=[]#create the respective list in alignment dict

123 for current_monomer in string:

124 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":

125 monomers.append(monomer_type.from_code1(current_monomer).instance())

126 alignment_list.append(monomers[-1])

127 else:

128 alignment_list.append(None)

129 if "description" in vars():#if there's no description

130 sequences.append(sequence.Sequence(name,monomers,description))

131 else:

132 sequences.append(sequence.Sequence(name,monomers))

133 alignment[sequences[-1]]=alignment_list

134 return sequences,alignment

135

136

137 @staticmethod

138 def from_sequences(*sequences):

139 """

140 Constructs new alignment from sequences

141 Add gaps to right end to make equal lengthes of alignment sequences

142 """

143 project = Project()

144 project.sequences = sequences

145 max_length = max(len(sequence) for sequence in sequences)

146 for sequence in sequences:

147 gaps_count = max_length - len(sequence)

148 project.alignment[sequence] = sequence.monomers + [None] * gaps_count

149 return project

150

151

152