allpy: e4b269ba4f5f lib/project.py

allpy

view lib/project.py @ 76:e4b269ba4f5f

wt: improved project parameters

author	boris <bnagaev@gmail.com>
date	Sat, 25 Sep 2010 22:25:03 +0400
parents	a185e7e255b3
children	6cd288019400

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 import monomer

12 import allpy_data

14 class Project(object):

15 """

16 Mandatory data:

17 * sequences -- list of Sequence objects. Sequences don't contain gaps

18 - see sequence.py module

19 * alignment -- dict

20 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

21 keys are the Sequence objects, values are the lists, which

22 contain monomers of those sequences or None for gaps in the

23 corresponding sequence of

24 alignment

26 """

27 def __init__(self, *args):

28 """overloaded constructor

30 Project() -> new empty Project

31 Project(sequences, alignment) -> new Project with sequences and

32 alignment initialized from arguments

33 Project(fasta_file) -> new Project, read alignment and sequences

34 from fasta file

36 """

37 if len(args)>1:#overloaded constructor

38 self.sequences=args[0]

39 self.alignment=args[1]

40 elif len(args)==0:

41 self.sequences=[]

42 self.alignment={}

43 else:

44 self.sequences,self.alignment=Project.get_from_fasta(args[0])

46 def __len__(self):

47 return max([len(line) for line in self.alignment.values()])

49 def thickness(self):

50 """ The number of sequences in alignment (it's thickness).

51 """

52 return len(self.alignment)

54 def calc_identity(self):

55 """ Calculate the identity of alignment positions for colouring.

57 For every (row, column) in alignment the percentage of the exactly

58 same residue in the same column in the alignment is calculated.

59 The data structure is just like the Project.alignment, but istead of

60 monomers it contains float percentages.

61 """

62 # Oh, God, that's awful! Absolutely not understandable.

63 # First, calculate percentages of amino acids in every column

64 contribution = 1.0 / len(self.sequences)

65 all_columns = []

66 for position in range(len(self)):

67 column_percentage = {}

68 for seq in self.alignment:

69 if self.alignment[seq][position] is not None:

70 aa = self.alignment[seq][position].code

71 else:

72 aa = None

73 if aa in allpy_data.amino_acids:

74 if aa in column_percentage.keys():

75 column_percentage[aa] += contribution

76 else:

77 column_percentage[aa] = contribution

78 all_columns.append(column_percentage)

79 # Second, map these percentages onto the alignment

80 self.identity_percentages = {}

81 for seq in self.sequences:

82 self.identity_percentages[seq] = []

83 for seq in self.identity_percentages:

84 line = self.identity_percentages[seq]

85 for position in range(len(self)):

86 if self.alignment[seq][position] is not None:

87 aa = self.alignment[seq][position].code

88 else:

89 aa = None

90 line.append(all_columns[position].get(aa))

91 return self.identity_percentages

93 @staticmethod

94 def get_from_fasta(file):

95 """

96 >>> import project

97 >>> sequences,alignment=project.Project.get_from_fasta(open("test.fasta"))

98 """

99 import re

100

101 sequences=[]

102 alignment={}

103

104 content=file.read()

105 raw_sequences=content.split(">")[1:]#ignore everything before the first >

106 for raw in raw_sequences:

107 parsed_raw_sequence = raw.split("\n")

108 for counter,piece in enumerate(parsed_raw_sequence):

109 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces

110 name_and_description = parsed_raw_sequence[0]

111 if len(name_and_description.split(" ",1))==2:

112 name,description=name_and_description.split(" ",1)

113 elif len(name_and_description.split(" ",1))==1:#if there is description

114 name=name_and_description

115 else:

116 raise "Wrong name of sequence in fasta file"

117 string=""

118 for piece in parsed_raw_sequence[1:]:

119 piece_without_whitespace_chars=re.sub("\s","",piece)

120 string+=piece_without_whitespace_chars

121 monomers=[]#convert into Monomer objects

122 alignment_list=[]#create the respective list in alignment dict

123 for current_monomer in string:

124 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":

125 monomers.append(monomer.Monomer(current_monomer))

126 alignment_list.append(monomers[-1])

127 else:

128 alignment_list.append(None)

129 if "description" in vars():#if there's no description

130 sequences.append(sequence.Sequence(name,monomers,description))

131 else:

132 sequences.append(sequence.Sequence(name,monomers))

133 alignment[sequences[-1]]=alignment_list

134 return sequences,alignment