Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/9b91b49b9ef0/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:23:55 2013
Кодировка:
allpy: 9b91b49b9ef0 lib/project.py

allpy

view lib/project.py @ 96:9b91b49b9ef0

lib::alignment::from_sequences
author boris <bnagaev@gmail.com>
date Wed, 20 Oct 2010 22:24:59 +0400
parents a2f3926f5077
children 7dd461d17f96
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 from monomer import AminoAcidType
12 import allpy_data
14 class Project(object):
15 """
16 Mandatory data:
17 * sequences -- list of Sequence objects. Sequences don't contain gaps
18 - see sequence.py module
19 * alignment -- dict
20 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
21 keys are the Sequence objects, values are the lists, which
22 contain monomers of those sequences or None for gaps in the
23 corresponding sequence of
24 alignment
26 """
27 def __init__(self, *args):
28 """overloaded constructor
30 Project() -> new empty Project
31 Project(sequences, alignment) -> new Project with sequences and
32 alignment initialized from arguments
33 Project(fasta_file) -> new Project, read alignment and sequences
34 from fasta file
36 """
37 if len(args)>1:#overloaded constructor
38 self.sequences=args[0]
39 self.alignment=args[1]
40 elif len(args)==0:
41 self.sequences=[]
42 self.alignment={}
43 else:
44 self.sequences,self.alignment=Project.from_fasta(args[0])
46 def __len__(self):
47 return max([len(line) for line in self.alignment.values()])
49 def thickness(self):
50 """ The number of sequences in alignment (it's thickness).
51 """
52 return len(self.alignment)
54 def calc_identity(self):
55 """ Calculate the identity of alignment positions for colouring.
57 For every (row, column) in alignment the percentage of the exactly
58 same residue in the same column in the alignment is calculated.
59 The data structure is just like the Project.alignment, but istead of
60 monomers it contains float percentages.
61 """
62 # Oh, God, that's awful! Absolutely not understandable.
63 # First, calculate percentages of amino acids in every column
64 contribution = 1.0 / len(self.sequences)
65 all_columns = []
66 for position in range(len(self)):
67 column_percentage = {}
68 for seq in self.alignment:
69 if self.alignment[seq][position] is not None:
70 aa = self.alignment[seq][position].code
71 else:
72 aa = None
73 if aa in allpy_data.amino_acids:
74 if aa in column_percentage.keys():
75 column_percentage[aa] += contribution
76 else:
77 column_percentage[aa] = contribution
78 all_columns.append(column_percentage)
79 # Second, map these percentages onto the alignment
80 self.identity_percentages = {}
81 for seq in self.sequences:
82 self.identity_percentages[seq] = []
83 for seq in self.identity_percentages:
84 line = self.identity_percentages[seq]
85 for position in range(len(self)):
86 if self.alignment[seq][position] is not None:
87 aa = self.alignment[seq][position].code
88 else:
89 aa = None
90 line.append(all_columns[position].get(aa))
91 return self.identity_percentages
93 @staticmethod
94 def from_fasta(file, monomer_type=AminoAcidType):
95 """
96 >>> import project
97 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))
98 """
99 import re
101 sequences=[]
102 alignment={}
104 content=file.read()
105 raw_sequences=content.split(">")[1:]#ignore everything before the first >
106 for raw in raw_sequences:
107 parsed_raw_sequence = raw.split("\n")
108 for counter,piece in enumerate(parsed_raw_sequence):
109 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
110 name_and_description = parsed_raw_sequence[0]
111 if len(name_and_description.split(" ",1))==2:
112 name, description = name_and_description.split(" ",1)
113 elif len(name_and_description.split(" ", 1)) == 1:#if there is description
114 name = name_and_description
115 else:
116 raise "Wrong name of sequence in fasta file"
117 string=""
118 for piece in parsed_raw_sequence[1:]:
119 piece_without_whitespace_chars=re.sub("\s","",piece)
120 string+=piece_without_whitespace_chars
121 monomers=[]#convert into Monomer objects
122 alignment_list=[]#create the respective list in alignment dict
123 for current_monomer in string:
124 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
125 monomers.append(monomer_type.from_code1(current_monomer).instance())
126 alignment_list.append(monomers[-1])
127 else:
128 alignment_list.append(None)
129 if "description" in vars():#if there's no description
130 sequences.append(sequence.Sequence(name,monomers,description))
131 else:
132 sequences.append(sequence.Sequence(name,monomers))
133 alignment[sequences[-1]]=alignment_list
134 return sequences,alignment
137 @staticmethod
138 def from_sequences(*sequences):
139 """
140 Constructs new alignment from sequences
141 Add gaps to right end to make equal lengthes of alignment sequences
142 """
143 project = Project()
144 project.sequences = sequences
145 max_length = max(len(sequence) for sequence in sequences)
146 for sequence in sequences:
147 gaps_count = max_length - len(sequence)
148 project.alignment[sequence] = sequence.monomers + [None] * gaps_count
149 return project