Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/ceea02544d83/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:30:05 2013
Кодировка:
allpy: ceea02544d83 lib/project.py

allpy

view lib/project.py @ 82:ceea02544d83

pdb: allow multiple chains per sequence (documentation only)
author boris <bnagaev@gmail.com>
date Tue, 19 Oct 2010 00:56:48 +0400
parents 0b9c0f4f1fda 217d83a617c3
children ea3113da42ca
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 import monomer
12 import allpy_data
14 class Project(object):
15 """
16 Mandatory data:
17 * sequences -- list of Sequence objects. Sequences don't contain gaps
18 - see sequence.py module
19 * alignment -- dict
20 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
21 keys are the Sequence objects, values are the lists, which
22 contain monomers of those sequences or None for gaps in the
23 corresponding sequence of
24 alignment
26 """
27 def __init__(self, *args):
28 """overloaded constructor
30 Project() -> new empty Project
31 Project(sequences, alignment) -> new Project with sequences and
32 alignment initialized from arguments
33 Project(fasta_file) -> new Project, read alignment and sequences
34 from fasta file
36 """
37 if len(args)>1:#overloaded constructor
38 self.sequences=args[0]
39 self.alignment=args[1]
40 elif len(args)==0:
41 self.sequences=[]
42 self.alignment={}
43 else:
44 self.sequences,self.alignment=Project.get_from_fasta(args[0])
46 def __len__(self):
47 return max([len(line) for line in self.alignment.values()])
49 def thickness(self):
50 """ The number of sequences in alignment (it's thickness).
51 """
52 return len(self.alignment)
54 def calc_identity(self):
55 """ Calculate the identity of alignment positions for colouring.
57 For every (row, column) in alignment the percentage of the exactly
58 same residue in the same column in the alignment is calculated.
59 The data structure is just like the Project.alignment, but istead of
60 monomers it contains float percentages.
61 """
62 # Oh, God, that's awful! Absolutely not understandable.
63 # First, calculate percentages of amino acids in every column
64 contribution = 1.0 / len(self.sequences)
65 all_columns = []
66 for position in range(len(self)):
67 column_percentage = {}
68 for seq in self.alignment:
69 if self.alignment[seq][position] is not None:
70 aa = self.alignment[seq][position].code
71 else:
72 aa = None
73 if aa in allpy_data.amino_acids:
74 if aa in column_percentage.keys():
75 column_percentage[aa] += contribution
76 else:
77 column_percentage[aa] = contribution
78 all_columns.append(column_percentage)
79 # Second, map these percentages onto the alignment
80 self.identity_percentages = {}
81 for seq in self.sequences:
82 self.identity_percentages[seq] = []
83 for seq in self.identity_percentages:
84 line = self.identity_percentages[seq]
85 for position in range(len(self)):
86 if self.alignment[seq][position] is not None:
87 aa = self.alignment[seq][position].code
88 else:
89 aa = None
90 line.append(all_columns[position].get(aa))
91 return self.identity_percentages
93 @staticmethod
94 def from_fasta(file):
95 """
96 >>> import project
97 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))
98 """
99 import re
101 sequences=[]
102 alignment={}
104 content=file.read()
105 raw_sequences=content.split(">")[1:]#ignore everything before the first >
106 for raw in raw_sequences:
107 parsed_raw_sequence = raw.split("\n")
108 for counter,piece in enumerate(parsed_raw_sequence):
109 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
110 name_and_description = parsed_raw_sequence[0]
111 if len(name_and_description.split(" ",1))==2:
112 name,description=name_and_description.split(" ",1)
113 elif len(name_and_description.split(" ",1))==1:#if there is description
114 name=name_and_description
115 else:
116 raise "Wrong name of sequence in fasta file"
117 string=""
118 for piece in parsed_raw_sequence[1:]:
119 piece_without_whitespace_chars=re.sub("\s","",piece)
120 string+=piece_without_whitespace_chars
121 monomers=[]#convert into Monomer objects
122 alignment_list=[]#create the respective list in alignment dict
123 for current_monomer in string:
124 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
125 monomers.append(monomer.Monomer(current_monomer))
126 alignment_list.append(monomers[-1])
127 else:
128 alignment_list.append(None)
129 if "description" in vars():#if there's no description
130 sequences.append(sequence.Sequence(name,monomers,description))
131 else:
132 sequences.append(sequence.Sequence(name,monomers))
133 alignment[sequences[-1]]=alignment_list
134 return sequences,alignment