allpy
diff allpy/base.py @ 287:f8bd7c469fcf
Clean reimplementation of allpy.base.Alignment.from_fasta
Also changed the interface of fasta.parse_file
Added allpy.util, which currently contains a single function, unzip -- the
reverse of the zip builtin
author | Daniil Alexeyevsky <me.dendik@gmail.com> |
---|---|
date | Thu, 16 Dec 2010 01:12:06 +0300 |
parents | cf6cdc3b7ec5 |
children | 9c68d8eab8f5 |
line diff
1.1 --- a/allpy/base.py Wed Dec 15 23:45:11 2010 +0300 1.2 +++ b/allpy/base.py Thu Dec 16 01:12:06 2010 +0300 1.3 @@ -181,41 +181,47 @@ 1.4 """ 1.5 sequences = fasta.parse_file(file) 1.6 assert len(sequences) == 1 1.7 - header = sequences.keys()[0] 1.8 - name, _, description = header.partition(" ") 1.9 + name, description = sequences.keys()[0] 1.10 return cls(sequences[header], name, description, file.name) 1.11 1.12 -class Alignment(dict): 1.13 +class Alignment(list): 1.14 """Alignment. 1.15 1.16 Behaves like a list of Columns. 1.17 """ 1.18 - # _sequences -- list of Sequence objects. Sequences don't contain gaps 1.19 - # - see sequence.py module 1.20 1.21 - def __init__(self, *args): 1.22 - """overloaded constructor 1.23 + sequence_type = Sequence 1.24 + """Type of sequences to create in alignment. 1.25 + 1.26 + SHOULD be redefined when subclassing""" 1.27 1.28 - Alignment() 1.29 - new empty Alignment 1.30 + def __init__(self): 1.31 + """Initialize empty alignment.""" 1.32 + super(Alignment, self).__init__() 1.33 1.34 - Alignment(sequences, body) 1.35 - new Alignment with sequences and body initialized from arguments 1.36 + self.sequences = [] 1.37 + """Ordered list of sequences in alignment.""" 1.38 1.39 - Alignment(fasta_file) 1.40 - new Alignment, read body and sequences from fasta file 1.41 - """ 1.42 - if len(args)>1:#overloaded constructor 1.43 - self.sequences=args[0] 1.44 - self.body=args[1] 1.45 - elif len(args)==0: 1.46 - self.sequences=[] 1.47 - self.body={} 1.48 - else: 1.49 - self.sequences, self.body = Alignment.from_fasta(args[0]) 1.50 + def add_gapped_line(self, line, name='', description='', source=''): 1.51 + """Add row from a line of one-letter codes and gaps.""" 1.52 + Sequence = cls.sequence_type 1.53 + not_gap = lambda (i, char): char != "-" 1.54 + no_gaps = line.replace("-", "") 1.55 + sequence = Sequence(no_gaps, name, description, source) 1.56 + for i, (j, char) in enumerate(filter(not_gap, enumerate(line))): 1.57 + self[j][seq] = 
sequence[i] 1.58 + self.sequences.append(sequence) 1.59 + 1.60 + @classmethod 1.61 + def from_fasta(cls, file): 1.62 + """Create new alignment from FASTA file.""" 1.63 + self = cls() 1.64 + for ((name, description), body) in fasta.parse_file(file): 1.65 + self.add_gapped_line(body, name, description) 1.66 + return self 1.67 1.68 def length(self): 1.69 - """ Returns width, ie length of each sequence with gaps """ 1.70 + """Return width, ie length of each sequence with gaps.""" 1.71 return max([len(line) for line in self.body.values()]) 1.72 1.73 def height(self): 1.74 @@ -261,58 +267,6 @@ 1.75 line.append(all_columns[position].get(aa)) 1.76 return self.identity_percentages 1.77 1.78 - @classmethod 1.79 - def from_fasta(file): 1.80 - """ Import data from fasta file 1.81 - 1.82 - >>> import alignment 1.83 - >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta")) 1.84 - """ 1.85 - import re 1.86 - 1.87 - sequences = [] 1.88 - body = {} 1.89 - 1.90 - raw_sequences = file.read().split(">") 1.91 - if len(raw_sequences) <= 1: 1.92 - raise Exception("Wrong format of fasta-file %s" % file.name) 1.93 - 1.94 - raw_sequences = raw_sequences[1:] #ignore everything before the first > 1.95 - for raw in raw_sequences: 1.96 - parsed_raw_sequence = raw.split("\n") 1.97 - parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence] 1.98 - name_and_description = parsed_raw_sequence[0] 1.99 - name_and_description = name_and_description.split(" ",1) 1.100 - if len(name_and_description) == 2: 1.101 - name, description = name_and_description 1.102 - elif len(name_and_description) == 1: 1.103 - #if there is description 1.104 - name = name_and_description[0] 1.105 - description = '' 1.106 - else: 1.107 - raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \ 1.108 - {'name': name, 'file': file.name}) 1.109 - 1.110 - if len(parsed_raw_sequence) <= 1: 1.111 - raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \ 1.112 - {'name': name, 
'file': file.name}) 1.113 - string = "" 1.114 - for piece in parsed_raw_sequence[1:]: 1.115 - piece_without_whitespace_chars = re.sub("\s", "", piece) 1.116 - string += piece_without_whitespace_chars 1.117 - monomers = [] #convert into Monomer objects 1.118 - body_list = [] #create the respective list in body dict 1.119 - for current_monomer in string: 1.120 - if current_monomer not in ["-", ".", "~"]: 1.121 - monomers.append(cls.monomer_type.from_code1(current_monomer)) 1.122 - body_list.append(monomers[-1]) 1.123 - else: 1.124 - body_list.append(None) 1.125 - s = sequence.Sequence(monomers, name, description) 1.126 - sequences.append(s) 1.127 - body[s] = body_list 1.128 - return sequences, body 1.129 - 1.130 @staticmethod 1.131 def from_sequences(*sequences): 1.132 """ Constructs new alignment from sequences