Package jazzparser :: Package taggers :: Package baseline1 :: Module tagger
[hide private]
[frames | no frames]

Source Code for Module jazzparser.taggers.baseline1.tagger

  1  """First, very simple baseline tagger model. 
  2   
  3  Tagging model 'baseline1' is a very simple tagging model that tags  
  4  using just the unigram probabilities on the basis of observed chord  
  5  types (no intervals). 
  6   
  7  It is the model presented as 'model 3' in the Stupid Baselines talk 
  8  (the first two are just thought experiments and not worth implementing). 
  9   
 10  """ 
 11  """ 
 12  ============================== License ======================================== 
 13   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 14    
 15   This file is part of The Jazz Parser. 
 16    
 17   The Jazz Parser is free software: you can redistribute it and/or modify 
 18   it under the terms of the GNU General Public License as published by 
 19   the Free Software Foundation, either version 3 of the License, or 
 20   (at your option) any later version. 
 21    
 22   The Jazz Parser is distributed in the hope that it will be useful, 
 23   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 24   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 25   GNU General Public License for more details. 
 26    
 27   You should have received a copy of the GNU General Public License 
 28   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 29   
 30  ============================ End license ====================================== 
 31   
 32  """ 
 33  __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"  
 34   
 35  import pickle 
 36  from jazzparser.taggers.models import ModelTagger, ModelLoadError, TaggerModel 
 37  from jazzparser.taggers import process_chord_input 
 38  from jazzparser.utils.probabilities import batch_sizes 
 39  from jazzparser.data import Chord 
 40   
def observation_from_chord(crd):
    """
    Turn a chord name into the observation used by the baseline1 model.

    The name is parsed with Chord.from_name and only the 'type'
    attribute of the resulting chord object is returned.

    """
    return Chord.from_name(crd).type
44
class Baseline1Model(TaggerModel):
    """
    Holds the counts collected in training for the baseline1 tagger.

    Three tables are maintained: joint (category, observation) counts,
    per-category counts and per-observation counts.

    """
    MODEL_TYPE = "baseline1"

    def __init__(self, model_name, *args, **kwargs):
        """
        Create an untrained model: all count tables start out empty.
        All arguments are passed straight on to the superclass.

        """
        super(Baseline1Model, self).__init__(model_name, *args, **kwargs)
        # category -> {observation -> joint count}
        self.category_chord_count = {}
        # category -> count
        self.category_count = {}
        # observation -> count
        self.chord_count = {}

    def _add_category_chord_count(self, category, chord):
        """
        Record one co-occurrence of a category with a chord observation.
        This increments the joint count as well as the individual counts
        of the category and of the observation.

        """
        # Joint count of this category with this observation
        joint_counts = self.category_chord_count.setdefault(category, {})
        joint_counts[chord] = joint_counts.get(chord, 0) + 1
        # Marginal count of the category
        self.category_count[category] = \
            self.category_count.get(category, 0) + 1
        # Marginal count of the observation
        self.chord_count[chord] = self.chord_count.get(chord, 0) + 1

    def train(self, sequences, grammar=None, logger=None):
        """
        Collect counts from every chord of every sequence in the given
        corpus and store a short summary of the training data in
        model_description.

        The grammar and logger arguments are accepted, but not used here.

        """
        num_sequences = 0
        num_samples = 0
        for sequence in sequences:
            num_sequences += 1
            for chord in sequence.iterator():
                num_samples += 1
                category = chord.category
                # The observation is derived from the chord's string form
                observation = observation_from_chord(str(chord))
                self._add_category_chord_count(category, observation)
        # Put some details of the training set in the descriptive text
        summary = {
            'seqs' : num_sequences,
            'samples' : num_samples
        }
        self.model_description = """\
Unigram probability model, observing only chord types

Training sequences: %(seqs)d
Training samples: %(samples)d""" % summary

    def get_prob_cat_given_chord(self, cat, chord):
        """
        Relative-frequency estimate of the probability of a category,
        given a chord.

        If the chord's observation was never seen in training, every
        category that was seen gets the same probability (one over the
        number of seen categories) and any other category gets 0.

        """
        obs = observation_from_chord(chord)
        times_seen = self.chord_count.get(obs, 0)
        if times_seen != 0:
            joint = self.category_chord_count.get(cat, {}).get(obs, 0)
            return float(joint) / times_seen
        # Unseen observation: spread the mass evenly over the seen cats
        if cat in self.category_count:
            return 1.0 / len(self.category_count)
        # A category we've never seen gets nothing: no smoothing
        return 0.0
111
class Baseline1Tagger(ModelTagger):
    """
    The first of the simple baseline tagger models. This models unigram
    probabilities of tags, given only the chord types.

    All of the tagging is done when the tagger is constructed: for each
    input position the candidate (sign, tag, probability) triples are
    stored, most probable first, along with the index ranges of the
    batches in which they should be handed out.

    """
    MODEL_CLASS = Baseline1Model
    INPUT_TYPES = ['db', 'chords']

    def __init__(self, grammar, input, options=None, *args, **kwargs):
        """
        Tag the whole input sequence.

        options defaults to a fresh empty dictionary for every call. (A
        literal {} default would be a single dictionary shared by every
        tagger constructed without options.)

        """
        if options is None:
            options = {}
        super(Baseline1Tagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        # Per input position: list of (sign, tag, probability), best first
        self._tagged_data = []
        # Per input position: list of (start, end) slices of the above
        self._batch_ranges = []
        # NOTE(review): input_length, durations, times, model, batch_ratio
        #  and best_only are not set in this class - presumably they come
        #  from the superclass or process_chord_input. Confirm.
        for index in range(self.input_length):
            word = self.input[index]
            features = {
                'duration' : self.durations[index],
                'time' : self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.category_count:
                sign = self.grammar.get_sign_for_word_by_tag(word, tag, extra_features=features)
                # Tags for which the grammar gives no sign are dropped
                if sign is not None:
                    probability = self.model.get_prob_cat_given_chord(tag, word)
                    word_signs.append((sign, tag, probability))
            # Order by descending probability. Sort ascending and then
            #  reverse, rather than sorting with reverse=True, so that
            #  equally probable tags keep the order they've always had
            word_signs.sort(key=lambda entry: entry[2])
            word_signs.reverse()
            self._tagged_data.append(word_signs)

            # Work out the sizes of the batches to return these in
            batches = batch_sizes([p for __,__,p in word_signs], self.batch_ratio)
            # Transform these into a form that's easier to use for getting the signs
            so_far = 0
            batch_ranges = []
            for batch in batches:
                batch_ranges.append((so_far, so_far+batch))
                so_far += batch
            self._batch_ranges.append(batch_ranges)

    def get_signs_for_word(self, index, offset=0):
        """
        Return the offset-th batch of (sign, tag, probability) triples
        for the input at the given index, or None if there is no such
        batch.

        If best_only is set, the only batch is the single most probable
        triple, at offset 0.

        """
        if self.best_only:
            # Only ever return one sign
            if offset == 0 and len(self._tagged_data[index]) > 0:
                return [self._tagged_data[index][0]]
            else:
                return None
        ranges = self._batch_ranges[index]
        if offset >= len(ranges):
            # No more batches left
            return None
        start,end = ranges[offset]
        return self._tagged_data[index][start:end]

    def get_word(self, index):
        """
        Return the input item at the given index.

        """
        return self.input[index]
170