jazzparser.taggers.ngram.tagger

1 """Ngram model supertagger, making use of NLTK's probability models. 2 3 This provides the tagger interface routines for an ngram tagger. It is 4 backed by the ngram models defined in L{jazzparser.utils.nltk.ngram}, 5 which use NLTK's probability handling classes. 6 7 """ 8 """ 9 ============================== License ======================================== 10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 11 12 This file is part of The Jazz Parser. 13 14 The Jazz Parser is free software: you can redistribute it and/or modify 15 it under the terms of the GNU General Public License as published by 16 the Free Software Foundation, either version 3 of the License, or 17 (at your option) any later version. 18 19 The Jazz Parser is distributed in the hope that it will be useful, 20 but WITHOUT ANY WARRANTY; without even the implied warranty of 21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 GNU General Public License for more details. 23 24 You should have received a copy of the GNU General Public License 25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 
26 27 ============================ End license ====================================== 28 29 """ 30 import cPickle as pickle 31 import random 32 from jazzparser.taggers.models import ModelTagger, ModelLoadError, \ 33 TaggerModel, TaggingModelError, ModelSaveError 34 from jazzparser.taggers import process_chord_input 35 from jazzparser.taggers.chordmap import get_chord_mapping_module_option, \ 36 get_chord_mapping 37 from jazzparser.data import Chord 38 from jazzparser.data.db_mirrors import Chord as DbChord 39 40 from jazzparser.utils.nltk.probability import ESTIMATORS, laplace_estimator, get_estimator_name 41 from jazzparser.utils.options import ModuleOption, choose_from_list, \ 42 choose_from_dict 43 from jazzparser.utils.base import group_pairs, load_optional_package, load_from_optional_package 44 from jazzparser.utils.probabilities import batch_sizes, beamed_batch_sizes 45 46 from nltk.probability import FreqDist

def observation_from_chord_pair(crd1, crd2, chordmap):
    """
    Build the observation string used by the n-gram model for one chord.

    The observation has the form C{"<interval>-<chord type>"}. The interval
    is the value of C{Chord.interval} between C{crd1} and C{crd2} (both
    re-read from their string forms), or 0 when C{crd2} is C{None}. The
    chord type is C{crd1}'s type looked up in C{chordmap}.

    @param crd1: the chord being observed. If it is neither a L{Chord} nor
        a db-mirror chord, it is converted with C{Chord.from_name}.
    @param crd2: the chord that follows C{crd1}, or C{None} if there is none.
    @param chordmap: mapping indexed by chord type, giving the chord type
        label to use in the observation.
    @return: the observation string.

    """
    interval = 0
    if crd2 is not None:
        # The interval is always computed from the chords' string forms
        first = Chord.from_name(str(crd1))
        second = Chord.from_name(str(crd2))
        interval = Chord.interval(first, second)

    if isinstance(crd1, (Chord, DbChord)):
        chord = crd1
    else:
        # Anything else is treated as a chord name
        chord = Chord.from_name(crd1)
    return "%d-%s" % (interval, chordmap[chord.type])

56

class NgramTaggerModel(TaggerModel):
    """
    Tagger model that wraps a C{PrecomputedNgramModel} (from
    L{jazzparser.utils.nltk.ngram}) together with the chord mapping that
    was used to produce its observations.

    """
    MODEL_TYPE = 'ngram'
    # Set up possible options for training
    TRAINING_OPTIONS = [
        ModuleOption('n', filter=int,
            help_text="Length of the n-grams which this model will use.",
            usage="n=N, where N is an integer. Defaults to bigrams", default=2),
        ModuleOption('backoff', filter=int,
            help_text="Number of orders of backoff to use. This must be "
                "less than n. E.g. if using a trigram model (n=3) you can "
                "set backoff=2 to back off to bigrams and from bigrams "
                "to unigrams. Set to 0 to use no backoff at all (default).",
            usage="backoff=X, where X is an integer < n", default=0),
        ModuleOption('cutoff', filter=int,
            help_text="In estimating probabilities, treat any counts below "
                "cutoff as zero",
            usage="cutoff=X, where X is an integer", default=0),
        ModuleOption('backoff_cutoff', filter=int,
            help_text="Apply a different cutoff setting to the backoff model. "
                "Default is to use the same as the main model",
            usage="backoff_cutoff=X, where X is an integer"),
        ModuleOption('estimator', filter=choose_from_dict(ESTIMATORS),
            help_text="A way of constructing a probability model given "
                "the set of counts from the data. Default is to use "
                "laplace (add-one) smoothing.",
            usage="estimator=X, where X is one of: %s" %
                ", ".join(ESTIMATORS.keys()), default=laplace_estimator),
        # Add the standard chord mapping option ("chord_mapping")
        get_chord_mapping_module_option(),
    ] + TaggerModel.TRAINING_OPTIONS

    def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
        """
        An n-gram model to be used as a tagging model. Uses NLTK to
        represent, train and evaluate the n-gram model.

        @param model_name: name of the model, passed on to the superclass.
        @param model: the underlying n-gram model, or C{None} if the model
            has not been trained yet.
        @param chordmap: name of the chord mapping. It is resolved with
            C{get_chord_mapping} and also stored as C{chordmap_name}.
        @raise TaggingModelError: if the C{backoff} option is not smaller
            than the C{n} option.

        """
        super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
        self.model = model

        self.chordmap = get_chord_mapping(chordmap)
        self.chordmap_name = chordmap

        if self.options['n'] <= self.options['backoff']:
            # This is not allowed
            # We can only back off n-1 orders for an n-gram model
            raise TaggingModelError("tried to load an n-gram model with "
                "more orders of backoff than are possible (backing off "
                "%d orders on a %d-gram model)" %
                (self.options['backoff'], self.options['n']))

    def train(self, sequences, grammar=None, logger=None):
        """
        Train the underlying n-gram model and store it in C{self.model}.

        Also sets C{self.chordmap} and C{self.chordmap_name} from the
        C{chord_mapping} option and writes a summary of the training setup
        to C{self.model_description}.

        @param sequences: training sequences. Each is grouped into chord
            pairs with C{group_pairs} and must have a C{categories}
            attribute giving the tag for each chord.
        @param grammar: grammar supplying the set of tags (C{pos_tags}).
            The default grammar is loaded if this is C{None}.
        @param logger: not used by this method.

        """
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [
            [(observation_from_chord_pair(c1, c2, chordmap), c1cat)
                for ((c1, c2), c1cat) in
                    zip(group_pairs(seq, none_final=True), seq.categories)]
            for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        # theoretically could occur, not just those that are seen -
        # we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = ["%d-%s" % (interval, chord)
                            for interval in range(12)
                            for chord in chord_types]

        # Ignore unlabelled data
        ignores = ['']

        # Only override the backoff model's cutoff if one was given
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        # so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                # Total number of (observation, tag) pairs in all sequences
                'samples' : sum(len(seq) for seq in training_data),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }

    @staticmethod
    def _load_model(data):
        """
        Rebuild a model from the dictionary produced by L{_get_model_data}.

        @param data: dictionary with keys C{'model'} and C{'name'} and,
            optionally, C{'chordmap'}.
        @return: a new L{NgramTaggerModel}.

        """
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

        model = PrecomputedNgramModel.from_picklable_dict(data['model'])
        name = data['name']
        chordmap = data.get("chordmap", None)
        return NgramTaggerModel(name, model=model, chordmap=chordmap)

    def _get_model_data(self):
        """
        Return a dictionary holding the model's name, the picklable form
        of the n-gram model and the name of the chord mapping.

        """
        data = {
            'name' : self.model_name,
            'model' : self.model.to_picklable_dict(),
            'chordmap' : self.chordmap_name,
        }
        return data

    def generate_chord_sequence(self, length=20):
        """
        Just for a laugh, use the trained n-gram to generate a chord
        sequence and output it in a playable form.
        Should return a tuple: (chords, tags)

        @todo: this isn't implemented yet for n-grams. It's not a
        high priority, but would be fun.

        @raise NotImplementedError: always.

        """
        # The earlier tagger did this by drawing a random sample of
        # (observation, tag) pairs from the model, then turning each
        # "<interval>-<type>" observation into a chord numeral, starting at
        # pitch 0 and adding each interval (mod 12) to get the next root.
        raise NotImplementedError("not yet done generation for n-grams")

    def forward_probabilities(self, sequence):
        """ Interface to the NgramModel's forward_probabilities """
        return self.model.forward_probabilities(sequence)

    def forward_backward_probabilities(self, sequence):
        """
        Interface to the NgramModel's gamma_probabilities, requested in
        dictionary form.

        """
        return self.model.gamma_probabilities(sequence, dictionary=True)

    def viterbi_probabilities(self, sequence):
        """ Interface to the NgramModel's viterbi_selector_probabilities """
        return self.model.viterbi_selector_probabilities(sequence)

    def _get_tags(self):
        """ The label domain of the underlying n-gram model. """
        return self.model.label_dom
    tags = property(_get_tags)

    #### Readable output of the parameters ####
    def _get_readable_params(self):
        """
        Build a human-readable dump of the model: the stored description,
        the chord mapping and the emission and transition distributions.

        @raise ValueError: if an C{AttributeError} occurs while building
            the text.

        """
        try:
            model = self.model
            # Start with the stored model description
            parts = [self.model_description]

            parts.append("\nNum emissions: %d\n" % model.num_emissions)
            parts.append("\nShowing only probs for non-zero counts. "
                "Others may have a non-zero prob by smoothing\n")

            parts.append("\nChord mapping: %s:\n" % self.chordmap.name)
            for (crdin, crdout) in self.chordmap.items():
                parts.append(" %s -> %s\n" % (crdin, crdout))

            # Emission distribution, most probable emission first
            parts.append("\nEmission dist:\n")
            for label in sorted(model.label_dom):
                parts.append(" %s:\n" % label)
                emissions = model.emission_dist[label]
                probs = sorted(
                    [(emissions.prob(em), em) for em in emissions.samples()],
                    reverse=True)
                for (prob, em) in probs:
                    parts.append(" %s: %s\n" % (em, prob))

            # Transition distribution, most probable label first
            parts.append("\n\nTransition dist:\n")
            for history in sorted(model.label_dist.conditions()):
                parts.append(" %s\n" % str(history))
                transitions = model.label_dist[history]
                dist = sorted(
                    [(transitions.prob(lab), lab)
                        for lab in transitions.samples()],
                    reverse=True)
                for prob, label in dist:
                    parts.append(" %s: %s\n" % (str(label), prob))

            return "".join(parts)
        except AttributeError as err:
            # Catch this, because otherwise it just looks like the attribute
            # (readable_parameters) doesn't exist (stupid Python behaviour)
            raise ValueError("error generating model description "
                "(attribute error): %s" % err)
    readable_parameters = property(_get_readable_params)

# Names of the decoding methods that NgramTagger accepts for its 'decode'
# option
DECODERS = ['viterbi', 'forward-backward', 'forward']

class NgramTagger(ModelTagger):
    """
    Tagger that assigns signs to each input chord using the tag
    probabilities computed by an L{NgramTaggerModel}.

    """
    MODEL_CLASS = NgramTaggerModel
    TAGGER_OPTIONS = ModelTagger.TAGGER_OPTIONS + [
        ModuleOption('decode', filter=choose_from_list(DECODERS),
            help_text="Decoding method for inference.",
            usage="decode=X, where X is one of %s" %
                ", ".join("'%s'" % d for d in DECODERS),
            default="forward-backward"),
    ]
    INPUT_TYPES = ['db', 'chords']

    def __init__(self, grammar, input, options=None, *args, **kwargs):
        """
        Tags using an ngram model backed by NLTK.

        All of the tagging is done here: the signs for every word are
        computed, sorted by decreasing probability and divided into
        batches, ready to be read off by L{get_signs}.

        @param grammar: grammar used to build a sign for each word and tag.
        @param input: the input chord sequence.
        @param options: dictionary of tagger options. An empty dictionary
            is used if this is C{None}.

        """
        if options is None:
            # Use a fresh dictionary per instance, not a shared default
            options = {}
        super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        self._tagged_data = []
        self._batch_ranges = []
        # Group the input into pairs to get observations
        inpairs = group_pairs(self.input, none_final=True)
        # Convert the pairs into observations
        observations = [
            observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
            for pair in inpairs]

        # Use the ngram model to get tag probabilities for each input,
        # using the decoding method chosen in the options
        decoder = self.options['decode']
        if decoder == "viterbi":
            probabilities = self.model.viterbi_probabilities(observations)
        elif decoder == "forward":
            probabilities = self.model.forward_probabilities(observations)
        else:
            probabilities = self.model.forward_backward_probabilities(observations)

        word_tag_probs = []

        for index, probs in enumerate(probabilities):
            features = {
                'duration' : self.durations[index],
                'time' : self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.tags:
                # Read a full sign out of the grammar
                sign = self.grammar.get_sign_for_word_by_tag(
                            self.input[index], tag, extra_features=features)
                if sign is not None:
                    # Read off the probability from the matrix
                    word_signs.append((sign, tag, probs[tag]))

            # Randomly sort the list first to make sure equal probabilities
            # are randomly ordered
            random.shuffle(word_signs)
            # Now sort by probability, highest first
            word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
            self._tagged_data.append(word_signs)

            # Store the list of probabilities for tags, which we'll use
            # after we've tagged every word to work out the sizes
            # of the tag batches
            word_tag_probs.append([p for __, __, p in word_signs])

        if self.options['best']:
            # Only return one for each word
            self._batch_ranges = [[(0, 1)] for __ in range(len(self.input))]
        else:
            # Work out the number of tags to return in each batch
            sizes_per_word = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
            # Every possible tag has been given a probability and all of the
            # batches are kept, including the last, least probable one.
            # Transform the sizes into a form that's easier to use for
            # getting the signs
            self._batch_ranges = [self._ranges_from_sizes(sizes)
                                        for sizes in sizes_per_word]

    @staticmethod
    def _ranges_from_sizes(sizes):
        """
        Turn a list of consecutive batch sizes into (start, end) index pairs.

        E.g. C{[2, 1, 3]} gives C{[(0, 2), (2, 3), (3, 6)]}.

        """
        ranges = []
        start = 0
        for size in sizes:
            end = start + size
            ranges.append((start, end))
            start = end
        return ranges

    def get_signs(self, offset=0):
        """
        Return the signs in batch number C{offset} for every word.

        @param offset: index of the batch to return for each word. Words
            with no batch at this index contribute nothing.
        @return: list of C{(start_node, end_node, sign_entry)} tuples, where
            C{end_node} is C{start_node + 1} and C{sign_entry} is one of the
            C{(sign, tag, probability)} tuples stored for that word.

        """
        all_signs = []
        for start_node in range(len(self.input)):
            # Get the indices of the signs to return in this offset batch
            ranges = self._batch_ranges[start_node]
            if offset >= len(ranges):
                # No more batches left for this word
                continue
            start, end = ranges[offset]
            # Add each sign to the output list along with its node values
            for sign in self._tagged_data[start_node][start:end]:
                all_signs.append((start_node, start_node + 1, sign))
        return all_signs

    def get_word(self, index):
        """ Return the input item at position C{index}. """
        return self.input[index]

375

Source Code for Module jazzparser.taggers.ngram.tagger