1 """Ngram model supertagger, making use of NLTK's probability models.
2
3 This provides the tagger interface routines for an ngram tagger. It is
4 backed by the ngram models defined in L{jazzparser.utils.nltk.ngram},
5 which use NLTK's probability handling classes.
6
7 """
8 """
9 ============================== License ========================================
10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
11
12 This file is part of The Jazz Parser.
13
14 The Jazz Parser is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 The Jazz Parser is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
26
27 ============================ End license ======================================
28
29 """
30 import cPickle as pickle
31 import random
32 from jazzparser.taggers.models import ModelTagger, ModelLoadError, \
33 TaggerModel, TaggingModelError, ModelSaveError
34 from jazzparser.taggers import process_chord_input
35 from jazzparser.taggers.chordmap import get_chord_mapping_module_option, \
36 get_chord_mapping
37 from jazzparser.data import Chord
38 from jazzparser.data.db_mirrors import Chord as DbChord
39
40 from jazzparser.utils.nltk.probability import ESTIMATORS, laplace_estimator, get_estimator_name
41 from jazzparser.utils.options import ModuleOption, choose_from_list, \
42 choose_from_dict
43 from jazzparser.utils.base import group_pairs, load_optional_package, load_from_optional_package
44 from jazzparser.utils.probabilities import batch_sizes, beamed_batch_sizes
45
46 from nltk.probability import FreqDist
56
# Identifier for this kind of tagger model.
MODEL_TYPE = 'ngram'

# Options understood at training time: the n-gram specific ones declared
# here, followed by whatever the base TaggerModel already accepts.
TRAINING_OPTIONS = [
    # Order of the n-gram model.
    ModuleOption(
        'n',
        filter=int,
        help_text="Length of the n-grams which this model will use.",
        usage="n=N, where N is an integer. Defaults to bigrams",
        default=2),
    # How many lower-order models to back off through.
    ModuleOption(
        'backoff',
        filter=int,
        help_text=(
            "Number of orders of backoff to use. This must be "
            "less than n. E.g. if using a trigram model (n=3) you can "
            "set backoff=2 to back off to bigrams and from bigrams "
            "to unigrams. Set to 0 to use no backoff at all (default)."),
        usage="backoff=X, where X is an integer < n",
        default=0),
    # Count threshold for the main model.
    ModuleOption(
        'cutoff',
        filter=int,
        help_text=(
            "In estimating probabilities, treat any counts below "
            "cutoff as zero"),
        usage="cutoff=X, where X is an integer",
        default=0),
    # Optional separate count threshold for the backoff model(s).
    ModuleOption(
        'backoff_cutoff',
        filter=int,
        help_text=(
            "Apply a different cutoff setting to the backoff model. "
            "Default is to use the same as the main model"),
        usage="backoff_cutoff=X, where X is an integer"),
    # Probability estimator, chosen by name from ESTIMATORS.
    ModuleOption(
        'estimator',
        filter=choose_from_dict(ESTIMATORS),
        help_text=(
            "A way of constructing a probability model given "
            "the set of counts from the data. Default is to use "
            "laplace (add-one) smoothing."),
        usage=(
            "estimator=X, where X is one of: %s"
            % ", ".join(ESTIMATORS.keys())),
        default=laplace_estimator),
    # Chord mapping option shared with other taggers.
    get_chord_mapping_module_option(),
] + TaggerModel.TRAINING_OPTIONS
88
89 - def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
108
def train(self, sequences, grammar=None, logger=None):
    """
    Train an n-gram model on annotated chord sequences and store the
    result on this instance.

    Sets C{self.chordmap}, C{self.chordmap_name}, C{self.model} and
    C{self.model_description}. Training parameters are read from
    C{self.options} (C{n}, C{backoff}, C{cutoff}, C{backoff_cutoff},
    C{estimator}, C{chord_mapping}).

    @param sequences: training sequences. Each one is consumed pairwise
        through C{group_pairs(seq, none_final=True)} and must provide a
        C{categories} attribute, which is zipped with those pairs to
        give the label of the first chord of each pair.
    @param grammar: grammar whose C{pos_tags} form the label domain.
        If C{None}, the grammar returned by C{get_grammar()} is used.
    @param logger: not used by this method.

    """
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

    if grammar is None:
        from jazzparser.grammar import get_grammar
        grammar = get_grammar()

    options = self.options
    chordmap = options['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name

    # One list of (observation, label) pairs per training sequence.
    training_data = [
        [(observation_from_chord_pair(c1, c2, chordmap), c1cat)
            for ((c1, c2), c1cat)
            in zip(group_pairs(seq, none_final=True), seq.categories)]
        for seq in sequences]

    label_dom = grammar.pos_tags

    # The emission domain covers every interval/chord-type combination,
    # whether or not it occurs in the training data.
    # NOTE(review): if the chord mapping sends several inputs to the same
    # output, chord_types (and hence emission_dom) contains repeats --
    # confirm that the model is happy with that.
    chord_types = chordmap.values()
    emission_dom = [
        "%d-%s" % (interval, chord)
        for interval in range(12)
        for chord in chord_types]

    ignores = ['']

    # Only override the backoff model's cutoff when one was given.
    backoff_cutoff = options['backoff_cutoff']
    if backoff_cutoff is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff': backoff_cutoff}

    self.model = PrecomputedNgramModel.train(
        options['n'],
        training_data,
        label_dom,
        emission_dom=emission_dom,
        cutoff=options['cutoff'],
        backoff_order=options['backoff'],
        estimator=options['estimator'],
        ignore_list=ignores,
        backoff_kwargs=backoff_kwargs)

    # Human-readable summary of how the model was trained.
    description_template = "\n".join([
        "Model order: %(order)d",
        "Backoff orders: %(backoff)d",
        "Probability estimator: %(est)s",
        "Zero-count threshold: %(cutoff)d",
        "Chord mapping: %(chordmap)s",
        "Training sequences: %(seqs)d",
        "Training samples: %(samples)d",
    ])
    self.model_description = description_template % {
        'est': get_estimator_name(options['estimator']),
        'seqs': len(training_data),
        # Count the samples directly rather than concatenating every
        # sequence into one big list just to measure it.
        'samples': sum(len(seq) for seq in training_data),
        'order': options['n'],
        'backoff': options['backoff'],
        'cutoff': options['cutoff'],
        'chordmap': self.chordmap_name,
    }
175
176 @staticmethod
184
192
194 """
195 Just for a laugh, use the trained n-gram to generate a chord
196 sequence and output it in a playable form.
197 Returns a tuple: (chords, tags)
198
199 @todo: this isn't implemented yet for n-grams. It's not a
200 high priority, but would be fun.
201
202 """
203
204 raise NotImplementedError, "not yet done generation for n-grams"
205
206
207 from jazzparser.utils.chords import int_to_chord_numeral
208
209 rand_seq = self.model.random_sample(random.Random(), length)
210 pitch = 0
211 chords = []
212 prochords,tags = zip(*rand_seq)
213
214 for chord in prochords:
215 interval,__,ctype = chord.partition("-")
216 chords.append("%s%s" % (int_to_chord_numeral(pitch),ctype))
217 pitch = (pitch + int(interval)) % 12
218 return (chords, tags)
219
223
226
229
232 tags = property(_get_tags)
233
234
236 try:
237 text = ""
238
239
240 text += self.model_description
241
242 text += "\nNum emissions: %d\n" % self.model.num_emissions
243 text += "\nShowing only probs for non-zero counts. "\
244 "Others may have a non-zero prob by smoothing\n"
245
246 text += "\nChord mapping: %s:\n" % self.chordmap.name
247 for (crdin, crdout) in self.chordmap.items():
248 text += " %s -> %s\n" % (crdin, crdout)
249
250
251 text += "\nEmission dist:\n"
252 for label in sorted(self.model.label_dom):
253 text += " %s:\n" % label
254 probs = reversed(sorted(
255 [(self.model.emission_dist[label].prob(em),em) for \
256 em in self.model.emission_dist[label].samples()]))
257 for (prob,em) in probs:
258 text += " %s: %s\n" % (em, prob)
259
260 text += "\n\nTransition dist:\n"
261 for history in sorted(self.model.label_dist.conditions()):
262 text += " %s\n" % str(history)
263 dist = [(self.model.label_dist[history].prob(lab),lab)
264 for lab in self.model.label_dist[history].samples()]
265 for prob,label in reversed(sorted(dist)):
266 text += " %s: %s\n" % (str(label), prob)
267
268 return text
269 except AttributeError, err:
270
271
272 raise ValueError, "error generating model description "\
273 "(attribute error): %s" % err
274 readable_parameters = property(_get_readable_params)
275
276
# Names of the decoding methods that can be selected with the 'decode'
# tagger option.
DECODERS = [
    'viterbi',
    'forward-backward',
    'forward',
]
# Model class this tagger is paired with.
MODEL_CLASS = NgramTaggerModel
# The base tagger's options plus the choice of decoding method.
TAGGER_OPTIONS = ModelTagger.TAGGER_OPTIONS + [
    ModuleOption(
        'decode',
        filter=choose_from_list(DECODERS),
        help_text="Decoding method for inference.",
        usage=(
            "decode=X, where X is one of %s"
            % ", ".join("'%s'" % d for d in DECODERS)),
        default="forward-backward"),
]
# Kinds of input this tagger declares.
INPUT_TYPES = [
    'db',
    'chords',
]
289
def __init__(self, grammar, input, options=None, *args, **kwargs):
    """
    Tags using an ngram model backed by NLTK.

    All the tagging work is done here: the input is converted to
    observations, the model is decoded with the method selected by the
    C{decode} option, and for every input position the candidate signs
    are stored in C{self._tagged_data}, most probable first. The slices
    of those candidates that make up successive batches are stored in
    C{self._batch_ranges}.

    @param grammar: grammar used to look up a sign for each
        (word, tag) pair.
    @param input: the input to tag, passed on to the base class.
    @param options: tagger options, passed on to the base class. If
        omitted, a fresh empty dict is used.

    """
    # Never share a single default dict between instances.
    if options is None:
        options = {}
    super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    self._tagged_data = []
    self._batch_ranges = []

    # Each observation is built from a chord and its successor.
    inpairs = group_pairs(self.input, none_final=True)
    observations = [
        observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
        for pair in inpairs]

    # Per-position tag probabilities from the chosen decoder; anything
    # other than "viterbi" or "forward" means forward-backward.
    decode = self.options['decode']
    if decode == "viterbi":
        probabilities = self.model.viterbi_probabilities(observations)
    elif decode == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)

    word_tag_probs = []
    for index, probs in enumerate(probabilities):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word = self.input[index]

        # Keep only the tags for which the grammar yields a sign.
        word_signs = []
        for tag in self.model.tags:
            sign = self.grammar.get_sign_for_word_by_tag(
                word, tag, extra_features=features)
            if sign is not None:
                word_signs.append((sign, tag, probs[tag]))

        # Shuffle before the (stable) sort so that signs with equal
        # probability end up in a random relative order.
        random.shuffle(word_signs)
        word_signs.sort(key=lambda entry: entry[2])
        word_signs.reverse()
        self._tagged_data.append(word_signs)

        word_tag_probs.append([entry[2] for entry in word_signs])

    if self.options['best']:
        # Only ever offer the single top candidate for each position.
        self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        sizes_per_word = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
        # Turn each list of batch sizes into (start, end) slice bounds
        # using a running total.
        all_ranges = []
        for sizes in sizes_per_word:
            ranges = []
            start = 0
            for size in sizes:
                ranges.append((start, start + size))
                start += size
            all_ranges.append(ranges)
        self._batch_ranges = all_ranges
357
359 all_signs = []
360 for start_node in range(len(self.input)):
361
362 ranges = self._batch_ranges[start_node]
363 if offset >= len(ranges):
364
365 continue
366 start,end = ranges[offset]
367 signs = self._tagged_data[start_node][start:end]
368
369 for sign in signs:
370 all_signs.append((start_node, start_node+1, sign))
371 return all_signs
372
374 return self.input[index]
375