1 """Special bindings to the music_halfspan formalism required by the PCFG parser.
2
3 """
4 """
5 ============================== License ========================================
6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
7
8 This file is part of The Jazz Parser.
9
10 The Jazz Parser is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 The Jazz Parser is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
22
23 ============================ End license ======================================
24
25 """
26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
27
28 import copy
29 from StringIO import StringIO
30 from nltk.probability import ConditionalProbDist
31
32 from jazzparser import settings
33 from jazzparser.data import Chord
34 from jazzparser.data.input import DbInput, ChordInput
35 from jazzparser.grammar import get_grammar
36 from jazzparser.data.input import DbInput
37 from jazzparser.parsers.pcfg.model import PcfgModel, ModelError, \
38 ModelTrainingError
39 from jazzparser.utils.nltk.probability import CutoffConditionalFreqDist, \
40 CutoffFreqDist, mle_estimator, ESTIMATORS, \
41 laplace_estimator, get_estimator_name, \
42 generate_from_prob_dist
43 from jazzparser.utils.nltk.storage import object_to_dict, dict_to_object
44 from jazzparser.utils.options import ModuleOption, choose_from_dict
45 from jazzparser.utils.strings import str_to_bool
46 from jazzparser.utils.loggers import create_dummy_logger
47 from jazzparser.taggers.chordmap import get_chord_mapping_module_option, \
48 get_chord_mapping
49 from jazzparser.taggers.pretagged import PretaggedTagger
50 from jazzparser.parsers.cky import DirectedCkyParser
51 from jazzparser.data.trees import build_tree_for_sequence, TreeBuildError
52 from jazzparser.formalisms.music_halfspan.syntax import syntax_from_string
55 """
56 A simple implementation of the PcfgModel interface. The model just
57 uses counts to compute the probabilities, with only very simple
58 smoothing.
59
60 By default, unary expansions are fobidden, since our grammar doesn't
61 use them. If you want to allow them, set C{unary_expansions=True}.
62
63 """
64 MODEL_TYPE = "halfspan"
65 TRAINING_OPTIONS = PcfgModel.TRAINING_OPTIONS + [
66 ModuleOption('cutoff', filter=int,
67 help_text="In estimating probabilities, treat any counts below "\
68 "cutoff as zero",
69 usage="cutoff=X, where X is an integer",
70 default=0),
71 ModuleOption('cat_bins', filter=int,
72 help_text="Number of possible categories used in estimating "\
73 "probabilities. When using smoothing, this will determine "\
74 "mass reserved for unseen categories. Has no effect if using "\
75 "mle. Overrides the value given in the grammar definition, "\
76 "which will be used by default.",
77 usage="cat_bins=X, where X is an integer",
78 default=None),
79 ModuleOption('estimator', filter=choose_from_dict(ESTIMATORS),
80 help_text="A way of constructing a probability model given "\
81 "the set of counts from the data. Default is to use "\
82 "laplace (add-one) smoothing.",
83 usage="estimator=X, where X is one of %s" % ", ".join(ESTIMATORS.keys()),
84 default=laplace_estimator),
85 ModuleOption('lexical', filter=str_to_bool,
86 help_text="Whether the model generates actual lexical entries, "\
87 "or just lexical categories. By default, models are lexical, "\
88 "but when using a model in combination with a tagger, it's "\
89 "sometimes desirable to let the PCCG model only the category "\
90 "generation.",
91 usage="lexical=B, where B is a boolean",
92 default=True),
93
94 get_chord_mapping_module_option(),
95 ]
96
97 LEX_INPUT_TYPES = [
98 DbInput, ChordInput
99 ]
100
101 - def __init__(self, name, cutoff=0, cat_bins=None,
102 estimator=laplace_estimator, lexical=True, chordmap=None,
103 parent_counts=None, expansion_type_counts=None,
104 head_expansion_counts=None, non_head_expansion_counts=None,
105 lexical_counts=None, **kwargs):
106 if cat_bins is None:
107 raise ValueError, "cat_bins must be specified"
108 if chordmap is None:
109 raise ValueError, "chordmap must be specified"
110
111 self.cutoff = cutoff
112 self.cat_bins = cat_bins
113 self._estimator = estimator
114 self.lexical = lexical
115 self.chordmap = chordmap
116 self.word_bins = 12*len(set(self.chordmap.values()))
117
118 super(HalfspanPcfgModel, self).__init__(name, **kwargs)
119
120
121 if parent_counts is None:
122 parent_counts = CutoffFreqDist(cutoff)
123 self._parent_counts = parent_counts
124 self._parent_dist = estimator(parent_counts, cat_bins)
125 if expansion_type_counts is None:
126 expansion_type_counts = CutoffConditionalFreqDist(cutoff)
127 self._expansion_type_counts = expansion_type_counts
128
129
130 self._expansion_type_dist = ConditionalProbDist(expansion_type_counts,
131 estimator, 2)
132 if head_expansion_counts is None:
133 head_expansion_counts = CutoffConditionalFreqDist(cutoff)
134 self._head_expansion_counts = head_expansion_counts
135 self._head_expansion_dist = ConditionalProbDist(head_expansion_counts,
136 estimator, cat_bins)
137 if non_head_expansion_counts is None:
138 non_head_expansion_counts = CutoffConditionalFreqDist(cutoff)
139 self._non_head_expansion_counts = non_head_expansion_counts
140 self._non_head_expansion_dist = ConditionalProbDist(
141 non_head_expansion_counts, estimator, cat_bins)
142 if lexical_counts is None:
143 lexical_counts = CutoffConditionalFreqDist(cutoff)
144 self._lexical_counts = lexical_counts
145 self._lexical_dist = ConditionalProbDist(lexical_counts,
146 estimator, self.word_bins)
147
149 """
150 Returns the string observation counted for a given chord.
151 Note that the chord should already have been made relative to its parent.
152
153 """
154 return "%s%s" % (chord.root_numeral, self.chordmap[chord.type])
155
157 data = {
158 'parents' : object_to_dict(self._parent_counts),
159 'expansions' : object_to_dict(self._expansion_type_counts),
160 'heads' : object_to_dict(self._head_expansion_counts),
161 'non_heads' : object_to_dict(self._non_head_expansion_counts),
162 'words' : object_to_dict(self._lexical_counts),
163 'cutoff' : self.cutoff,
164 'cat_bins' : self.cat_bins,
165 'estimator': self._estimator,
166 'grammar' : self.grammar,
167 'lexical' : self.lexical,
168 'chordmap' : self.chordmap.name,
169 }
170 return data
171
172 @staticmethod
174 obj = HalfspanPcfgModel(
175 name = name,
176 cutoff = data['cutoff'],
177 cat_bins = data['cat_bins'],
178 estimator = data['estimator'],
179 lexical = data.get('lexical', True),
180 chordmap = get_chord_mapping(data.get('chordmap', None)),
181 parent_counts = dict_to_object(data['parents']),
182 expansion_type_counts = dict_to_object(data['expansions']),
183 head_expansion_counts = dict_to_object(data['heads']),
184 non_head_expansion_counts = dict_to_object(data['non_heads']),
185 lexical_counts = dict_to_object(data['words']),
186 grammar = data['grammar'],
187 )
188 return obj
189
191 """
192 Probability of a (non-leaf) subtree, computed from the probability
193 of its expansions. This doesn't include the probabilities
194 of the subtrees of the daughters. To get the full inside probability,
195 multiply the returned value with the daughters' insider probabilities.
196
197 """
198 parent_rep = model_category_repr(parent.category)
199
200 exp_prob = self._expansion_type_dist[parent_rep].prob(expansion)
201
202 if expansion == 'leaf':
203
204
205 if not self.lexical:
206 word_prob = 1.0
207 else:
208
209 word = left
210
211 chord = Chord.from_name(word)
212 chord_obs = self.chord_observation(
213 category_relative_chord(chord,
214 category=parent.category))
215 word_prob = self._lexical_dist[parent_rep].prob(chord_obs)
216 return exp_prob * word_prob
217 else:
218
219 assert right is not None, "pcfg model only supports binary branches"
220 head = right
221 non_head = left
222
223 condition = (expansion, parent_rep)
224 head_rep = model_category_repr(head.category, parent.category)
225 head_prob = self._head_expansion_dist[condition].prob(head_rep)
226
227
228 condition = (head_rep, expansion, parent_rep)
229 non_head_rep = model_category_repr(non_head.category, parent.category)
230 non_head_prob = \
231 self._non_head_expansion_dist[condition].prob(non_head_rep)
232 return exp_prob * head_prob * non_head_prob
233
235 """
236 Outer probability of a subtree. This is approximated in these models
237 as the prior probability of the parent of the tree.
238
239 Prior probability P(parent) is used to approximate the outside
240 probability.
241
242 """
243 cat = model_category_repr(parent.category)
244 return self._parent_dist.prob(cat)
245
247 buff = StringIO()
248 def _fdist_str(fd):
249 return "FDist<%d>: %s" % (fd.N(), ", ".join("%s:%d" % pr for pr in fd.items()))
250 def _cfd_str(cfd):
251 fds = [(cond,cfd[cond]) for cond in cfd.conditions()]
252
253 fds = reversed(sorted(fds, key=lambda (c,fd): fd.N()))
254 return "\n".join("%s: %s" % (cond, _fdist_str(fd)) for (cond,fd) in fds)
255
256 print >>buff, "Parent distribution:"
257 print >>buff, _fdist_str(self._parent_counts)
258 print >>buff
259 print >>buff, "Expansion type distribution:"
260 print >>buff, _cfd_str(self._expansion_type_counts)
261 print >>buff
262 print >>buff, "Head expansion distribution:"
263 print >>buff, _cfd_str(self._head_expansion_counts)
264 print >>buff
265 print >>buff, "Non-head expansion distribution:"
266 print >>buff, _cfd_str(self._non_head_expansion_counts)
267 print >>buff
268 print >>buff, "Lexical expansion distribution:"
269 print >>buff, _cfd_str(self._lexical_counts)
270 print >>buff
271 print >>buff, "Possible words: %d" % self.word_bins
272 print >>buff, "Possible categories: %d" % self.cat_bins
273 print >>buff
274 print >>buff, "Estimator: %s" % get_estimator_name(self._estimator)
275 print >>buff, "Frequency cutoff: %d" % self.cutoff
276 return buff.getvalue()
277
278 @staticmethod
279 - def train(name, training_data, options, grammar=None, logger=None):
314
316 """
317 Adds counts to the model for a single chord sequence.
318
319 """
320
321 input = DbInput.from_sequence(sequence)
322 categories = [chord.category for chord in sequence.iterator()]
323 str_inputs = input.inputs
324
325 try:
326 tree = build_tree_for_sequence(sequence)
327 except TreeBuildError, err:
328 raise ModelTrainingError, "could not build a tree for '%s': %s" % \
329 (sequence.string_name, err)
330
331 def _add_counts(trace):
332 """ Add counts to the model from a derivation trace """
333 parent = trace.result
334
335 parent_rep = model_category_repr(parent.category)
336 self._parent_counts.inc(parent_rep)
337
338 if len(trace.rules) == 0:
339
340
341 self._expansion_type_counts[parent_rep].inc('leaf')
342
343 chord = Chord.from_name(trace.word)
344 chord = category_relative_chord(chord, parent.category)
345 observation = self.chord_observation(chord)
346
347
348 self._lexical_counts[parent_rep].inc(observation)
349 else:
350
351
352 for rule,args in trace.rules:
353 if rule.arity == 1:
354
355 raise ModelTrainingError, "we don't currently support "\
356 "unary rule application, but one was found in "\
357 "the training data"
358 if rule.arity == 2:
359
360
361 expansion = 'right'
362 self._expansion_type_counts[parent_rep].inc(expansion)
363
364 head_rep = model_category_repr(args[1].result.category,
365 parent.category)
366 self._head_expansion_counts[
367 (expansion,parent_rep)].inc(head_rep)
368
369
370 non_head_rep = model_category_repr(
371 args[0].result.category, parent.category)
372 self._non_head_expansion_counts[
373 (head_rep,expansion,parent_rep)
374 ].inc(non_head_rep)
375
376 for arg in args:
377 _add_counts(arg)
378
379
380
381 end = 0
382 successes = 0
383 num_trees = 0
384 for sub_tree in tree.children:
385
386 length = sub_tree.span_length
387 start = end
388 end += length
389
390
391 if not hasattr(sub_tree, 'chord'):
392 num_trees += 1
393
394
395 tags = []
396 for word,tag in zip(str_inputs[start:end],categories[start:end]):
397 if tag == "":
398 word_signs = []
399 elif tag not in self.grammar.families:
400 raise ModelTrainingError, "could not get a sign from "\
401 "the grammar for tag '%s' (chord '%s')" % \
402 (tag, word)
403 else:
404
405 word_signs = self.grammar.get_signs_for_word(word, tags=[tag])
406 tags.append(word_signs)
407
408 tagger = PretaggedTagger(self.grammar, input.slice(start,end), tags=tags)
409
410 parser = DirectedCkyParser(self.grammar, tagger, derivation_tree=sub_tree)
411 try:
412 parser.parse(derivations=True)
413 except DirectedParseError, err:
414
415 logger.error("Parsing using the derivation tree failed: "\
416 "%s" % err)
417 continue
418
419
420 parses = parser.chart.parses
421 if len(parses) > 1:
422 raise ModelTrainingError, "the annotated tree gave multiple "\
423 "parse results: %s" % ", ".join(["%s" % p for p in parses])
424 parse = parses[0]
425
426
427
428 _add_counts(parse.derivation_trace)
429 successes += 1
430
431 - def generate(self, logger=None, max_depth=None):
432 """
433 Generate a chord sequence from the model.
434
435 """
436 if logger is None:
437 logger = create_dummy_logger()
438
439 def _generate(parent, depth=0, pitch=0):
440
441
442
443
444 parent_rep = model_category_repr(parent)
445 parent_pitch = (pitch + base_pitch(parent)) % 12
446 logger.debug("%sGenerating from parent: %s" % (" "*depth,parent_rep))
447
448 if max_depth is not None and depth >= max_depth and \
449 len(self._lexical_dist[parent_rep].samples()) != 0:
450
451
452 exp = 'leaf'
453 logger.debug("%sForcing leaf" % (" "*depth))
454 else:
455
456 exp = generate_from_prob_dist(self._expansion_type_dist[parent_rep])
457 logger.debug("%sExpansion: %s" % (" "*depth, exp))
458 exp_parent = (exp,parent_rep)
459
460 if exp == 'leaf':
461
462 word = generate_from_prob_dist(self._lexical_dist[parent_rep])
463 logger.debug("%sWord: %s, pitch: %d" % (" "*depth, word, parent_pitch))
464 chord = Chord.from_name(word)
465 chord.root = (chord.root + parent_pitch) % 12
466 return [chord]
467 else:
468
469 head = generate_from_prob_dist(self._head_expansion_dist[exp_parent])
470 logger.debug("%sHead: %s" % (" "*depth, head))
471
472 head_generated = _generate(head, depth=depth+1, \
473 pitch=parent_pitch)
474
475 head_exp_parent = (head,exp,parent_rep)
476
477 non_head = generate_from_prob_dist(
478 self._non_head_expansion_dist[head_exp_parent])
479 logger.debug("%sNon-head: %s" % (" "*depth, non_head))
480
481 non_head_generated = _generate(non_head, depth=depth+1, \
482 pitch=parent_pitch)
483
484 return non_head_generated + head_generated
485
486
487
488 root = syntax_from_string("I^T-I^T")
489 logger.debug("Root: %s" % root)
490 return _generate(root)
491
494 """
495 Given a syntactic category, generates a representation that will
496 be used by the model. The result should be a string and must
497 uniquely identify all of the features of the category that the
498 model should distinguish.
499
500 If base_category is given, it may be used as a base to compare the
501 category to. In this case, observations of generated categories
502 always take their parent as a base category and syntactic pitches
503 are all relative to it.
504
505 If base_category is not given, the category itself will be
506 considered the base category.
507
508 """
509 from .syntax import make_absolute_category_from_relative
510
511 category = category.copy()
512 if base_category is None:
513 base_category = category
514
515 root = base_pitch(base_category)
516 if root is None:
517
518
519 root = base_pitch(category)
520
521
522 if root is not None:
523
524
525
526 root_pitch = (12 - root) % 12
527
528 make_absolute_category_from_relative(category, root_pitch)
529
530 return category
531
533 """
534 Arbitrarily picks a root out of the category, so that we can take
535 the rest of the category and other categories relative to this
536 consistently picked root.
537
538 """
539 from .syntax import AtomicCategory, ComplexCategory
540 if isinstance(cat, AtomicCategory):
541
542 return cat.from_half.root
543 elif isinstance(cat, ComplexCategory):
544
545 return cat.result.root
546 else:
547 raise TypeError, "category given to base_pitch was neither "\
548 "complex nor atomic. Type was %s" % (type(cat).__name__)
549
551 """
552 Returns a version of the Chord object that's relative to the base
553 pitch of the category. If no category is given, the chord returned
554 will be rooted at I (i.e. relative to itself).
555
556 """
557 if category is not None:
558 base = base_pitch(category)
559 if base is None:
560 base = chord.root
561 else:
562 base = chord.root
563 chord_rel = copy.deepcopy(chord)
564 chord_rel.root -= base
565 return chord_rel
566