jazzparser.formalisms.music

1 """Special bindings to the music_halfspan formalism required by the PCFG parser. 2 3 """ 4 """ 5 ============================== License ======================================== 6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 7 8 This file is part of The Jazz Parser. 9 10 The Jazz Parser is free software: you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation, either version 3 of the License, or 13 (at your option) any later version. 14 15 The Jazz Parser is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 22 23 ============================ End license ====================================== 24 25 """ 26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 27 28 import copy 29 from StringIO import StringIO 30 from nltk.probability import ConditionalProbDist 31 32 from jazzparser import settings 33 from jazzparser.data import Chord 34 from jazzparser.data.input import DbInput, ChordInput 35 from jazzparser.grammar import get_grammar 36 from jazzparser.data.input import DbInput 37 from jazzparser.parsers.pcfg.model import PcfgModel, ModelError, \ 38 ModelTrainingError 39 from jazzparser.utils.nltk.probability import CutoffConditionalFreqDist, \ 40 CutoffFreqDist, mle_estimator, ESTIMATORS, \ 41 laplace_estimator, get_estimator_name, \ 42 generate_from_prob_dist 43 from jazzparser.utils.nltk.storage import object_to_dict, dict_to_object 44 from jazzparser.utils.options import ModuleOption, choose_from_dict 45 from jazzparser.utils.strings import str_to_bool 46 from jazzparser.utils.loggers import create_dummy_logger 47 from jazzparser.taggers.chordmap import get_chord_mapping_module_option, \ 48 get_chord_mapping 49 from jazzparser.taggers.pretagged import PretaggedTagger 50 from jazzparser.parsers.cky import DirectedCkyParser 51 from jazzparser.data.trees import build_tree_for_sequence, TreeBuildError 52 from jazzparser.formalisms.music_halfspan.syntax import syntax_from_string

53 54 -class HalfspanPcfgModel(PcfgModel):

55 """ 56 A simple implementation of the PcfgModel interface. The model just 57 uses counts to compute the probabilities, with only very simple 58 smoothing. 59 60 By default, unary expansions are fobidden, since our grammar doesn't 61 use them. If you want to allow them, set C{unary_expansions=True}. 62 63 """ 64 MODEL_TYPE = "halfspan" 65 TRAINING_OPTIONS = PcfgModel.TRAINING_OPTIONS + [ 66 ModuleOption('cutoff', filter=int, 67 help_text="In estimating probabilities, treat any counts below "\ 68 "cutoff as zero", 69 usage="cutoff=X, where X is an integer", 70 default=0), 71 ModuleOption('cat_bins', filter=int, 72 help_text="Number of possible categories used in estimating "\ 73 "probabilities. When using smoothing, this will determine "\ 74 "mass reserved for unseen categories. Has no effect if using "\ 75 "mle. Overrides the value given in the grammar definition, "\ 76 "which will be used by default.", 77 usage="cat_bins=X, where X is an integer", 78 default=None), 79 ModuleOption('estimator', filter=choose_from_dict(ESTIMATORS), 80 help_text="A way of constructing a probability model given "\ 81 "the set of counts from the data. Default is to use "\ 82 "laplace (add-one) smoothing.", 83 usage="estimator=X, where X is one of %s" % ", ".join(ESTIMATORS.keys()), 84 default=laplace_estimator), 85 ModuleOption('lexical', filter=str_to_bool, 86 help_text="Whether the model generates actual lexical entries, "\ 87 "or just lexical categories. By default, models are lexical, "\ 88 "but when using a model in combination with a tagger, it's "\ 89 "sometimes desirable to let the PCCG model only the category "\ 90 "generation.", 91 usage="lexical=B, where B is a boolean", 92 default=True), 93 # Add the standard chord mapping option ("chord_mapping") 94 get_chord_mapping_module_option(), 95 ] 96 # Force non-lexical model for some input types 97 LEX_INPUT_TYPES = [ 98 DbInput, ChordInput 99 ] 100

101 - def __init__(self, name, cutoff=0, cat_bins=None, 102 estimator=laplace_estimator, lexical=True, chordmap=None, 103 parent_counts=None, expansion_type_counts=None, 104 head_expansion_counts=None, non_head_expansion_counts=None, 105 lexical_counts=None, **kwargs):

106 if cat_bins is None: 107 raise ValueError, "cat_bins must be specified" 108 if chordmap is None: 109 raise ValueError, "chordmap must be specified" 110 111 self.cutoff = cutoff 112 self.cat_bins = cat_bins 113 self._estimator = estimator 114 self.lexical = lexical 115 self.chordmap = chordmap 116 self.word_bins = 12*len(set(self.chordmap.values())) 117 118 super(HalfspanPcfgModel, self).__init__(name, **kwargs) 119 120 ## Prepare the distributions from the counts 121 if parent_counts is None: 122 parent_counts = CutoffFreqDist(cutoff) 123 self._parent_counts = parent_counts 124 self._parent_dist = estimator(parent_counts, cat_bins) 125 if expansion_type_counts is None: 126 expansion_type_counts = CutoffConditionalFreqDist(cutoff) 127 self._expansion_type_counts = expansion_type_counts 128 # 2 possible expansions: right and leaf 129 # If it becomes possible to have more (e.g. unary), set this somehow 130 self._expansion_type_dist = ConditionalProbDist(expansion_type_counts, 131 estimator, 2) 132 if head_expansion_counts is None: 133 head_expansion_counts = CutoffConditionalFreqDist(cutoff) 134 self._head_expansion_counts = head_expansion_counts 135 self._head_expansion_dist = ConditionalProbDist(head_expansion_counts, 136 estimator, cat_bins) 137 if non_head_expansion_counts is None: 138 non_head_expansion_counts = CutoffConditionalFreqDist(cutoff) 139 self._non_head_expansion_counts = non_head_expansion_counts 140 self._non_head_expansion_dist = ConditionalProbDist( 141 non_head_expansion_counts, estimator, cat_bins) 142 if lexical_counts is None: 143 lexical_counts = CutoffConditionalFreqDist(cutoff) 144 self._lexical_counts = lexical_counts 145 self._lexical_dist = ConditionalProbDist(lexical_counts, 146 estimator, self.word_bins)

147

148 - def chord_observation(self, chord):

149 """ 150 Returns the string observation counted for a given chord. 151 Note that the chord should already have been made relative to its parent. 152 153 """ 154 return "%s%s" % (chord.root_numeral, self.chordmap[chord.type])

155

156 - def _get_model_data(self):

157 data = { 158 'parents' : object_to_dict(self._parent_counts), 159 'expansions' : object_to_dict(self._expansion_type_counts), 160 'heads' : object_to_dict(self._head_expansion_counts), 161 'non_heads' : object_to_dict(self._non_head_expansion_counts), 162 'words' : object_to_dict(self._lexical_counts), 163 'cutoff' : self.cutoff, 164 'cat_bins' : self.cat_bins, 165 'estimator': self._estimator, 166 'grammar' : self.grammar, 167 'lexical' : self.lexical, 168 'chordmap' : self.chordmap.name, 169 } 170 return data

171 172 @staticmethod

173 - def _load_model(name, data):

174 obj = HalfspanPcfgModel( 175 name = name, 176 cutoff = data['cutoff'], 177 cat_bins = data['cat_bins'], 178 estimator = data['estimator'], 179 lexical = data.get('lexical', True), 180 chordmap = get_chord_mapping(data.get('chordmap', None)), 181 parent_counts = dict_to_object(data['parents']), 182 expansion_type_counts = dict_to_object(data['expansions']), 183 head_expansion_counts = dict_to_object(data['heads']), 184 non_head_expansion_counts = dict_to_object(data['non_heads']), 185 lexical_counts = dict_to_object(data['words']), 186 grammar = data['grammar'], 187 ) 188 return obj

189

190 - def inside_probability(self, expansion, parent, left, right=None):

191 """ 192 Probability of a (non-leaf) subtree, computed from the probability 193 of its expansions. This doesn't include the probabilities 194 of the subtrees of the daughters. To get the full inside probability, 195 multiply the returned value with the daughters' insider probabilities. 196 197 """ 198 parent_rep = model_category_repr(parent.category) 199 # Get the probability of the expansion type 200 exp_prob = self._expansion_type_dist[parent_rep].prob(expansion) 201 202 if expansion == 'leaf': 203 # Get the probability of the word given parent 204 # If the model doesn't generate words, this probability is 1 205 if not self.lexical: 206 word_prob = 1.0 207 else: 208 # In this case the word is given as the left branch 209 word = left 210 # Word should be a chord label: interpret it as such 211 chord = Chord.from_name(word) 212 chord_obs = self.chord_observation( 213 category_relative_chord(chord, 214 category=parent.category)) 215 word_prob = self._lexical_dist[parent_rep].prob(chord_obs) 216 return exp_prob * word_prob 217 else: 218 # We currently only recognise one other case: right-head 219 assert right is not None, "pcfg model only supports binary branches" 220 head = right 221 non_head = left 222 # Get the probability of the head (right) daughter given the parent 223 condition = (expansion, parent_rep) 224 head_rep = model_category_repr(head.category, parent.category) 225 head_prob = self._head_expansion_dist[condition].prob(head_rep) 226 # Get the probability of the non-head daughter given the 227 # parent and the head daughter 228 condition = (head_rep, expansion, parent_rep) 229 non_head_rep = model_category_repr(non_head.category, parent.category) 230 non_head_prob = \ 231 self._non_head_expansion_dist[condition].prob(non_head_rep) 232 return exp_prob * head_prob * non_head_prob

233

234 - def outside_probability(self, parent):

235 """ 236 Outer probability of a subtree. This is approximated in these models 237 as the prior probability of the parent of the tree. 238 239 Prior probability P(parent) is used to approximate the outside 240 probability. 241 242 """ 243 cat = model_category_repr(parent.category) 244 return self._parent_dist.prob(cat)

245

246 - def description(self):

247 buff = StringIO() 248 def _fdist_str(fd): 249 return "FDist<%d>: %s" % (fd.N(), ", ".join("%s:%d" % pr for pr in fd.items()))

250 def _cfd_str(cfd): 251 fds = [(cond,cfd[cond]) for cond in cfd.conditions()] 252 # Sort by N of each FD 253 fds = reversed(sorted(fds, key=lambda (c,fd): fd.N())) 254 return "\n".join("%s: %s" % (cond, _fdist_str(fd)) for (cond,fd) in fds)

255 256 print >>buff, "Parent distribution:" 257 print >>buff, _fdist_str(self._parent_counts) 258 print >>buff 259 print >>buff, "Expansion type distribution:" 260 print >>buff, _cfd_str(self._expansion_type_counts) 261 print >>buff 262 print >>buff, "Head expansion distribution:" 263 print >>buff, _cfd_str(self._head_expansion_counts) 264 print >>buff 265 print >>buff, "Non-head expansion distribution:" 266 print >>buff, _cfd_str(self._non_head_expansion_counts) 267 print >>buff 268 print >>buff, "Lexical expansion distribution:" 269 print >>buff, _cfd_str(self._lexical_counts) 270 print >>buff 271 print >>buff, "Possible words: %d" % self.word_bins 272 print >>buff, "Possible categories: %d" % self.cat_bins 273 print >>buff 274 print >>buff, "Estimator: %s" % get_estimator_name(self._estimator) 275 print >>buff, "Frequency cutoff: %d" % self.cutoff 276 return buff.getvalue() 277 278 @staticmethod

279 - def train(name, training_data, options, grammar=None, logger=None):

280 if grammar is None: 281 grammar = get_grammar() 282 if logger is None: 283 logger = create_dummy_logger() 284 285 # If cat_bins wasn't given, read it from the grammar 286 if options["cat_bins"]: 287 cat_bins = options["cat_bins"] 288 elif grammar.max_categories: 289 cat_bins = grammar.max_categories 290 else: 291 # Nothing given in the grammar either: error 292 raise ValueError, "no value was given for cat_bins and the "\ 293 "grammar doesn't supply one" 294 295 # Create a new model with empty distributions 296 model = HalfspanPcfgModel( 297 name, 298 cutoff = options['cutoff'], 299 cat_bins = cat_bins, 300 estimator = options['estimator'], 301 lexical = options['lexical'], 302 chordmap = options['chord_mapping'], 303 grammar = grammar) 304 305 # Add counts to this model for each sequence 306 for sequence in training_data: 307 try: 308 model._sequence_train(sequence) 309 except ModelTrainingError, err: 310 logger.warn("Error training on %s: %s" % (sequence.string_name, 311 err)) 312 313 return model

314

315 - def _sequence_train(self, sequence):

316 """ 317 Adds counts to the model for a single chord sequence. 318 319 """ 320 # Prepare the input and annotations 321 input = DbInput.from_sequence(sequence) 322 categories = [chord.category for chord in sequence.iterator()] 323 str_inputs = input.inputs 324 # Build the implicit normal-form tree from the annotations 325 try: 326 tree = build_tree_for_sequence(sequence) 327 except TreeBuildError, err: 328 raise ModelTrainingError, "could not build a tree for '%s': %s" % \ 329 (sequence.string_name, err) 330 331 def _add_counts(trace): 332 """ Add counts to the model from a derivation trace """ 333 parent = trace.result 334 # Add a count for the parent category 335 parent_rep = model_category_repr(parent.category) 336 self._parent_counts.inc(parent_rep) 337 338 if len(trace.rules) == 0: 339 # Leaf node - lexical generation 340 # Count this parent expanding as a leaf 341 self._expansion_type_counts[parent_rep].inc('leaf') 342 # Interpret the word as a chord 343 chord = Chord.from_name(trace.word) 344 chord = category_relative_chord(chord, parent.category) 345 observation = self.chord_observation(chord) 346 # Count this parent producing this word 347 # The chord root is now relative to the base pitch of the category 348 self._lexical_counts[parent_rep].inc(observation) 349 else: 350 # Internal node - rule application 351 # There should only be one rule application, but just in case... 352 for rule,args in trace.rules: 353 if rule.arity == 1: 354 # Unary rule 355 raise ModelTrainingError, "we don't currently support "\ 356 "unary rule application, but one was found in "\ 357 "the training data" 358 if rule.arity == 2: 359 # Binary rule 360 # Assume all heads come from the right 361 expansion = 'right' 362 self._expansion_type_counts[parent_rep].inc(expansion) 363 # Count this parent expanding to the head daughter 364 head_rep = model_category_repr(args[1].result.category, 365 parent.category) 366 self._head_expansion_counts[ 367 (expansion,parent_rep)].inc(head_rep) 368 # Count this parent with this head expansion expanding 369 # to the non-head daughter 370 non_head_rep = model_category_repr( 371 args[0].result.category, parent.category) 372 self._non_head_expansion_counts[ 373 (head_rep,expansion,parent_rep) 374 ].inc(non_head_rep) 375 # Recurse to count derivations from the daughters 376 for arg in args: 377 _add_counts(arg)

378 379 # The root of this structure is an extra node to contain all separate 380 # trees. If there's more than one tree, it represents partial parses 381 end = 0 382 successes = 0 383 num_trees = 0 384 for sub_tree in tree.children: 385 # Use each partial tree to get counts 386 length = sub_tree.span_length 387 start = end 388 end += length 389 390 # If this is just a leaf, ignore it - it came from an unlabelled chord 391 if not hasattr(sub_tree, 'chord'): 392 num_trees += 1 393 # Prepare the tagger for this part of the sequence 394 # Get a sign for each annotated chord 395 tags = [] 396 for word,tag in zip(str_inputs[start:end],categories[start:end]): 397 if tag == "": 398 word_signs = [] 399 elif tag not in self.grammar.families: 400 raise ModelTrainingError, "could not get a sign from "\ 401 "the grammar for tag '%s' (chord '%s')" % \ 402 (tag, word) 403 else: 404 # Get all signs that correspond to this tag from the grammar 405 word_signs = self.grammar.get_signs_for_word(word, tags=[tag]) 406 tags.append(word_signs) 407 408 tagger = PretaggedTagger(self.grammar, input.slice(start,end), tags=tags) 409 # Use the directed parser to parse according to this tree 410 parser = DirectedCkyParser(self.grammar, tagger, derivation_tree=sub_tree) 411 try: 412 parser.parse(derivations=True) 413 except DirectedParseError, err: 414 # Parse failed, so we can't train on this sequence 415 logger.error("Parsing using the derivation tree failed: "\ 416 "%s" % err) 417 continue 418 419 # We should now have a complete parse available 420 parses = parser.chart.parses 421 if len(parses) > 1: 422 raise ModelTrainingError, "the annotated tree gave multiple "\ 423 "parse results: %s" % ", ".join(["%s" % p for p in parses]) 424 parse = parses[0] 425 # Hooray! We have a parse! 426 # Now use the derivation trace to add counts to the model 427 # Store the counts in the model recursively 428 _add_counts(parse.derivation_trace) 429 successes += 1 430

431 - def generate(self, logger=None, max_depth=None):

432 """ 433 Generate a chord sequence from the model. 434 435 """ 436 if logger is None: 437 logger = create_dummy_logger() 438 439 def _generate(parent, depth=0, pitch=0): 440 # Transform the parent category so it's relative to itself 441 # All generated categories will be relative to this, 442 # so we need to make the parent self-relative at the 443 # start of each recursion 444 parent_rep = model_category_repr(parent) 445 parent_pitch = (pitch + base_pitch(parent)) % 12 446 logger.debug("%sGenerating from parent: %s" % (" "*depth,parent_rep)) 447 448 if max_depth is not None and depth >= max_depth and \ 449 len(self._lexical_dist[parent_rep].samples()) != 0: 450 # Don't go any deeper than this if we can stop here 451 # Only possible if the parent has generated a leaf before 452 exp = 'leaf' 453 logger.debug("%sForcing leaf" % (" "*depth)) 454 else: 455 # Otherwise freely generate an expansion type 456 exp = generate_from_prob_dist(self._expansion_type_dist[parent_rep]) 457 logger.debug("%sExpansion: %s" % (" "*depth, exp)) 458 exp_parent = (exp,parent_rep) 459 460 if exp == 'leaf': 461 # Generate a leaf node (word) 462 word = generate_from_prob_dist(self._lexical_dist[parent_rep]) 463 logger.debug("%sWord: %s, pitch: %d" % (" "*depth, word, parent_pitch)) 464 chord = Chord.from_name(word) 465 chord.root = (chord.root + parent_pitch) % 12 466 return [chord] 467 else: 468 # First generate a head node 469 head = generate_from_prob_dist(self._head_expansion_dist[exp_parent]) 470 logger.debug("%sHead: %s" % (" "*depth, head)) 471 # Continue to expand this recursively to a word sequence 472 head_generated = _generate(head, depth=depth+1, \ 473 pitch=parent_pitch) 474 475 head_exp_parent = (head,exp,parent_rep) 476 # Now generate a non-head node 477 non_head = generate_from_prob_dist( 478 self._non_head_expansion_dist[head_exp_parent]) 479 logger.debug("%sNon-head: %s" % (" "*depth, non_head)) 480 # Continue to expand this too 481 non_head_generated = _generate(non_head, depth=depth+1, \ 482 pitch=parent_pitch) 483 484 return non_head_generated + head_generated

485 486 # Choose a start node 487 # Build a I^T-I^T as the root 488 root = syntax_from_string("I^T-I^T") 489 logger.debug("Root: %s" % root) 490 return _generate(root) 491

492 493 -def model_category_repr(category, base_category=None):

494 """ 495 Given a syntactic category, generates a representation that will 496 be used by the model. The result should be a string and must 497 uniquely identify all of the features of the category that the 498 model should distinguish. 499 500 If base_category is given, it may be used as a base to compare the 501 category to. In this case, observations of generated categories 502 always take their parent as a base category and syntactic pitches 503 are all relative to it. 504 505 If base_category is not given, the category itself will be 506 considered the base category. 507 508 """ 509 from .syntax import make_absolute_category_from_relative 510 511 category = category.copy() 512 if base_category is None: 513 base_category = category 514 # Get a root from the base category 515 root = base_pitch(base_category) 516 if root is None: 517 # No root pitch found in the base category 518 # Just make the category itself relative to its own root 519 root = base_pitch(category) 520 # If root is still None, it doesn't matter - there are no roots to adjust 521 522 if root is not None: 523 # The function used below adds a cat's root to the base: 524 # we want to subtract - i.e. make a relative category from 525 # an absolute 526 root_pitch = (12 - root) % 12 527 # Make the category's roots all relative to this base root 528 make_absolute_category_from_relative(category, root_pitch) 529 530 return category

531

532 -def base_pitch(cat):

533 """ 534 Arbitrarily picks a root out of the category, so that we can take 535 the rest of the category and other categories relative to this 536 consistently picked root. 537 538 """ 539 from .syntax import AtomicCategory, ComplexCategory 540 if isinstance(cat, AtomicCategory): 541 # Use the start part of the category as the root 542 return cat.from_half.root 543 elif isinstance(cat, ComplexCategory): 544 # Get a root from the left side of the slash 545 return cat.result.root 546 else: 547 raise TypeError, "category given to base_pitch was neither "\ 548 "complex nor atomic. Type was %s" % (type(cat).__name__)

549

550 -def category_relative_chord(chord, category=None):

551 """ 552 Returns a version of the Chord object that's relative to the base 553 pitch of the category. If no category is given, the chord returned 554 will be rooted at I (i.e. relative to itself). 555 556 """ 557 if category is not None: 558 base = base_pitch(category) 559 if base is None: 560 base = chord.root 561 else: 562 base = chord.root 563 chord_rel = copy.deepcopy(chord) 564 chord_rel.root -= base 565 return chord_rel

566

Source Code for Module jazzparser.formalisms.music_halfspan.pcfg