Package jazzparser :: Package utils :: Package nltk :: Package ngram :: Module model
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.utils.nltk.ngram.model

   1  """Generic n-gram model implementation, using NLTK's probability handling. 
   2   
   3  NLTK provides an n-gram POS tagger, but it can only assign the most  
   4  likely tag sequence to observations. It doesn't calculate probabilities,  
   5  so is no use for our supertagger component. Here I provide a generic  
   6  n-gram model, using NLTK's probability stuff. 
   7   
   8  """ 
   9  """ 
  10  ============================== License ======================================== 
  11   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
  12    
  13   This file is part of The Jazz Parser. 
  14    
  15   The Jazz Parser is free software: you can redistribute it and/or modify 
  16   it under the terms of the GNU General Public License as published by 
  17   the Free Software Foundation, either version 3 of the License, or 
  18   (at your option) any later version. 
  19    
  20   The Jazz Parser is distributed in the hope that it will be useful, 
  21   but WITHOUT ANY WARRANTY; without even the implied warranty of 
  22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  23   GNU General Public License for more details. 
  24    
  25   You should have received a copy of the GNU General Public License 
  26   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
  27   
  28  ============================ End license ====================================== 
  29   
  30  """ 
  31  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"  
  32   
  33  import copy,sys,logging, math, warnings 
  34  import numpy 
  35  from numpy import sum as array_sum 
  36  from jazzparser.utils.nltk.probability import laplace_estimator, logprob, \ 
  37                      cond_prob_dist_to_dictionary_cond_prob_dist, sum_logs 
  38  from nltk.probability import ConditionalProbDist, sum_logs, DictionaryProbDist, \ 
  39                      ConditionalFreqDist, FreqDist 
  40   
  41  NGRAM_JOINER = "::" 
  42   
  43  # Get the logger from the logging system 
  44  logger = logging.getLogger("main_logger") 
45 46 -def matrix_log_probs_to_probs(matrix):
47 """ 48 Utility function for methods below. 49 Converts all the probabilities in the time-state matrix from log 50 probabilities to probabilities. 51 52 """ 53 return [dict([(label, 2**log_prob) for (label,log_prob) in timestep.items()]) for timestep in matrix]
54
55 -def sum_matrix_dims(matrix, dims=2):
56 """ 57 Takes an n-dimensional matrix and sums over all but the first C{dims} 58 dimensions, returning a C{dims}-dimensional matrix. You can do this 59 easily in later versions of Numpy! 60 61 """ 62 # Sum over all but the first two dimensions: time and last state in ngram 63 start_dims = matrix.ndim 64 for i in range(start_dims-dims): 65 matrix = numpy.sum(matrix, axis=-1) 66 return matrix
67
68 -def _all_indices(length, num_labels):
69 """ 70 Function to generate all index n-grams of a given length 71 72 """ 73 if length < 1: 74 return [[]] 75 else: 76 return sum([ [[i]+sub for i in range(num_labels)] 77 for sub in _all_indices(length-1, num_labels)], [])
78
79 -class NgramModel(object):
80 """ 81 A general n-gram model, trained on some labelled data. 82 Generate models using NgramModel.train(). 83 84 Note that backoff assumes that an estimator with discounting is 85 being used. The backed off probabilities are scaled according to the 86 smoothing probability that would have been assigned to a zero 87 count. If you use this with MLE, the backoff is effectively disabled. 88 89 The estimator should be picklable. This means you can't use a 90 lambda, for example. 91 92 """
93 - def __init__(self, order, label_counts, emission_counts, \ 94 estimator, \ 95 backoff_model, \ 96 label_dom, emission_dom):
97 from ..storage import is_picklable 98 99 self.order = order 100 self.label_dom = label_dom 101 self.num_labels = len(label_dom) 102 self.emission_dom = emission_dom 103 self.num_emissions = len(emission_dom) 104 105 # Check the estimator's picklable 106 # Otherwise we'll get mysterious errors when storing the model 107 if not is_picklable(estimator): 108 raise NgramError, "the estimator given for an n-gram model "\ 109 "is not picklable. Use a statically-defined function as "\ 110 "your estimator (got %s)" % type(estimator).__name__ 111 112 # Keep the freq dists to get the raw counts 113 self.label_counts = label_counts 114 self.emission_counts = emission_counts 115 # Construct a probability distribution from these frequency dists 116 self.label_dist = ConditionalProbDist(label_counts, estimator, self.num_labels+1) 117 self.emission_dist = ConditionalProbDist(emission_counts, estimator, self.num_emissions) 118 # Keep hold of this for when we store the model 119 # It doesn't get used otherwise, as it was only to construct the prob dists 120 self._estimator = estimator 121 122 self.backoff_model = backoff_model 123 124 self.clear_cache()
125
126 - def __repr__(self):
127 return "<%s model,%d lab,%d em>" % (self.model_type, self.num_labels, self.num_emissions)
128
129 - def _get_model_type(self):
130 """ 131 model_type gives a sensible name to the order of n-gram. Uses 132 unigram, bigram, trigram, or x-gram. 133 134 """ 135 if self.order == 1: 136 return "unigram" 137 elif self.order == 2: 138 return "bigram" 139 elif self.order == 3: 140 return "trigram" 141 else: 142 return "%d-gram" % self.order
143 model_type = property(_get_model_type) 144
145 - def clear_cache(self):
146 """ 147 Subclasses that use caches should override this method to reset them 148 and call this in the superclass. 149 150 """ 151 self._transition_matrix_cache = None 152 self._transition_matrix_transpose_cache = None 153 # These will be filled as we access probabilities 154 self._discount_cache = {} 155 self._emission_discount_cache = {} 156 self._transition_cache = {}
157 158 @staticmethod
159 - def train(order, labelled_data, label_dom, emission_dom=None, cutoff=0, 160 backoff_order=0, estimator=None, ignore_list=None, backoff_kwargs={}):
161 """ 162 Trains the n-gram model given some labelled data. The data 163 should be in the form of a sequence of sequences of tuples (e,t), 164 where e is an emission and t a tag. 165 166 If ignore_list is given, all ngrams containing a label in 167 ignore_list will be ignored altogether. One use for this is to 168 ignore blank labels, so we don't learn the gaps in the labelled 169 data. 170 171 """ 172 if order <= 0: 173 raise NgramError, "cannot construct a %d-gram model!" % order 174 if backoff_order >= order: 175 raise NgramError, "cannot construct a %d-gram model "\ 176 "with %d-order backoff" % (order, backoff_order) 177 178 if estimator is None: 179 estimator = laplace_estimator 180 181 labelled_data = list(labelled_data) 182 label_dist = ConditionalFreqDist() 183 emission_dist = ConditionalFreqDist() 184 seen_emissions = set() 185 186 if ignore_list is not None: 187 ignores = set(ignore_list) 188 else: 189 ignores = set() 190 191 for training_sequence in labelled_data: 192 # Add blanks to the start of the sequence to get our first n-grams 193 # Add a blank to the end to get the final probabilities 194 training_data = [(None,None)]*(order-1) + list(training_sequence) + [(None,None)] 195 # Add counts to the model 196 for i in range(len(training_data)-order): 197 # This is actually a full ngram, to give us all states for 198 # the transition 199 ngram = list(reversed(training_data[i:i+order])) 200 outputs,tags = zip(*ngram) 201 # Check whether the tags include any of those we should be ignoring 202 if ignores.isdisjoint(set(tags)): 203 tag = tags[0] 204 # Don't count emissions from None 205 if tag is not None and outputs[0] is not None: 206 # Emission count 207 emission_dist[tag].inc(outputs[0]) 208 # Keep track of what emissions we've seen 209 seen_emissions.add(outputs[0]) 210 # Labelled ngram count 211 context = tuple(tags[1:]) 212 label_dist[context].inc(tag) 213 214 # Apply low-count cutoff to both distributions 215 for label in emission_dist.conditions(): 216 for sample in emission_dist[label].samples(): 217 if emission_dist[label][sample] <= cutoff: 218 emission_dist[label][sample] = 0 219 for context in label_dist.conditions(): 220 for sample in label_dist[context].samples(): 221 if label_dist[context][sample] <= cutoff: 222 label_dist[context][sample] = 0 223 224 # If no domain of emissions was given, use what we've seen in the data 225 if emission_dom is None: 226 emission_dom = seen_emissions 227 else: 228 emission_dom = set(emission_dom) 229 230 if backoff_order > 0: 231 kwargs = { 232 "emission_dom" : emission_dom, 233 "cutoff" : cutoff, 234 "backoff_order" : backoff_order-1, 235 "estimator" : estimator, 236 "ignore_list" : ignores 237 } 238 # Add in any args we've been told to use 239 kwargs.update(backoff_kwargs) 240 # Construct a model to back off to 241 backoff_model = NgramModel.train(order-1, 242 labelled_data, 243 label_dom, 244 **kwargs) 245 else: 246 backoff_model = None 247 248 # Return an actual model as trained above 249 return NgramModel(order, label_dist, emission_dist, estimator, backoff_model, label_dom, emission_dom)
250
251 - def _get_transition_backoff_scaler(self, context):
252 """ 253 Returns the amount to scale the backed off probabilities by 254 when backing off to an order n-1 model in the given context. 255 This is presented as alpha in Jurafsky and Martin. 256 257 Returned as a base 2 log. 258 259 A more efficient way to do this would be to supply a function 260 of the context specific to the discounting technique. In this 261 case it wouldn't be necessary to sum the discounted mass each 262 time. 263 264 """ 265 if context not in self._discount_cache: 266 # The prob mass reserved for unseen events can be computed by 267 # summing probabilities over all seen events and subtracting 268 # from 1. 269 # Our discounting model distributes this probability evenly over 270 # the unseen events, so we can compute the discounted mass by 271 # getting the probability of one unseen event and multiplying it. 272 seen_labels = set([lab for lab in self.label_dom+[None] if 273 self.label_counts[context][lab] > 0]) 274 if len(seen_labels) == 0: 275 # Not seen anything in this context. All mass is discounted! 276 self._discount_cache[context] = 0.0 277 else: 278 unseen_labels = set(self.label_dom+[None]) - seen_labels 279 # Try getting some event that won't have been seen 280 # Compute how much mass is reserved for unseen events 281 discounted_mass = self.label_dist[context].prob( 282 "%%% UNSEEN LABEL %%%") \ 283 * len(unseen_labels) 284 # Compute how much probability the n-1 order model assigns to 285 # things unseen by this model 286 backoff_context = context[:-1] 287 backoff_seen_mass = sum([ 288 self.backoff_model.transition_probability(lab, 289 *backoff_context) 290 for lab in unseen_labels], 0.0) 291 self._discount_cache[context] = logprob(discounted_mass) - \ 292 logprob(backoff_seen_mass) 293 return self._discount_cache[context]
294
295 - def transition_log_probability(self, *ngram):
296 """ 297 Gives the probability P(label_i | label_(i-1), ..., label_(i-n)), 298 where the previous labels are given in the sequence 299 label_context. The context should be in reverse order, i.e. 300 with the most recent label at the start. 301 302 Note that this is the probability of a label given the previous 303 n-1 labels, which is the same as the probability of the n-gram 304 [label_i, ..., label_(i-n+1)] given the ngram 305 [label_(i-1), ..., label_(i-n)], since all but the last element 306 of the ngram overlaps with the condition, so has probability 1. 307 308 Caches all computed transition probabilities. This is 309 particularly important for backoff models. Many n-grams will 310 back off to the same (n-1)-gram and we don't want to recompute 311 the transition probability for that each time. 312 313 """ 314 # Cache the probabilities we've computed 315 if ngram not in self._transition_cache: 316 # Check the context is the right size 317 # This method gets called millions of times, but most times the 318 # value is cached. We only need check this the first time 319 if len(ngram) != self.order: 320 raise NgramError, "a %d-gram model can only give transition "\ 321 "probabilities for a context of length %d. Tried to use "\ 322 "%d" % (self.order, self.order-1, len(ngram)-1) 323 324 # Compute the probability as we've not seen it before 325 context = tuple(ngram[1:]) 326 label = ngram[0] 327 # Check whether we have enough observations of this whole n-gram 328 if self.backoff_model is not None and self.label_counts[context][label] == 0: 329 # Backoff to a lower-order model 330 # Work out how much prob mass is reserved for unseen events 331 scale = self._get_transition_backoff_scaler(context) 332 # Backoff and scale to fill the reserved mass 333 prob = scale + self.backoff_model.transition_log_probability(*(ngram[:-1])) 334 else: 335 prob = self.label_dist[context].logprob(label) 336 # Don't compute this one again 337 self._transition_cache[ngram] = prob 338 return self._transition_cache[ngram]
339
340 - def transition_log_probability_debug(self, *ngram):
341 """ 342 Debugging version of the above. 343 Use this only for debugging. It prints stuff out and doesn't cache. 344 345 """ 346 if len(ngram) != self.order: 347 raise NgramError, "a %d-gram model can only give transition "\ 348 "probabilities for a context of length %d. Tried to use "\ 349 "%d" % (self.order, self.order-1, len(ngram)-1) 350 351 # Compute the probability as we've not seen it before 352 context = tuple(ngram[1:]) 353 label = ngram[0] 354 # Check whether we have enough observations of this whole n-gram 355 if self.backoff_model is not None and self.label_counts[context][label] == 0: 356 print "Backing off: %s" % ", ".join([str(s) for s in ngram[:-1]]) 357 # Work out how much prob mass is reserved for unseen events 358 scale = self._get_transition_backoff_scaler(context) 359 print " Scaler: %s, %f" % (", ".join([str(s) for s in context]),2**scale) 360 # Backoff and scale by the reserved mass 361 prob = scale + self.backoff_model.transition_log_probability_debug(*(ngram[:-1])) 362 else: 363 prob = self.label_dist[context].logprob(label) 364 print "Using %d-gram model probability: %f" % (self.order, 2**prob) 365 return prob
366
367 - def transition_probability(self, label, *label_context):
368 """ 369 Wrapper for transition_log_probability to return a real 370 probability. 371 372 """ 373 return 2**self.transition_log_probability(label, *label_context)
374
375 - def transition_probability_debug(self, *ngram):
376 """ Debugging version of the above """ 377 return 2**self.transition_log_probability_debug(*ngram)
378
379 - def emission_log_probability(self, emission, label):
380 """ 381 Gives the probability P(emission | label). Returned as a base 2 382 log. 383 384 """ 385 return self.emission_dist[label].logprob(emission)
386
387 - def emission_probability(self, emission, label):
388 """ 389 Wrapper for emission_log_probability to return an real 390 probability. 391 392 """ 393 return 2**self.emission_log_probability(emission, label)
394
395 - def get_backoff_models(self):
396 """ 397 Returns a list consisting of this model and all the recursive 398 backoff models, until no more backoff models are provided. 399 400 """ 401 if self.backoff_model is None: 402 return [self] 403 else: 404 return [self]+self.backoff_model.get_backoff_models()
405
406 - def get_transition_matrix(self, transpose=False):
407 """ 408 Produces a matrix of the transition probabilities from every 409 (n-1)-gram to every state. Matrix indices are based on enumeration of 410 C{self.label_dom}. The matrix is returned as a numpy array. 411 412 The matrix has n dimensions. The first index is the current state, 413 the second the previous, etc. Thus, 414 matrix[i,j,...] = p(state_t = i | state_(t-1) = j, ...). 415 416 Probabilities are not logs. 417 418 """ 419 if transpose: 420 if self._transition_matrix_transpose_cache is None: 421 # Ensure that the normal transition matrix has been generated 422 mat = self.get_transition_matrix() 423 # Tranpose it and copy it, so it's a real array, not a view 424 self._transition_matrix_transpose_cache = numpy.copy(mat.transpose()) 425 return self._transition_matrix_transpose_cache 426 else: 427 if self._transition_matrix_cache is None: 428 # Compute the matrix from scratch, as we've not done it yet 429 N = len(self.label_dom) 430 shape = tuple([N]*self.order) 431 trans = numpy.zeros(shape, numpy.float64) 432 433 states = self.label_dom 434 # Build a list of lists, each n long, with all possible array indices 435 index_sets = _all_indices(self.order, N) 436 437 # Fill the matrix 438 for indices in index_sets: 439 # Get the ngram of state labels for this ngram of array indices 440 state_labels = [states[i] for i in indices] 441 trans[tuple(indices)] = self.transition_probability(*state_labels) 442 443 self._transition_matrix_cache = trans 444 return self._transition_matrix_cache
445
446 - def get_emission_matrix(self, sequence):
447 """ 448 Produces a matrix of the probability of each timestep's emission from 449 each state. 450 451 matrix[t,i] = p(o_t | state=i) 452 453 """ 454 T = len(sequence) 455 N = len(self.label_dom) 456 457 ems = numpy.zeros((T, N), numpy.float64) 458 459 # Set the probability for every timestep... 460 for t,emission in enumerate(sequence): 461 # ...given each state 462 for i,label in enumerate(self.label_dom): 463 ems[t,i] = self.emission_probability(emission, label) 464 return ems
465
466 - def decode_forward(self, sequence):
467 """ 468 Uses the forward probabilities to decode the sequence and 469 returns a list of labels. 470 471 """ 472 forwards = self.normal_forward_probabilities(sequence) 473 # Sum up ngram dimensions, leaving just time and state 474 forwards = sum_matrix_dims(forwards) 475 # Do an argmax over the possible states for each timestep 476 states = numpy.argmax(forwards, axis=1) 477 # Convert these indices into state labels 478 return [self.label_dom[state] for state in states]
479
480 - def normal_forward_probabilities(self, sequence, seq_prob=False):
481 """ 482 Return the forward probability matrix as a Numpy array. 483 This is equivalent to L{forward_probabilities}, but much faster. 484 It doesn't need logs because it's normalizing at each timestep. 485 486 @param seq_prob: return the log probability of the whole sequence 487 as well as the array (tuple of (array,logprob)). 488 @return: S-dimensional Numpy array, where S is the number of states. 489 The first dimension represents timesteps, the rest the (S-1) 490 states in the ngrams. 491 492 """ 493 N = len(sequence) 494 states = self.label_dom 495 496 # Create an array for the total logprobs 497 coefficients = numpy.zeros((N,), numpy.float64) 498 499 # Prepare the transition and emission matrices 500 ems = self.get_emission_matrix(sequence) 501 trans = self.get_transition_matrix() 502 503 if self.order == 1: 504 # We can do this quickly for unigrams and it saves dealing with 505 # dodgy cases of everything below 506 # We only use emission probabilities: transition probs 507 # just supply (unconditioned) priors 508 forward_matrix = trans*ems 509 # Sum the probabilities in each timestep 510 total_probs = numpy.sum(forward_matrix, axis=1) 511 for t in range(N): 512 coefficients[t] = logprob(total_probs[t]) 513 forward_matrix[t] /= total_probs[t] 514 if seq_prob: 515 return forward_matrix, numpy.sum(coefficients) 516 else: 517 return forward_matrix 518 519 # Initialize an empty matrix 520 forward_matrix = numpy.zeros([N]+[len(states)]*(self.order-1), numpy.float64) 521 522 # First fill in the first columns with histories padded with Nones 523 # In these columns, we only use a subset of the dimensions 524 for time in range(self.order-1): 525 # Get the indices corresponding to all possible sub-n-grams 526 # between the start and here 527 index_sets = _all_indices(time+1, len(states)) 528 529 for indices in index_sets: 530 # Get the actual ngram for these indices 531 ngram = [self.label_dom[i] for i in indices] 532 # Pad with Nones to a length of n 533 ngram.extend([None]*(self.order-time-1)) 534 # Pad the indices with 0s so we use all the dimensions 535 indices.extend([0]*(self.order-time-2)) 536 selector = tuple([time]+indices) 537 538 # Fill in with the (None-padded) transition probability 539 prob = self.transition_probability(*ngram) 540 if time > 0: 541 # Multiply by the prob we must have come from 542 previous_state = tuple([time-1]+indices[1:]+[0]) 543 prob *= forward_matrix[previous_state] 544 forward_matrix[selector] += prob 545 546 # Multiply in the emission probabilities 547 forward_matrix[time] = (forward_matrix[time].transpose() * ems[time]).transpose() 548 # Normalize 549 coefficients[time] = logprob(numpy.sum(forward_matrix[time])) 550 forward_matrix[time] /= numpy.sum(forward_matrix[time]) 551 552 for time in range(self.order-1, N): 553 # Multiplying, the previous timestep gets broadcast over the 554 # first axis of the transition matrix, which is what we need 555 # (that axis represents the most recent state in the n-gram) 556 trans_step = forward_matrix[time-1] * trans 557 # Sum probabilities over the last axis, i.e. the earliest state 558 # in the n-gram 559 trans_step = numpy.sum(trans_step, axis=-1) 560 # Multiply in the emission probabilities 561 forward_matrix[time] = (trans_step.transpose() * ems[time]).transpose() 562 # Normalize the timestep 563 coefficients[time] = logprob(numpy.sum(forward_matrix[time])) 564 forward_matrix[time] /= numpy.sum(forward_matrix[time]) 565 566 if seq_prob: 567 return forward_matrix, numpy.sum(coefficients) 568 else: 569 return forward_matrix
570
571 - def normal_backward_probabilities(self, sequence):
572 """ 573 Return the backward probability matrices a Numpy array. This is 574 faster than L{backward_log_probabilities} because it uses Numpy 575 arrays with non-log probabilities and normalizes each timestep. 576 577 @return: matrix over timesteps and all labels, with a dimension for 578 each state in each (n-1)-gram: for time steps 579 i and labels k, P(word^(i+1), ..., word^T | label_k^i) 580 581 """ 582 N = len(sequence) 583 states = self.label_dom 584 S = len(states) 585 586 # Prepare the transition and emission matrices 587 trans_back = self.get_transition_matrix(transpose=True) 588 ems = self.get_emission_matrix(sequence) 589 590 if self.order == 1: 591 # We can do this quickly for unigrams and it saves dealing with 592 # dodgy cases of everything below 593 # For unigrams, the backward probs are uniform 594 backward_matrix = numpy.ones((N,S), numpy.float64) / S 595 return backward_matrix 596 597 # For other cases, things are a little more complicated 598 # Initialize an empty matrix 599 backward_matrix = numpy.zeros([N]+[S]*(self.order-1), numpy.float64) 600 601 index_sets = _all_indices(self.order-1, S) 602 603 # Initialize from the end 604 for indices in index_sets: 605 # Make an (n-1)-gram for the context 606 ngram = [states[i] for i in indices] 607 # Put the probability of going to the end state from each context 608 # We read this from the model, since it's not in the trans matrix 609 # Note that it's stored in transposed form (see below) 610 selector = tuple([N-1]+list(reversed(indices))) 611 backward_matrix[selector] = self.transition_probability(None, *ngram) 612 backward_matrix[N-1] /= numpy.sum(backward_matrix[N-1]) 613 614 # Work backwards, filling in the matrix 615 for time in range(N-2, self.order-2, -1): 616 # Transposing the next timestep and the trans matrix causes their 617 # indices to be reversed, so that the timestep is broadcast over 618 # the last index (now the first) 619 # Then we multiply in the emission probabilities, broadcast over 620 # the first index (by doing it while transposed) and transpose 621 # back to correct the indices 622 # Summing over the 0th axis sums over possible next states 623 # (the last element of the ngram in the next timestep) 624 # To speed up the computations, we keep the whole lot transposed 625 backward_matrix[time] = numpy.sum((trans_back * 626 backward_matrix[time+1] 627 * ems[time+1]), 628 axis=-1) 629 # Normalize over the timestep 630 backward_matrix[time] /= numpy.sum(backward_matrix[time]) 631 632 # Fill in the first columns now, using None-padding histories 633 for time in range(self.order-2, -1, -1): 634 # Get the indices corresponding to all possible t-grams 635 id_tgrams = _all_indices(time+1, S) 636 637 for id_tgram in id_tgrams: 638 # Get the actual ngram for these indices 639 tgram = [self.label_dom[i] for i in id_tgram] 640 # Fill this out to an (n-1)-gram stretching before the 641 # start with Nones 642 nm1gram = tgram + [None]*(self.order-2-time) 643 # Pad indices with 0s 644 id_nm1gram = id_tgram + [0]*(self.order-2-time) 645 646 # Sum over the possible next states following this (n-1)-gram 647 summed_prob = 0.0 648 for next_id in range(S): 649 # Get transition probability from nm1gram to state next_id 650 prob = self.transition_probability(states[next_id], *nm1gram) 651 # Multiply in the emission probability for the next state 652 prob *= ems[time+1, next_id] 653 654 # Multiply by the bwd prob for the next state 655 next_state = tuple([time+1]+list(reversed(id_nm1gram[:-1]))+[next_id]) 656 prob *= backward_matrix[next_state] 657 # Sum this over the possible next states 658 summed_prob += prob 659 backward_matrix[tuple([time]+list(reversed(id_nm1gram)))] = summed_prob 660 661 # Normalize 662 backward_matrix[time] /= numpy.sum(backward_matrix[time]) 663 664 # Now transpose all the timestep matrices back so the ngrams 665 # correspond correctly to the indices 666 for time in range(N): 667 backward_matrix[time] = backward_matrix[time].transpose() 668 669 return backward_matrix
670
671 - def normal_forward_backward_probabilities(self, sequence, forward=None, 672 backward=None):
673 """ 674 A faster implementation of L{forward_backward_probabilities} for the 675 case where we're normalizing, using Numpy and non-log probabilities. 676 677 This is still an S-dimensional matrix, not the state-occupation 678 probabilities. Use L{gamma_probabilities} to get that. 679 680 """ 681 if forward is None: 682 forward = self.normal_forward_probabilities(sequence) 683 if backward is None: 684 backward = self.normal_backward_probabilities(sequence) 685 686 # This is simple with Numpy, since it does pointwise multiplication 687 gamma = forward * backward 688 # Now renormalize 689 for time in range(len(sequence)): 690 gamma[time] /= numpy.sum(gamma[time]) 691 return gamma
692
693 - def gamma_probabilities(self, sequence, dictionary=False, 694 forward=None, backward=None):
695 """ 696 State-occupation probabilities. 697 698 @type dictionary: bool 699 @param dictionary: return a list of label dictionaries instead 700 of a numpy matrix 701 702 """ 703 fwd_bwd = self.normal_forward_backward_probabilities(sequence, 704 forward=forward, backward=backward) 705 # Sum over all but the first two dimensions: time and last state in ngram 706 for i in range(fwd_bwd.ndim-2): 707 fwd_bwd = numpy.sum(fwd_bwd, axis=-1) 708 709 if dictionary: 710 # Convert to a list of dictionaries, keyed by label 711 gamma = [] 712 for t in range(fwd_bwd.shape[0]): 713 dic = {} 714 for s,state in enumerate(self.label_dom): 715 dic[state] = fwd_bwd[t,s] 716 gamma.append(dic) 717 return gamma 718 else: 719 return fwd_bwd
720
721 - def decode_gamma(self, sequence):
722 """ 723 Use the state occupation probabilities to decode a sequence. 724 725 """ 726 gamma = self.gamma_probabilities(sequence, dictionary=True) 727 states = [max(timestep.items(), key=lambda x:x[1])[0] for timestep in gamma] 728 return states
729
730 - def viterbi_decode(self, sequence):
731 """ 732 Applies the Viterbi algorithm to return a single sequence of 733 states that maximises the probability of the sequence of 734 observations. 735 736 """ 737 N = len(sequence) 738 viterbi_matrix = [{} for i in range(N)] 739 back_pointers = [{} for i in range(N)] 740 def _trace_pointers(time, start_state, n): 741 """ 742 Trace back through the pointer matrix from state start_state 743 at time time, return a list of length n of the states 744 (backwards in time). 745 """ 746 if n == 0: 747 return [] 748 elif time == 0: 749 return [start_state] + ([None] * (n-1)) 750 else: 751 state = back_pointers[time-1][start_state] 752 return [start_state] + _trace_pointers(time-1, state, n-1)
753 754 # Initialize the first column 755 for label in self.label_dom: 756 viterbi_matrix[0][label] = self.emission_log_probability(sequence[0], label) \ 757 + self.transition_log_probability(label, *([None]*(self.order-1))) 758 759 # Fill in the other columns 760 for i in range(1, N): 761 for label in self.label_dom: 762 # Work out the possible probabilities 763 em = self.emission_log_probability(sequence[i], label) 764 transitions = [ \ 765 (self.transition_log_probability(label, *_trace_pointers(i-1, prev_label, self.order-1)) + 766 viterbi_matrix[i-1][prev_label], 767 prev_label) for prev_label in self.label_dom] 768 # Choose the previous state that maximises the Viterbi probability 769 trans,prev_label = max(transitions) 770 viterbi_matrix[i][label] = trans + em 771 # Set the pointer so we know what state we used 772 back_pointers[i-1][label] = prev_label 773 774 # Choose the most probable state to end in 775 final_state = max([(prob,lab) for (lab,prob) in viterbi_matrix[N-1].items()])[1] 776 states = _trace_pointers(N-1, final_state, N) 777 return list(reversed(states))
778
779 - def generalized_viterbi(self, sequence, N=2):
780 """ 781 Applies the N-best variant of the Viterbi algorithm to return 782 N sequences of states that maximize the probability of the 783 sequence of observations. 784 785 @see: Generalization of the Viterbi Algorithm, Foreman, 1993 786 787 @type N: int 788 @param N: number of label sequences to return (defaults to 2) 789 @rtype: list of (label sequence,probability) pairs 790 @return: ordered list of possible decodings, paired with their 791 Viterbi probabilities 792 793 """ 794 length = len(sequence) 795 viterbi_matrix = [{} for i in range(length)] 796 back_pointers = [{} for i in range(length)] 797 def _trace_pointers(time, start_state, rank, n): 798 """ 799 Trace back through the pointer matrix from state start_state 800 at time time, return, for a particular path (of the N-best we've 801 kept track of), a list of length n of the states (backwards 802 in time). 803 804 Returns a single path 805 806 """ 807 if n == 0: 808 return [] 809 elif n==1: 810 return [start_state] 811 elif time == 0: 812 return [start_state] + ([None] * (n-1)) 813 else: 814 (prev_state,prev_rank) = back_pointers[time-1][start_state][rank] 815 return [start_state] + _trace_pointers(time-1, prev_state, prev_rank, n-1)
816 817 # Initialize the first column 818 for t in range(self.order-1): 819 for l,label in enumerate(self.label_dom): 820 # There's only one possible path for each state at this point 821 viterbi_matrix[t][l] = [ self.emission_log_probability(sequence[t], label) \ 822 + self.transition_log_probability(label, *([None]*(self.order-1-t))) ] 823 824 trans = self.get_transition_matrix() 825 826 # Fill in the other columns 827 for i in range(self.order-1, length): 828 for l0,label in enumerate(self.label_dom): 829 # Work out the possible probabilities 830 em = self.emission_log_probability(sequence[i], label) 831 transitions = [] 832 for l1,prev_label in enumerate(self.label_dom): 833 # Work out the probability for coming from each of the 834 # top N ranked paths at the previous step 835 for prev_rank,prob in enumerate(viterbi_matrix[i-1][l1]): 836 selector = tuple([l0]+_trace_pointers(i-1, l1, prev_rank, self.order-1)) 837 transitions.append( 838 (trans[selector] + prob, l1, prev_rank) 839 ) 840 # Choose up to N from the top of these 841 top_N = list(reversed(sorted(transitions)))[:N] 842 # Multiply in the emission probability to each 843 top_probs = [trans_prob+em for (trans_prob,lab,rank) in top_N] 844 viterbi_matrix[i][l0] = top_probs 845 # Set the pointer so we know what states and ranks we used 846 back_pointers[i-1][l0] = [(lab,rank) for (trans_prob,lab,rank) in top_N] 847 848 # Transition to the final state from each possible end state 849 final_states = [] 850 for l1,prev_label in enumerate(self.label_dom): 851 # Work out the probability for coming from each of the 852 # top N ranked paths at the final step 853 for prev_rank,prob in enumerate(viterbi_matrix[length-1][l1]): 854 history = [self.label_dom[l] for l in \ 855 _trace_pointers(length-1, l1, prev_rank, self.order-1)] 856 final_prob = self.transition_log_probability(None, *history) 857 final_states.append((prob+final_prob, l1, prev_rank)) 858 final_states = list(reversed(sorted(final_states)))[:N] 859 # Now get a path for each of these 860 path_probs = [ 861 (list(reversed(_trace_pointers(length-1, lab, rank, length))), prob) \ 862 for prob,lab,rank in final_states] 863 # Transform the label indices into actual labels 864 path_probs = [ 865 ([self.label_dom[l] for l in path], prob) 866 for path,prob in path_probs] 867 return path_probs 868
869 - def viterbi_selector_probabilities(self, sequence):
870 """ 871 Returns a probability matrix like that given by the 872 forward-backward algorithm which assigns prob 1.0 to the Viterbi 873 chosen tags and 0.0 to all others. 874 875 """ 876 vit_seq = self.viterbi_decode(sequence) 877 return [dict([(label, 1.0 if label == vit_lab else 0.0) for label in self.label_dom]) \ 878 for vit_lab in vit_seq]
879
880 - def generate(self, length=10, labels=False):
881 """ 882 Generate a sequence of emissions at random using the n-gram 883 model. 884 If labels=True, outputs a sequence of (emission,label) pairs 885 indicating what hidden labels emitted the emissions. 886 887 The sequence will have maximum length C{length}, but may be shorter 888 if the model so determines. 889 890 """ 891 from jazzparser.utils.probabilities import random_selection 892 sequence = [] 893 label_seq = [] 894 tag_context = [None]*(self.order-1) 895 896 for i in range(length): 897 # Get the transition probabilities to all possible states 898 trans_probs = {} 899 for lab in self.label_dom+[None]: 900 trans_probs[lab] = self.transition_probability(lab, *tag_context) 901 902 # Pick a label to use next 903 new_lab = random_selection(trans_probs.items()) 904 if new_lab is None: 905 # We've reached the end state: stop here 906 break 907 908 label_seq.append(new_lab) 909 # Set the state to reflect this label choice 910 if self.order > 1: 911 tag_context = [new_lab] + tag_context[:-1] 912 913 # Get all the emission probabilities from this state 914 em_probs = {} 915 for em in self.emission_dom: 916 em_probs[em] = self.emission_probability(em, new_lab) 917 # Pick an emission randomly 918 sequence.append(random_selection(em_probs.items())) 919 920 if labels: 921 return zip(sequence, label_seq) 922 else: 923 return sequence
924
925 - def labeled_sequence_log_probability(self, sequence, labels):
926 """ 927 Computes the joint probability that the model assigns to a sequence 928 and its gold standard labels. 929 930 Probability is a log, because we'd get underflow otherwise. 931 932 """ 933 from collections import deque 934 labeled_data = zip(sequence, labels) 935 936 # Keep track of the current label history 937 history = deque([None] * (self.order-1)) 938 939 # Multiply the probabilities of all the labels and the emissions 940 joint_logprob = 0.0 941 942 for em,label in labeled_data: 943 # Get the transition probability of this label given its history 944 joint_logprob += self.transition_log_probability(label, *history) 945 joint_logprob += self.emission_log_probability(em, label) 946 947 if self.order > 1: 948 # Move the history along 949 history.pop() 950 history.appendleft(label) 951 return joint_logprob
952
953 - def get_all_ngrams(self, n):
954 """ 955 Returns all possible ngrams of length n composed of elements 956 from this model's domain. 957 958 """ 959 if n == 0: 960 return [[]] 961 elif n == 1: 962 return [[lab] for lab in self.label_dom] 963 else: 964 return sum([[[label]+copy.deepcopy(ngram) for label in self.label_dom] for ngram in self.get_all_ngrams(n-1)], [])
965
966 - def precompute(self):
967 """ 968 Creates a L{PrecomputedNgramModel} from this NgramModel. 969 970 """ 971 trans_mat = self.get_transition_matrix() 972 return PrecomputedNgramModel( 973 order = self.order, 974 label_counts = self.label_counts, 975 emission_counts = self.emission_counts, \ 976 estimator = self._estimator, \ 977 backoff_model = self.backoff_model, \ 978 label_dom = self.label_dom, 979 emission_dom = self.emission_dom, 980 transition_matrix = trans_mat)
981
982 - def to_picklable_dict(self):
983 """ 984 Produces a picklable representation of model as a dict. 985 You can't just pickle the object directly because some of the 986 NLTK classes can't be pickled. You can pickle this dict and 987 reconstruct the model using NgramModel.from_picklable_dict(dict). 988 989 """ 990 from jazzparser.utils.nltk.storage import object_to_dict 991 if self.backoff_model is None: 992 backoff = None 993 else: 994 backoff = self.backoff_model.to_picklable_dict() 995 996 return { 997 'order' : self.order, 998 'label_dom' : self.label_dom, 999 'emission_dom' : self.emission_dom, 1000 'label_counts' : object_to_dict(self.label_counts), 1001 'emission_counts' : object_to_dict(self.emission_counts), 1002 'estimator' : self._estimator, 1003 'backoff_model' : backoff, 1004 }
1005 1006 @staticmethod
1007 - def from_picklable_dict(data, *args, **kwargs):
1008 """ 1009 Reproduces an n-gram model that was converted to a picklable 1010 form using to_picklable_dict. 1011 1012 Extra args/kwargs are passed to the class constructor. 1013 1014 """ 1015 from jazzparser.utils.nltk.storage import dict_to_object 1016 1017 # Instantiate a different class (subclass) if given 1018 cls = kwargs.pop('cls', NgramModel) 1019 1020 if data['backoff_model'] is None: 1021 backoff = None 1022 else: 1023 # Recursively construct the backoff model 1024 # This is always an NgramModel, not the subclass 1025 backoff = NgramModel.from_picklable_dict(data['backoff_model']) 1026 1027 return cls(data['order'], 1028 dict_to_object(data['label_counts']), 1029 dict_to_object(data['emission_counts']), 1030 data['estimator'], 1031 backoff, 1032 data['label_dom'], 1033 data['emission_dom'], 1034 *args, **kwargs)
1035 1036 ####################################### 1037 ##### Old methods, now removed
1038 - def forward_log_probabilities(self, sequence, normalize=True):
1039 """ 1040 B{Removed.} Use L{normal_forward_probabilities} and take logs if 1041 you're happy with normalized probabilities. Otherwise, 1042 L{normal_forward_probabilities} needs to be made to return the 1043 sums it normalizes by. 1044 1045 """ 1046 raise NotImplementedError, "deprecated"
1047
1048 - def forward_probabilities(self, sequence, normalize=True):
1049 """ B{Removed.} See note on L{forward_log_probabilities}. """ 1050 raise NotImplementedError, "deprecated"
1051
1052 - def backward_log_probabilities(self, sequence, normalize=True):
1053 """ B{Removed.} See L{forward_log_probabilities} """ 1054 raise NotImplementedError, "deprecated"
1055
1056 - def backward_probabilities(self, sequence, normalize=True):
1057 """ B{Removed.} See L{forward_log_probabilities} """ 1058 raise NotImplementedError, "deprecated"
1059
1060 - def forward_backward_log_probabilities(self, sequence, normalize=True):
1061 """ 1062 B{Removed.} See L{forward_log_probabilities}. Use either 1063 L{normal_forward_backward_probabilities} or L{gamma_probabilities}. 1064 1065 """ 1066 raise NotImplementedError, "deprecated"
1067
1068 - def forward_backward_probabilities(self, *args, **kwargs):
1069 """ B{Removed.} See L{forward_backward_log_probabilities} """ 1070 raise NotImplementedError, "deprecated"
1071
1072 -class PrecomputedNgramModel(NgramModel):
1073 """ 1074 Overrides parts of L{NgramModel} to provide exactly the same interface, but 1075 stores the precomputed transition matrix and uses this to provide 1076 transition probabilities. This makes using the model a lot faster if 1077 you're doing things like forward-backward computations, since it needs the 1078 full transition matrix anyway. This processing is effectively pushed to 1079 training time instead of testing time. 1080 1081 """
1082 - def __init__(self, *args, **kwargs):
1083 transition_matrix = kwargs.pop('transition_matrix') 1084 NgramModel.__init__(self, *args, **kwargs) 1085 # Use the precomputed transition matrix to get transition probabilities 1086 self.transition_matrix = transition_matrix 1087 # Prepare a dictionary mapping ngrams to their matrix indices 1088 self._indices = dict( 1089 [(tuple([self.label_dom[s] for s in ngram]), tuple(ngram)) 1090 for ngram in _all_indices(self.order, len(self.label_dom))])
1091 1092 @staticmethod
1093 - def train(*args, **kwargs):
1094 """ 1095 Just calls L{NgramModel}'s train method and converts the result to a 1096 PrecomputedNgramModel. 1097 1098 """ 1099 model = NgramModel.train(*args, **kwargs) 1100 return model.precompute()
1101
1102 - def transition_probability(self, *ngram):
1103 """ 1104 Like the superclass, but read off from the precomputed matrix. 1105 1106 @see: NgramModel.transition_log_probability 1107 1108 """ 1109 if None in ngram: 1110 # Can't read this from the matrix, since it doesn't store 1111 # initial and final transitions 1112 return NgramModel.transition_probability(self, *ngram) 1113 else: 1114 return self.transition_matrix[self._indices[ngram]]
1115
1116 - def transition_log_probability(self, *ngram):
1117 """ 1118 Like the superclass, but read off from the precomputed matrix. 1119 1120 @see: NgramModel.transition_log_probability 1121 1122 """ 1123 if None in ngram: 1124 # Can't read this from the matrix, since it doesn't store 1125 # initial and final transitions 1126 return NgramModel.transition_log_probability(self, *ngram) 1127 else: 1128 return logprob(self.transition_matrix[self._indices[ngram]])
1129
1130 - def get_transition_matrix(self, transpose=False):
1131 """ 1132 Returns the precomputed transition matrix. 1133 1134 @see: NgramModel.get_transition_matrix 1135 1136 """ 1137 if transpose: 1138 # We still cache the transposed matrix as in NgramModel 1139 if self._transition_matrix_transpose_cache is None: 1140 # Ensure that the normal transition matrix has been generated 1141 mat = self.get_transition_matrix() 1142 # Tranpose it and copy it, so it's a real array, not a view 1143 self._transition_matrix_transpose_cache = numpy.copy(mat.transpose()) 1144 return self._transition_matrix_transpose_cache 1145 else: 1146 # Just return the precomputed matrix 1147 return self.transition_matrix
1148
1149 - def to_picklable_dict(self):
1150 from StringIO import StringIO 1151 # Use the super method to pickle everything it stores 1152 data = NgramModel.to_picklable_dict(self) 1153 1154 # Additionally store the transition matrix 1155 # Use Numpy's save method to convert it to binary data 1156 buff = StringIO() 1157 numpy.save(buff, self.transition_matrix) 1158 matrix_data = buff.getvalue() 1159 buff.close() 1160 1161 data['transition_matrix'] = matrix_data 1162 return data
1163 1164 @staticmethod
1165 - def from_picklable_dict(data):
1166 from StringIO import StringIO 1167 1168 if 'transition_matrix' not in data: 1169 # Probably trying to load from NgramModel data 1170 raise NgramError, "could not load PrecomputedNgramModel: "\ 1171 "no precomputed transition matrix has been stored" 1172 # Pull out the transition matrix 1173 matrix_data = data.pop('transition_matrix') 1174 # Load a Numpy array from this data 1175 buff = StringIO(matrix_data) 1176 transition_matrix = numpy.load(buff) 1177 buff.close() 1178 1179 model = NgramModel.from_picklable_dict(data, 1180 cls=PrecomputedNgramModel, 1181 # Extra kwarg to constructor 1182 transition_matrix=transition_matrix) 1183 return model
1184
1185 1186 -class NgramError(Exception):
1187 pass
1188