jazzparser.utils.nltk.ngram.model

80 """ 81 A general n-gram model, trained on some labelled data. 82 Generate models using NgramModel.train(). 83 84 Note that backoff assumes that an estimator with discounting is 85 being used. The backed off probabilities are scaled according to the 86 smoothing probability that would have been assigned to a zero 87 count. If you use this with MLE, the backoff is effectively disabled. 88 89 The estimator should be picklable. This means you can't use a 90 lambda, for example. 91 92 """

93 - def __init__(self, order, label_counts, emission_counts, \ 94 estimator, \ 95 backoff_model, \ 96 label_dom, emission_dom):

97 from ..storage import is_picklable 98 99 self.order = order 100 self.label_dom = label_dom 101 self.num_labels = len(label_dom) 102 self.emission_dom = emission_dom 103 self.num_emissions = len(emission_dom) 104 105 # Check the estimator's picklable 106 # Otherwise we'll get mysterious errors when storing the model 107 if not is_picklable(estimator): 108 raise NgramError, "the estimator given for an n-gram model "\ 109 "is not picklable. Use a statically-defined function as "\ 110 "your estimator (got %s)" % type(estimator).__name__ 111 112 # Keep the freq dists to get the raw counts 113 self.label_counts = label_counts 114 self.emission_counts = emission_counts 115 # Construct a probability distribution from these frequency dists 116 self.label_dist = ConditionalProbDist(label_counts, estimator, self.num_labels+1) 117 self.emission_dist = ConditionalProbDist(emission_counts, estimator, self.num_emissions) 118 # Keep hold of this for when we store the model 119 # It doesn't get used otherwise, as it was only to construct the prob dists 120 self._estimator = estimator 121 122 self.backoff_model = backoff_model 123 124 self.clear_cache()

125

126 - def __repr__(self):

127 return "<%s model,%d lab,%d em>" % (self.model_type, self.num_labels, self.num_emissions)

128

129 - def _get_model_type(self):

130 """ 131 model_type gives a sensible name to the order of n-gram. Uses 132 unigram, bigram, trigram, or x-gram. 133 134 """ 135 if self.order == 1: 136 return "unigram" 137 elif self.order == 2: 138 return "bigram" 139 elif self.order == 3: 140 return "trigram" 141 else: 142 return "%d-gram" % self.order

143 model_type = property(_get_model_type) 144

145 - def clear_cache(self):

146 """ 147 Subclasses that use caches should override this method to reset them 148 and call this in the superclass. 149 150 """ 151 self._transition_matrix_cache = None 152 self._transition_matrix_transpose_cache = None 153 # These will be filled as we access probabilities 154 self._discount_cache = {} 155 self._emission_discount_cache = {} 156 self._transition_cache = {}

157 158 @staticmethod

159 - def train(order, labelled_data, label_dom, emission_dom=None, cutoff=0, 160 backoff_order=0, estimator=None, ignore_list=None, backoff_kwargs={}):

161 """ 162 Trains the n-gram model given some labelled data. The data 163 should be in the form of a sequence of sequences of tuples (e,t), 164 where e is an emission and t a tag. 165 166 If ignore_list is given, all ngrams containing a label in 167 ignore_list will be ignored altogether. One use for this is to 168 ignore blank labels, so we don't learn the gaps in the labelled 169 data. 170 171 """ 172 if order <= 0: 173 raise NgramError, "cannot construct a %d-gram model!" % order 174 if backoff_order >= order: 175 raise NgramError, "cannot construct a %d-gram model "\ 176 "with %d-order backoff" % (order, backoff_order) 177 178 if estimator is None: 179 estimator = laplace_estimator 180 181 labelled_data = list(labelled_data) 182 label_dist = ConditionalFreqDist() 183 emission_dist = ConditionalFreqDist() 184 seen_emissions = set() 185 186 if ignore_list is not None: 187 ignores = set(ignore_list) 188 else: 189 ignores = set() 190 191 for training_sequence in labelled_data: 192 # Add blanks to the start of the sequence to get our first n-grams 193 # Add a blank to the end to get the final probabilities 194 training_data = [(None,None)]*(order-1) + list(training_sequence) + [(None,None)] 195 # Add counts to the model 196 for i in range(len(training_data)-order): 197 # This is actually a full ngram, to give us all states for 198 # the transition 199 ngram = list(reversed(training_data[i:i+order])) 200 outputs,tags = zip(*ngram) 201 # Check whether the tags include any of those we should be ignoring 202 if ignores.isdisjoint(set(tags)): 203 tag = tags[0] 204 # Don't count emissions from None 205 if tag is not None and outputs[0] is not None: 206 # Emission count 207 emission_dist[tag].inc(outputs[0]) 208 # Keep track of what emissions we've seen 209 seen_emissions.add(outputs[0]) 210 # Labelled ngram count 211 context = tuple(tags[1:]) 212 label_dist[context].inc(tag) 213 214 # Apply low-count cutoff to both distributions 215 for label in emission_dist.conditions(): 216 for sample in emission_dist[label].samples(): 217 if emission_dist[label][sample] <= cutoff: 218 emission_dist[label][sample] = 0 219 for context in label_dist.conditions(): 220 for sample in label_dist[context].samples(): 221 if label_dist[context][sample] <= cutoff: 222 label_dist[context][sample] = 0 223 224 # If no domain of emissions was given, use what we've seen in the data 225 if emission_dom is None: 226 emission_dom = seen_emissions 227 else: 228 emission_dom = set(emission_dom) 229 230 if backoff_order > 0: 231 kwargs = { 232 "emission_dom" : emission_dom, 233 "cutoff" : cutoff, 234 "backoff_order" : backoff_order-1, 235 "estimator" : estimator, 236 "ignore_list" : ignores 237 } 238 # Add in any args we've been told to use 239 kwargs.update(backoff_kwargs) 240 # Construct a model to back off to 241 backoff_model = NgramModel.train(order-1, 242 labelled_data, 243 label_dom, 244 **kwargs) 245 else: 246 backoff_model = None 247 248 # Return an actual model as trained above 249 return NgramModel(order, label_dist, emission_dist, estimator, backoff_model, label_dom, emission_dom)

250

251 - def _get_transition_backoff_scaler(self, context):

252 """ 253 Returns the amount to scale the backed off probabilities by 254 when backing off to an order n-1 model in the given context. 255 This is presented as alpha in Jurafsky and Martin. 256 257 Returned as a base 2 log. 258 259 A more efficient way to do this would be to supply a function 260 of the context specific to the discounting technique. In this 261 case it wouldn't be necessary to sum the discounted mass each 262 time. 263 264 """ 265 if context not in self._discount_cache: 266 # The prob mass reserved for unseen events can be computed by 267 # summing probabilities over all seen events and subtracting 268 # from 1. 269 # Our discounting model distributes this probability evenly over 270 # the unseen events, so we can compute the discounted mass by 271 # getting the probability of one unseen event and multiplying it. 272 seen_labels = set([lab for lab in self.label_dom+[None] if 273 self.label_counts[context][lab] > 0]) 274 if len(seen_labels) == 0: 275 # Not seen anything in this context. All mass is discounted! 276 self._discount_cache[context] = 0.0 277 else: 278 unseen_labels = set(self.label_dom+[None]) - seen_labels 279 # Try getting some event that won't have been seen 280 # Compute how much mass is reserved for unseen events 281 discounted_mass = self.label_dist[context].prob( 282 "%%% UNSEEN LABEL %%%") \ 283 * len(unseen_labels) 284 # Compute how much probability the n-1 order model assigns to 285 # things unseen by this model 286 backoff_context = context[:-1] 287 backoff_seen_mass = sum([ 288 self.backoff_model.transition_probability(lab, 289 *backoff_context) 290 for lab in unseen_labels], 0.0) 291 self._discount_cache[context] = logprob(discounted_mass) - \ 292 logprob(backoff_seen_mass) 293 return self._discount_cache[context]

294

295 - def transition_log_probability(self, *ngram):

296 """ 297 Gives the probability P(label_i | label_(i-1), ..., label_(i-n)), 298 where the previous labels are given in the sequence 299 label_context. The context should be in reverse order, i.e. 300 with the most recent label at the start. 301 302 Note that this is the probability of a label given the previous 303 n-1 labels, which is the same as the probability of the n-gram 304 [label_i, ..., label_(i-n+1)] given the ngram 305 [label_(i-1), ..., label_(i-n)], since all but the last element 306 of the ngram overlaps with the condition, so has probability 1. 307 308 Caches all computed transition probabilities. This is 309 particularly important for backoff models. Many n-grams will 310 back off to the same (n-1)-gram and we don't want to recompute 311 the transition probability for that each time. 312 313 """ 314 # Cache the probabilities we've computed 315 if ngram not in self._transition_cache: 316 # Check the context is the right size 317 # This method gets called millions of times, but most times the 318 # value is cached. We only need check this the first time 319 if len(ngram) != self.order: 320 raise NgramError, "a %d-gram model can only give transition "\ 321 "probabilities for a context of length %d. Tried to use "\ 322 "%d" % (self.order, self.order-1, len(ngram)-1) 323 324 # Compute the probability as we've not seen it before 325 context = tuple(ngram[1:]) 326 label = ngram[0] 327 # Check whether we have enough observations of this whole n-gram 328 if self.backoff_model is not None and self.label_counts[context][label] == 0: 329 # Backoff to a lower-order model 330 # Work out how much prob mass is reserved for unseen events 331 scale = self._get_transition_backoff_scaler(context) 332 # Backoff and scale to fill the reserved mass 333 prob = scale + self.backoff_model.transition_log_probability(*(ngram[:-1])) 334 else: 335 prob = self.label_dist[context].logprob(label) 336 # Don't compute this one again 337 self._transition_cache[ngram] = prob 338 return self._transition_cache[ngram]

339

340 - def transition_log_probability_debug(self, *ngram):

341 """ 342 Debugging version of the above. 343 Use this only for debugging. It prints stuff out and doesn't cache. 344 345 """ 346 if len(ngram) != self.order: 347 raise NgramError, "a %d-gram model can only give transition "\ 348 "probabilities for a context of length %d. Tried to use "\ 349 "%d" % (self.order, self.order-1, len(ngram)-1) 350 351 # Compute the probability as we've not seen it before 352 context = tuple(ngram[1:]) 353 label = ngram[0] 354 # Check whether we have enough observations of this whole n-gram 355 if self.backoff_model is not None and self.label_counts[context][label] == 0: 356 print "Backing off: %s" % ", ".join([str(s) for s in ngram[:-1]]) 357 # Work out how much prob mass is reserved for unseen events 358 scale = self._get_transition_backoff_scaler(context) 359 print " Scaler: %s, %f" % (", ".join([str(s) for s in context]),2**scale) 360 # Backoff and scale by the reserved mass 361 prob = scale + self.backoff_model.transition_log_probability_debug(*(ngram[:-1])) 362 else: 363 prob = self.label_dist[context].logprob(label) 364 print "Using %d-gram model probability: %f" % (self.order, 2**prob) 365 return prob

366

367 - def transition_probability(self, label, *label_context):

368 """ 369 Wrapper for transition_log_probability to return a real 370 probability. 371 372 """ 373 return 2**self.transition_log_probability(label, *label_context)

374

375 - def transition_probability_debug(self, *ngram):

376 """ Debugging version of the above """ 377 return 2**self.transition_log_probability_debug(*ngram)

378

379 - def emission_log_probability(self, emission, label):

380 """ 381 Gives the probability P(emission | label). Returned as a base 2 382 log. 383 384 """ 385 return self.emission_dist[label].logprob(emission)

386

387 - def emission_probability(self, emission, label):

388 """ 389 Wrapper for emission_log_probability to return an real 390 probability. 391 392 """ 393 return 2**self.emission_log_probability(emission, label)

394

395 - def get_backoff_models(self):

396 """ 397 Returns a list consisting of this model and all the recursive 398 backoff models, until no more backoff models are provided. 399 400 """ 401 if self.backoff_model is None: 402 return [self] 403 else: 404 return [self]+self.backoff_model.get_backoff_models()

405

406 - def get_transition_matrix(self, transpose=False):

407 """ 408 Produces a matrix of the transition probabilities from every 409 (n-1)-gram to every state. Matrix indices are based on enumeration of 410 C{self.label_dom}. The matrix is returned as a numpy array. 411 412 The matrix has n dimensions. The first index is the current state, 413 the second the previous, etc. Thus, 414 matrix[i,j,...] = p(state_t = i | state_(t-1) = j, ...). 415 416 Probabilities are not logs. 417 418 """ 419 if transpose: 420 if self._transition_matrix_transpose_cache is None: 421 # Ensure that the normal transition matrix has been generated 422 mat = self.get_transition_matrix() 423 # Tranpose it and copy it, so it's a real array, not a view 424 self._transition_matrix_transpose_cache = numpy.copy(mat.transpose()) 425 return self._transition_matrix_transpose_cache 426 else: 427 if self._transition_matrix_cache is None: 428 # Compute the matrix from scratch, as we've not done it yet 429 N = len(self.label_dom) 430 shape = tuple([N]*self.order) 431 trans = numpy.zeros(shape, numpy.float64) 432 433 states = self.label_dom 434 # Build a list of lists, each n long, with all possible array indices 435 index_sets = _all_indices(self.order, N) 436 437 # Fill the matrix 438 for indices in index_sets: 439 # Get the ngram of state labels for this ngram of array indices 440 state_labels = [states[i] for i in indices] 441 trans[tuple(indices)] = self.transition_probability(*state_labels) 442 443 self._transition_matrix_cache = trans 444 return self._transition_matrix_cache

445

446 - def get_emission_matrix(self, sequence):

447 """ 448 Produces a matrix of the probability of each timestep's emission from 449 each state. 450 451 matrix[t,i] = p(o_t | state=i) 452 453 """ 454 T = len(sequence) 455 N = len(self.label_dom) 456 457 ems = numpy.zeros((T, N), numpy.float64) 458 459 # Set the probability for every timestep... 460 for t,emission in enumerate(sequence): 461 # ...given each state 462 for i,label in enumerate(self.label_dom): 463 ems[t,i] = self.emission_probability(emission, label) 464 return ems

465

466 - def decode_forward(self, sequence):

467 """ 468 Uses the forward probabilities to decode the sequence and 469 returns a list of labels. 470 471 """ 472 forwards = self.normal_forward_probabilities(sequence) 473 # Sum up ngram dimensions, leaving just time and state 474 forwards = sum_matrix_dims(forwards) 475 # Do an argmax over the possible states for each timestep 476 states = numpy.argmax(forwards, axis=1) 477 # Convert these indices into state labels 478 return [self.label_dom[state] for state in states]

479

480 - def normal_forward_probabilities(self, sequence, seq_prob=False):

481 """ 482 Return the forward probability matrix as a Numpy array. 483 This is equivalent to L{forward_probabilities}, but much faster. 484 It doesn't need logs because it's normalizing at each timestep. 485 486 @param seq_prob: return the log probability of the whole sequence 487 as well as the array (tuple of (array,logprob)). 488 @return: S-dimensional Numpy array, where S is the number of states. 489 The first dimension represents timesteps, the rest the (S-1) 490 states in the ngrams. 491 492 """ 493 N = len(sequence) 494 states = self.label_dom 495 496 # Create an array for the total logprobs 497 coefficients = numpy.zeros((N,), numpy.float64) 498 499 # Prepare the transition and emission matrices 500 ems = self.get_emission_matrix(sequence) 501 trans = self.get_transition_matrix() 502 503 if self.order == 1: 504 # We can do this quickly for unigrams and it saves dealing with 505 # dodgy cases of everything below 506 # We only use emission probabilities: transition probs 507 # just supply (unconditioned) priors 508 forward_matrix = trans*ems 509 # Sum the probabilities in each timestep 510 total_probs = numpy.sum(forward_matrix, axis=1) 511 for t in range(N): 512 coefficients[t] = logprob(total_probs[t]) 513 forward_matrix[t] /= total_probs[t] 514 if seq_prob: 515 return forward_matrix, numpy.sum(coefficients) 516 else: 517 return forward_matrix 518 519 # Initialize an empty matrix 520 forward_matrix = numpy.zeros([N]+[len(states)]*(self.order-1), numpy.float64) 521 522 # First fill in the first columns with histories padded with Nones 523 # In these columns, we only use a subset of the dimensions 524 for time in range(self.order-1): 525 # Get the indices corresponding to all possible sub-n-grams 526 # between the start and here 527 index_sets = _all_indices(time+1, len(states)) 528 529 for indices in index_sets: 530 # Get the actual ngram for these indices 531 ngram = [self.label_dom[i] for i in indices] 532 # Pad with Nones to a length of n 533 ngram.extend([None]*(self.order-time-1)) 534 # Pad the indices with 0s so we use all the dimensions 535 indices.extend([0]*(self.order-time-2)) 536 selector = tuple([time]+indices) 537 538 # Fill in with the (None-padded) transition probability 539 prob = self.transition_probability(*ngram) 540 if time > 0: 541 # Multiply by the prob we must have come from 542 previous_state = tuple([time-1]+indices[1:]+[0]) 543 prob *= forward_matrix[previous_state] 544 forward_matrix[selector] += prob 545 546 # Multiply in the emission probabilities 547 forward_matrix[time] = (forward_matrix[time].transpose() * ems[time]).transpose() 548 # Normalize 549 coefficients[time] = logprob(numpy.sum(forward_matrix[time])) 550 forward_matrix[time] /= numpy.sum(forward_matrix[time]) 551 552 for time in range(self.order-1, N): 553 # Multiplying, the previous timestep gets broadcast over the 554 # first axis of the transition matrix, which is what we need 555 # (that axis represents the most recent state in the n-gram) 556 trans_step = forward_matrix[time-1] * trans 557 # Sum probabilities over the last axis, i.e. the earliest state 558 # in the n-gram 559 trans_step = numpy.sum(trans_step, axis=-1) 560 # Multiply in the emission probabilities 561 forward_matrix[time] = (trans_step.transpose() * ems[time]).transpose() 562 # Normalize the timestep 563 coefficients[time] = logprob(numpy.sum(forward_matrix[time])) 564 forward_matrix[time] /= numpy.sum(forward_matrix[time]) 565 566 if seq_prob: 567 return forward_matrix, numpy.sum(coefficients) 568 else: 569 return forward_matrix

570

571 - def normal_backward_probabilities(self, sequence):

572 """ 573 Return the backward probability matrices a Numpy array. This is 574 faster than L{backward_log_probabilities} because it uses Numpy 575 arrays with non-log probabilities and normalizes each timestep. 576 577 @return: matrix over timesteps and all labels, with a dimension for 578 each state in each (n-1)-gram: for time steps 579 i and labels k, P(word^(i+1), ..., word^T | label_k^i) 580 581 """ 582 N = len(sequence) 583 states = self.label_dom 584 S = len(states) 585 586 # Prepare the transition and emission matrices 587 trans_back = self.get_transition_matrix(transpose=True) 588 ems = self.get_emission_matrix(sequence) 589 590 if self.order == 1: 591 # We can do this quickly for unigrams and it saves dealing with 592 # dodgy cases of everything below 593 # For unigrams, the backward probs are uniform 594 backward_matrix = numpy.ones((N,S), numpy.float64) / S 595 return backward_matrix 596 597 # For other cases, things are a little more complicated 598 # Initialize an empty matrix 599 backward_matrix = numpy.zeros([N]+[S]*(self.order-1), numpy.float64) 600 601 index_sets = _all_indices(self.order-1, S) 602 603 # Initialize from the end 604 for indices in index_sets: 605 # Make an (n-1)-gram for the context 606 ngram = [states[i] for i in indices] 607 # Put the probability of going to the end state from each context 608 # We read this from the model, since it's not in the trans matrix 609 # Note that it's stored in transposed form (see below) 610 selector = tuple([N-1]+list(reversed(indices))) 611 backward_matrix[selector] = self.transition_probability(None, *ngram) 612 backward_matrix[N-1] /= numpy.sum(backward_matrix[N-1]) 613 614 # Work backwards, filling in the matrix 615 for time in range(N-2, self.order-2, -1): 616 # Transposing the next timestep and the trans matrix causes their 617 # indices to be reversed, so that the timestep is broadcast over 618 # the last index (now the first) 619 # Then we multiply in the emission probabilities, broadcast over 620 # the first index (by doing it while transposed) and transpose 621 # back to correct the indices 622 # Summing over the 0th axis sums over possible next states 623 # (the last element of the ngram in the next timestep) 624 # To speed up the computations, we keep the whole lot transposed 625 backward_matrix[time] = numpy.sum((trans_back * 626 backward_matrix[time+1] 627 * ems[time+1]), 628 axis=-1) 629 # Normalize over the timestep 630 backward_matrix[time] /= numpy.sum(backward_matrix[time]) 631 632 # Fill in the first columns now, using None-padding histories 633 for time in range(self.order-2, -1, -1): 634 # Get the indices corresponding to all possible t-grams 635 id_tgrams = _all_indices(time+1, S) 636 637 for id_tgram in id_tgrams: 638 # Get the actual ngram for these indices 639 tgram = [self.label_dom[i] for i in id_tgram] 640 # Fill this out to an (n-1)-gram stretching before the 641 # start with Nones 642 nm1gram = tgram + [None]*(self.order-2-time) 643 # Pad indices with 0s 644 id_nm1gram = id_tgram + [0]*(self.order-2-time) 645 646 # Sum over the possible next states following this (n-1)-gram 647 summed_prob = 0.0 648 for next_id in range(S): 649 # Get transition probability from nm1gram to state next_id 650 prob = self.transition_probability(states[next_id], *nm1gram) 651 # Multiply in the emission probability for the next state 652 prob *= ems[time+1, next_id] 653 654 # Multiply by the bwd prob for the next state 655 next_state = tuple([time+1]+list(reversed(id_nm1gram[:-1]))+[next_id]) 656 prob *= backward_matrix[next_state] 657 # Sum this over the possible next states 658 summed_prob += prob 659 backward_matrix[tuple([time]+list(reversed(id_nm1gram)))] = summed_prob 660 661 # Normalize 662 backward_matrix[time] /= numpy.sum(backward_matrix[time]) 663 664 # Now transpose all the timestep matrices back so the ngrams 665 # correspond correctly to the indices 666 for time in range(N): 667 backward_matrix[time] = backward_matrix[time].transpose() 668 669 return backward_matrix

670

671 - def normal_forward_backward_probabilities(self, sequence, forward=None, 672 backward=None):

673 """ 674 A faster implementation of L{forward_backward_probabilities} for the 675 case where we're normalizing, using Numpy and non-log probabilities. 676 677 This is still an S-dimensional matrix, not the state-occupation 678 probabilities. Use L{gamma_probabilities} to get that. 679 680 """ 681 if forward is None: 682 forward = self.normal_forward_probabilities(sequence) 683 if backward is None: 684 backward = self.normal_backward_probabilities(sequence) 685 686 # This is simple with Numpy, since it does pointwise multiplication 687 gamma = forward * backward 688 # Now renormalize 689 for time in range(len(sequence)): 690 gamma[time] /= numpy.sum(gamma[time]) 691 return gamma

692

693 - def gamma_probabilities(self, sequence, dictionary=False, 694 forward=None, backward=None):

695 """ 696 State-occupation probabilities. 697 698 @type dictionary: bool 699 @param dictionary: return a list of label dictionaries instead 700 of a numpy matrix 701 702 """ 703 fwd_bwd = self.normal_forward_backward_probabilities(sequence, 704 forward=forward, backward=backward) 705 # Sum over all but the first two dimensions: time and last state in ngram 706 for i in range(fwd_bwd.ndim-2): 707 fwd_bwd = numpy.sum(fwd_bwd, axis=-1) 708 709 if dictionary: 710 # Convert to a list of dictionaries, keyed by label 711 gamma = [] 712 for t in range(fwd_bwd.shape[0]): 713 dic = {} 714 for s,state in enumerate(self.label_dom): 715 dic[state] = fwd_bwd[t,s] 716 gamma.append(dic) 717 return gamma 718 else: 719 return fwd_bwd

720

721 - def decode_gamma(self, sequence):

722 """ 723 Use the state occupation probabilities to decode a sequence. 724 725 """ 726 gamma = self.gamma_probabilities(sequence, dictionary=True) 727 states = [max(timestep.items(), key=lambda x:x[1])[0] for timestep in gamma] 728 return states

729

730 - def viterbi_decode(self, sequence):

731 """ 732 Applies the Viterbi algorithm to return a single sequence of 733 states that maximises the probability of the sequence of 734 observations. 735 736 """ 737 N = len(sequence) 738 viterbi_matrix = [{} for i in range(N)] 739 back_pointers = [{} for i in range(N)] 740 def _trace_pointers(time, start_state, n): 741 """ 742 Trace back through the pointer matrix from state start_state 743 at time time, return a list of length n of the states 744 (backwards in time). 745 """ 746 if n == 0: 747 return [] 748 elif time == 0: 749 return [start_state] + ([None] * (n-1)) 750 else: 751 state = back_pointers[time-1][start_state] 752 return [start_state] + _trace_pointers(time-1, state, n-1)

753 754 # Initialize the first column 755 for label in self.label_dom: 756 viterbi_matrix[0][label] = self.emission_log_probability(sequence[0], label) \ 757 + self.transition_log_probability(label, *([None]*(self.order-1))) 758 759 # Fill in the other columns 760 for i in range(1, N): 761 for label in self.label_dom: 762 # Work out the possible probabilities 763 em = self.emission_log_probability(sequence[i], label) 764 transitions = [ \ 765 (self.transition_log_probability(label, *_trace_pointers(i-1, prev_label, self.order-1)) + 766 viterbi_matrix[i-1][prev_label], 767 prev_label) for prev_label in self.label_dom] 768 # Choose the previous state that maximises the Viterbi probability 769 trans,prev_label = max(transitions) 770 viterbi_matrix[i][label] = trans + em 771 # Set the pointer so we know what state we used 772 back_pointers[i-1][label] = prev_label 773 774 # Choose the most probable state to end in 775 final_state = max([(prob,lab) for (lab,prob) in viterbi_matrix[N-1].items()])[1] 776 states = _trace_pointers(N-1, final_state, N) 777 return list(reversed(states))

Source Code for Module jazzparser.utils.nltk.ngram.model