1 """Generic n-gram model implementation, using NLTK's probability handling.
2
3 NLTK provides an n-gram POS tagger, but it can only assign the most
4 likely tag sequence to observations. It doesn't calculate probabilities,
5 so is no use for our supertagger component. Here I provide a generic
6 n-gram model, using NLTK's probability stuff.
7
8 """
9 """
10 ============================== License ========================================
11 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
12
13 This file is part of The Jazz Parser.
14
15 The Jazz Parser is free software: you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation, either version 3 of the License, or
18 (at your option) any later version.
19
20 The Jazz Parser is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
24
25 You should have received a copy of the GNU General Public License
26 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
27
28 ============================ End license ======================================
29
30 """
31 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
32
33 import copy,sys,logging, math, warnings
34 import numpy
35 from numpy import sum as array_sum
36 from jazzparser.utils.nltk.probability import laplace_estimator, logprob, \
37 cond_prob_dist_to_dictionary_cond_prob_dist, sum_logs
38 from nltk.probability import ConditionalProbDist, sum_logs, DictionaryProbDist, \
39 ConditionalFreqDist, FreqDist
40
41 NGRAM_JOINER = "::"
42
43
44 logger = logging.getLogger("main_logger")
47 """
48 Utility function for methods below.
49 Converts all the probabilities in the time-state matrix from log
50 probabilities to probabilities.
51
52 """
53 return [dict([(label, 2**log_prob) for (label,log_prob) in timestep.items()]) for timestep in matrix]
54
56 """
57 Takes an n-dimensional matrix and sums over all but the first C{dims}
58 dimensions, returning a C{dims}-dimensional matrix. You can do this
59 easily in later versions of Numpy!
60
61 """
62
63 start_dims = matrix.ndim
64 for i in range(start_dims-dims):
65 matrix = numpy.sum(matrix, axis=-1)
66 return matrix
67
69 """
70 Function to generate all index n-grams of a given length
71
72 """
73 if length < 1:
74 return [[]]
75 else:
76 return sum([ [[i]+sub for i in range(num_labels)]
77 for sub in _all_indices(length-1, num_labels)], [])
78
80 """
81 A general n-gram model, trained on some labelled data.
82 Generate models using NgramModel.train().
83
84 Note that backoff assumes that an estimator with discounting is
85 being used. The backed off probabilities are scaled according to the
86 smoothing probability that would have been assigned to a zero
87 count. If you use this with MLE, the backoff is effectively disabled.
88
89 The estimator should be picklable. This means you can't use a
90 lambda, for example.
91
92 """
93 - def __init__(self, order, label_counts, emission_counts, \
94 estimator, \
95 backoff_model, \
96 label_dom, emission_dom):
97 from ..storage import is_picklable
98
99 self.order = order
100 self.label_dom = label_dom
101 self.num_labels = len(label_dom)
102 self.emission_dom = emission_dom
103 self.num_emissions = len(emission_dom)
104
105
106
107 if not is_picklable(estimator):
108 raise NgramError, "the estimator given for an n-gram model "\
109 "is not picklable. Use a statically-defined function as "\
110 "your estimator (got %s)" % type(estimator).__name__
111
112
113 self.label_counts = label_counts
114 self.emission_counts = emission_counts
115
116 self.label_dist = ConditionalProbDist(label_counts, estimator, self.num_labels+1)
117 self.emission_dist = ConditionalProbDist(emission_counts, estimator, self.num_emissions)
118
119
120 self._estimator = estimator
121
122 self.backoff_model = backoff_model
123
124 self.clear_cache()
125
127 return "<%s model,%d lab,%d em>" % (self.model_type, self.num_labels, self.num_emissions)
128
130 """
131 model_type gives a sensible name to the order of n-gram. Uses
132 unigram, bigram, trigram, or x-gram.
133
134 """
135 if self.order == 1:
136 return "unigram"
137 elif self.order == 2:
138 return "bigram"
139 elif self.order == 3:
140 return "trigram"
141 else:
142 return "%d-gram" % self.order
143 model_type = property(_get_model_type)
144
146 """
147 Subclasses that use caches should override this method to reset them
148 and call this in the superclass.
149
150 """
151 self._transition_matrix_cache = None
152 self._transition_matrix_transpose_cache = None
153
154 self._discount_cache = {}
155 self._emission_discount_cache = {}
156 self._transition_cache = {}
157
158 @staticmethod
159 - def train(order, labelled_data, label_dom, emission_dom=None, cutoff=0,
160 backoff_order=0, estimator=None, ignore_list=None, backoff_kwargs={}):
161 """
162 Trains the n-gram model given some labelled data. The data
163 should be in the form of a sequence of sequences of tuples (e,t),
164 where e is an emission and t a tag.
165
166 If ignore_list is given, all ngrams containing a label in
167 ignore_list will be ignored altogether. One use for this is to
168 ignore blank labels, so we don't learn the gaps in the labelled
169 data.
170
171 """
172 if order <= 0:
173 raise NgramError, "cannot construct a %d-gram model!" % order
174 if backoff_order >= order:
175 raise NgramError, "cannot construct a %d-gram model "\
176 "with %d-order backoff" % (order, backoff_order)
177
178 if estimator is None:
179 estimator = laplace_estimator
180
181 labelled_data = list(labelled_data)
182 label_dist = ConditionalFreqDist()
183 emission_dist = ConditionalFreqDist()
184 seen_emissions = set()
185
186 if ignore_list is not None:
187 ignores = set(ignore_list)
188 else:
189 ignores = set()
190
191 for training_sequence in labelled_data:
192
193
194 training_data = [(None,None)]*(order-1) + list(training_sequence) + [(None,None)]
195
196 for i in range(len(training_data)-order):
197
198
199 ngram = list(reversed(training_data[i:i+order]))
200 outputs,tags = zip(*ngram)
201
202 if ignores.isdisjoint(set(tags)):
203 tag = tags[0]
204
205 if tag is not None and outputs[0] is not None:
206
207 emission_dist[tag].inc(outputs[0])
208
209 seen_emissions.add(outputs[0])
210
211 context = tuple(tags[1:])
212 label_dist[context].inc(tag)
213
214
215 for label in emission_dist.conditions():
216 for sample in emission_dist[label].samples():
217 if emission_dist[label][sample] <= cutoff:
218 emission_dist[label][sample] = 0
219 for context in label_dist.conditions():
220 for sample in label_dist[context].samples():
221 if label_dist[context][sample] <= cutoff:
222 label_dist[context][sample] = 0
223
224
225 if emission_dom is None:
226 emission_dom = seen_emissions
227 else:
228 emission_dom = set(emission_dom)
229
230 if backoff_order > 0:
231 kwargs = {
232 "emission_dom" : emission_dom,
233 "cutoff" : cutoff,
234 "backoff_order" : backoff_order-1,
235 "estimator" : estimator,
236 "ignore_list" : ignores
237 }
238
239 kwargs.update(backoff_kwargs)
240
241 backoff_model = NgramModel.train(order-1,
242 labelled_data,
243 label_dom,
244 **kwargs)
245 else:
246 backoff_model = None
247
248
249 return NgramModel(order, label_dist, emission_dist, estimator, backoff_model, label_dom, emission_dom)
250
252 """
253 Returns the amount to scale the backed off probabilities by
254 when backing off to an order n-1 model in the given context.
255 This is presented as alpha in Jurafsky and Martin.
256
257 Returned as a base 2 log.
258
259 A more efficient way to do this would be to supply a function
260 of the context specific to the discounting technique. In this
261 case it wouldn't be necessary to sum the discounted mass each
262 time.
263
264 """
265 if context not in self._discount_cache:
266
267
268
269
270
271
272 seen_labels = set([lab for lab in self.label_dom+[None] if
273 self.label_counts[context][lab] > 0])
274 if len(seen_labels) == 0:
275
276 self._discount_cache[context] = 0.0
277 else:
278 unseen_labels = set(self.label_dom+[None]) - seen_labels
279
280
281 discounted_mass = self.label_dist[context].prob(
282 "%%% UNSEEN LABEL %%%") \
283 * len(unseen_labels)
284
285
286 backoff_context = context[:-1]
287 backoff_seen_mass = sum([
288 self.backoff_model.transition_probability(lab,
289 *backoff_context)
290 for lab in unseen_labels], 0.0)
291 self._discount_cache[context] = logprob(discounted_mass) - \
292 logprob(backoff_seen_mass)
293 return self._discount_cache[context]
294
296 """
297 Gives the probability P(label_i | label_(i-1), ..., label_(i-n)),
298 where the previous labels are given in the sequence
299 label_context. The context should be in reverse order, i.e.
300 with the most recent label at the start.
301
302 Note that this is the probability of a label given the previous
303 n-1 labels, which is the same as the probability of the n-gram
304 [label_i, ..., label_(i-n+1)] given the ngram
305 [label_(i-1), ..., label_(i-n)], since all but the last element
306 of the ngram overlaps with the condition, so has probability 1.
307
308 Caches all computed transition probabilities. This is
309 particularly important for backoff models. Many n-grams will
310 back off to the same (n-1)-gram and we don't want to recompute
311 the transition probability for that each time.
312
313 """
314
315 if ngram not in self._transition_cache:
316
317
318
319 if len(ngram) != self.order:
320 raise NgramError, "a %d-gram model can only give transition "\
321 "probabilities for a context of length %d. Tried to use "\
322 "%d" % (self.order, self.order-1, len(ngram)-1)
323
324
325 context = tuple(ngram[1:])
326 label = ngram[0]
327
328 if self.backoff_model is not None and self.label_counts[context][label] == 0:
329
330
331 scale = self._get_transition_backoff_scaler(context)
332
333 prob = scale + self.backoff_model.transition_log_probability(*(ngram[:-1]))
334 else:
335 prob = self.label_dist[context].logprob(label)
336
337 self._transition_cache[ngram] = prob
338 return self._transition_cache[ngram]
339
341 """
342 Debugging version of the above.
343 Use this only for debugging. It prints stuff out and doesn't cache.
344
345 """
346 if len(ngram) != self.order:
347 raise NgramError, "a %d-gram model can only give transition "\
348 "probabilities for a context of length %d. Tried to use "\
349 "%d" % (self.order, self.order-1, len(ngram)-1)
350
351
352 context = tuple(ngram[1:])
353 label = ngram[0]
354
355 if self.backoff_model is not None and self.label_counts[context][label] == 0:
356 print "Backing off: %s" % ", ".join([str(s) for s in ngram[:-1]])
357
358 scale = self._get_transition_backoff_scaler(context)
359 print " Scaler: %s, %f" % (", ".join([str(s) for s in context]),2**scale)
360
361 prob = scale + self.backoff_model.transition_log_probability_debug(*(ngram[:-1]))
362 else:
363 prob = self.label_dist[context].logprob(label)
364 print "Using %d-gram model probability: %f" % (self.order, 2**prob)
365 return prob
366
368 """
369 Wrapper for transition_log_probability to return a real
370 probability.
371
372 """
373 return 2**self.transition_log_probability(label, *label_context)
374
378
380 """
381 Gives the probability P(emission | label). Returned as a base 2
382 log.
383
384 """
385 return self.emission_dist[label].logprob(emission)
386
388 """
389 Wrapper for emission_log_probability to return an real
390 probability.
391
392 """
393 return 2**self.emission_log_probability(emission, label)
394
396 """
397 Returns a list consisting of this model and all the recursive
398 backoff models, until no more backoff models are provided.
399
400 """
401 if self.backoff_model is None:
402 return [self]
403 else:
404 return [self]+self.backoff_model.get_backoff_models()
405
407 """
408 Produces a matrix of the transition probabilities from every
409 (n-1)-gram to every state. Matrix indices are based on enumeration of
410 C{self.label_dom}. The matrix is returned as a numpy array.
411
412 The matrix has n dimensions. The first index is the current state,
413 the second the previous, etc. Thus,
414 matrix[i,j,...] = p(state_t = i | state_(t-1) = j, ...).
415
416 Probabilities are not logs.
417
418 """
419 if transpose:
420 if self._transition_matrix_transpose_cache is None:
421
422 mat = self.get_transition_matrix()
423
424 self._transition_matrix_transpose_cache = numpy.copy(mat.transpose())
425 return self._transition_matrix_transpose_cache
426 else:
427 if self._transition_matrix_cache is None:
428
429 N = len(self.label_dom)
430 shape = tuple([N]*self.order)
431 trans = numpy.zeros(shape, numpy.float64)
432
433 states = self.label_dom
434
435 index_sets = _all_indices(self.order, N)
436
437
438 for indices in index_sets:
439
440 state_labels = [states[i] for i in indices]
441 trans[tuple(indices)] = self.transition_probability(*state_labels)
442
443 self._transition_matrix_cache = trans
444 return self._transition_matrix_cache
445
447 """
448 Produces a matrix of the probability of each timestep's emission from
449 each state.
450
451 matrix[t,i] = p(o_t | state=i)
452
453 """
454 T = len(sequence)
455 N = len(self.label_dom)
456
457 ems = numpy.zeros((T, N), numpy.float64)
458
459
460 for t,emission in enumerate(sequence):
461
462 for i,label in enumerate(self.label_dom):
463 ems[t,i] = self.emission_probability(emission, label)
464 return ems
465
467 """
468 Uses the forward probabilities to decode the sequence and
469 returns a list of labels.
470
471 """
472 forwards = self.normal_forward_probabilities(sequence)
473
474 forwards = sum_matrix_dims(forwards)
475
476 states = numpy.argmax(forwards, axis=1)
477
478 return [self.label_dom[state] for state in states]
479
481 """
482 Return the forward probability matrix as a Numpy array.
483 This is equivalent to L{forward_probabilities}, but much faster.
484 It doesn't need logs because it's normalizing at each timestep.
485
486 @param seq_prob: return the log probability of the whole sequence
487 as well as the array (tuple of (array,logprob)).
488 @return: S-dimensional Numpy array, where S is the number of states.
489 The first dimension represents timesteps, the rest the (S-1)
490 states in the ngrams.
491
492 """
493 N = len(sequence)
494 states = self.label_dom
495
496
497 coefficients = numpy.zeros((N,), numpy.float64)
498
499
500 ems = self.get_emission_matrix(sequence)
501 trans = self.get_transition_matrix()
502
503 if self.order == 1:
504
505
506
507
508 forward_matrix = trans*ems
509
510 total_probs = numpy.sum(forward_matrix, axis=1)
511 for t in range(N):
512 coefficients[t] = logprob(total_probs[t])
513 forward_matrix[t] /= total_probs[t]
514 if seq_prob:
515 return forward_matrix, numpy.sum(coefficients)
516 else:
517 return forward_matrix
518
519
520 forward_matrix = numpy.zeros([N]+[len(states)]*(self.order-1), numpy.float64)
521
522
523
524 for time in range(self.order-1):
525
526
527 index_sets = _all_indices(time+1, len(states))
528
529 for indices in index_sets:
530
531 ngram = [self.label_dom[i] for i in indices]
532
533 ngram.extend([None]*(self.order-time-1))
534
535 indices.extend([0]*(self.order-time-2))
536 selector = tuple([time]+indices)
537
538
539 prob = self.transition_probability(*ngram)
540 if time > 0:
541
542 previous_state = tuple([time-1]+indices[1:]+[0])
543 prob *= forward_matrix[previous_state]
544 forward_matrix[selector] += prob
545
546
547 forward_matrix[time] = (forward_matrix[time].transpose() * ems[time]).transpose()
548
549 coefficients[time] = logprob(numpy.sum(forward_matrix[time]))
550 forward_matrix[time] /= numpy.sum(forward_matrix[time])
551
552 for time in range(self.order-1, N):
553
554
555
556 trans_step = forward_matrix[time-1] * trans
557
558
559 trans_step = numpy.sum(trans_step, axis=-1)
560
561 forward_matrix[time] = (trans_step.transpose() * ems[time]).transpose()
562
563 coefficients[time] = logprob(numpy.sum(forward_matrix[time]))
564 forward_matrix[time] /= numpy.sum(forward_matrix[time])
565
566 if seq_prob:
567 return forward_matrix, numpy.sum(coefficients)
568 else:
569 return forward_matrix
570
572 """
573 Return the backward probability matrices a Numpy array. This is
574 faster than L{backward_log_probabilities} because it uses Numpy
575 arrays with non-log probabilities and normalizes each timestep.
576
577 @return: matrix over timesteps and all labels, with a dimension for
578 each state in each (n-1)-gram: for time steps
579 i and labels k, P(word^(i+1), ..., word^T | label_k^i)
580
581 """
582 N = len(sequence)
583 states = self.label_dom
584 S = len(states)
585
586
587 trans_back = self.get_transition_matrix(transpose=True)
588 ems = self.get_emission_matrix(sequence)
589
590 if self.order == 1:
591
592
593
594 backward_matrix = numpy.ones((N,S), numpy.float64) / S
595 return backward_matrix
596
597
598
599 backward_matrix = numpy.zeros([N]+[S]*(self.order-1), numpy.float64)
600
601 index_sets = _all_indices(self.order-1, S)
602
603
604 for indices in index_sets:
605
606 ngram = [states[i] for i in indices]
607
608
609
610 selector = tuple([N-1]+list(reversed(indices)))
611 backward_matrix[selector] = self.transition_probability(None, *ngram)
612 backward_matrix[N-1] /= numpy.sum(backward_matrix[N-1])
613
614
615 for time in range(N-2, self.order-2, -1):
616
617
618
619
620
621
622
623
624
625 backward_matrix[time] = numpy.sum((trans_back *
626 backward_matrix[time+1]
627 * ems[time+1]),
628 axis=-1)
629
630 backward_matrix[time] /= numpy.sum(backward_matrix[time])
631
632
633 for time in range(self.order-2, -1, -1):
634
635 id_tgrams = _all_indices(time+1, S)
636
637 for id_tgram in id_tgrams:
638
639 tgram = [self.label_dom[i] for i in id_tgram]
640
641
642 nm1gram = tgram + [None]*(self.order-2-time)
643
644 id_nm1gram = id_tgram + [0]*(self.order-2-time)
645
646
647 summed_prob = 0.0
648 for next_id in range(S):
649
650 prob = self.transition_probability(states[next_id], *nm1gram)
651
652 prob *= ems[time+1, next_id]
653
654
655 next_state = tuple([time+1]+list(reversed(id_nm1gram[:-1]))+[next_id])
656 prob *= backward_matrix[next_state]
657
658 summed_prob += prob
659 backward_matrix[tuple([time]+list(reversed(id_nm1gram)))] = summed_prob
660
661
662 backward_matrix[time] /= numpy.sum(backward_matrix[time])
663
664
665
666 for time in range(N):
667 backward_matrix[time] = backward_matrix[time].transpose()
668
669 return backward_matrix
670
673 """
674 A faster implementation of L{forward_backward_probabilities} for the
675 case where we're normalizing, using Numpy and non-log probabilities.
676
677 This is still an S-dimensional matrix, not the state-occupation
678 probabilities. Use L{gamma_probabilities} to get that.
679
680 """
681 if forward is None:
682 forward = self.normal_forward_probabilities(sequence)
683 if backward is None:
684 backward = self.normal_backward_probabilities(sequence)
685
686
687 gamma = forward * backward
688
689 for time in range(len(sequence)):
690 gamma[time] /= numpy.sum(gamma[time])
691 return gamma
692
695 """
696 State-occupation probabilities.
697
698 @type dictionary: bool
699 @param dictionary: return a list of label dictionaries instead
700 of a numpy matrix
701
702 """
703 fwd_bwd = self.normal_forward_backward_probabilities(sequence,
704 forward=forward, backward=backward)
705
706 for i in range(fwd_bwd.ndim-2):
707 fwd_bwd = numpy.sum(fwd_bwd, axis=-1)
708
709 if dictionary:
710
711 gamma = []
712 for t in range(fwd_bwd.shape[0]):
713 dic = {}
714 for s,state in enumerate(self.label_dom):
715 dic[state] = fwd_bwd[t,s]
716 gamma.append(dic)
717 return gamma
718 else:
719 return fwd_bwd
720
722 """
723 Use the state occupation probabilities to decode a sequence.
724
725 """
726 gamma = self.gamma_probabilities(sequence, dictionary=True)
727 states = [max(timestep.items(), key=lambda x:x[1])[0] for timestep in gamma]
728 return states
729
731 """
732 Applies the Viterbi algorithm to return a single sequence of
733 states that maximises the probability of the sequence of
734 observations.
735
736 """
737 N = len(sequence)
738 viterbi_matrix = [{} for i in range(N)]
739 back_pointers = [{} for i in range(N)]
740 def _trace_pointers(time, start_state, n):
741 """
742 Trace back through the pointer matrix from state start_state
743 at time time, return a list of length n of the states
744 (backwards in time).
745 """
746 if n == 0:
747 return []
748 elif time == 0:
749 return [start_state] + ([None] * (n-1))
750 else:
751 state = back_pointers[time-1][start_state]
752 return [start_state] + _trace_pointers(time-1, state, n-1)
753
754
755 for label in self.label_dom:
756 viterbi_matrix[0][label] = self.emission_log_probability(sequence[0], label) \
757 + self.transition_log_probability(label, *([None]*(self.order-1)))
758
759
760 for i in range(1, N):
761 for label in self.label_dom:
762
763 em = self.emission_log_probability(sequence[i], label)
764 transitions = [ \
765 (self.transition_log_probability(label, *_trace_pointers(i-1, prev_label, self.order-1)) +
766 viterbi_matrix[i-1][prev_label],
767 prev_label) for prev_label in self.label_dom]
768
769 trans,prev_label = max(transitions)
770 viterbi_matrix[i][label] = trans + em
771
772 back_pointers[i-1][label] = prev_label
773
774
775 final_state = max([(prob,lab) for (lab,prob) in viterbi_matrix[N-1].items()])[1]
776 states = _trace_pointers(N-1, final_state, N)
777 return list(reversed(states))
778
780 """
781 Applies the N-best variant of the Viterbi algorithm to return
782 N sequences of states that maximize the probability of the
783 sequence of observations.
784
785 @see: Generalization of the Viterbi Algorithm, Foreman, 1993
786
787 @type N: int
788 @param N: number of label sequences to return (defaults to 2)
789 @rtype: list of (label sequence,probability) pairs
790 @return: ordered list of possible decodings, paired with their
791 Viterbi probabilities
792
793 """
794 length = len(sequence)
795 viterbi_matrix = [{} for i in range(length)]
796 back_pointers = [{} for i in range(length)]
797 def _trace_pointers(time, start_state, rank, n):
798 """
799 Trace back through the pointer matrix from state start_state
800 at time time, return, for a particular path (of the N-best we've
801 kept track of), a list of length n of the states (backwards
802 in time).
803
804 Returns a single path
805
806 """
807 if n == 0:
808 return []
809 elif n==1:
810 return [start_state]
811 elif time == 0:
812 return [start_state] + ([None] * (n-1))
813 else:
814 (prev_state,prev_rank) = back_pointers[time-1][start_state][rank]
815 return [start_state] + _trace_pointers(time-1, prev_state, prev_rank, n-1)
816
817
818 for t in range(self.order-1):
819 for l,label in enumerate(self.label_dom):
820
821 viterbi_matrix[t][l] = [ self.emission_log_probability(sequence[t], label) \
822 + self.transition_log_probability(label, *([None]*(self.order-1-t))) ]
823
824 trans = self.get_transition_matrix()
825
826
827 for i in range(self.order-1, length):
828 for l0,label in enumerate(self.label_dom):
829
830 em = self.emission_log_probability(sequence[i], label)
831 transitions = []
832 for l1,prev_label in enumerate(self.label_dom):
833
834
835 for prev_rank,prob in enumerate(viterbi_matrix[i-1][l1]):
836 selector = tuple([l0]+_trace_pointers(i-1, l1, prev_rank, self.order-1))
837 transitions.append(
838 (trans[selector] + prob, l1, prev_rank)
839 )
840
841 top_N = list(reversed(sorted(transitions)))[:N]
842
843 top_probs = [trans_prob+em for (trans_prob,lab,rank) in top_N]
844 viterbi_matrix[i][l0] = top_probs
845
846 back_pointers[i-1][l0] = [(lab,rank) for (trans_prob,lab,rank) in top_N]
847
848
849 final_states = []
850 for l1,prev_label in enumerate(self.label_dom):
851
852
853 for prev_rank,prob in enumerate(viterbi_matrix[length-1][l1]):
854 history = [self.label_dom[l] for l in \
855 _trace_pointers(length-1, l1, prev_rank, self.order-1)]
856 final_prob = self.transition_log_probability(None, *history)
857 final_states.append((prob+final_prob, l1, prev_rank))
858 final_states = list(reversed(sorted(final_states)))[:N]
859
860 path_probs = [
861 (list(reversed(_trace_pointers(length-1, lab, rank, length))), prob) \
862 for prob,lab,rank in final_states]
863
864 path_probs = [
865 ([self.label_dom[l] for l in path], prob)
866 for path,prob in path_probs]
867 return path_probs
868
870 """
871 Returns a probability matrix like that given by the
872 forward-backward algorithm which assigns prob 1.0 to the Viterbi
873 chosen tags and 0.0 to all others.
874
875 """
876 vit_seq = self.viterbi_decode(sequence)
877 return [dict([(label, 1.0 if label == vit_lab else 0.0) for label in self.label_dom]) \
878 for vit_lab in vit_seq]
879
880 - def generate(self, length=10, labels=False):
881 """
882 Generate a sequence of emissions at random using the n-gram
883 model.
884 If labels=True, outputs a sequence of (emission,label) pairs
885 indicating what hidden labels emitted the emissions.
886
887 The sequence will have maximum length C{length}, but may be shorter
888 if the model so determines.
889
890 """
891 from jazzparser.utils.probabilities import random_selection
892 sequence = []
893 label_seq = []
894 tag_context = [None]*(self.order-1)
895
896 for i in range(length):
897
898 trans_probs = {}
899 for lab in self.label_dom+[None]:
900 trans_probs[lab] = self.transition_probability(lab, *tag_context)
901
902
903 new_lab = random_selection(trans_probs.items())
904 if new_lab is None:
905
906 break
907
908 label_seq.append(new_lab)
909
910 if self.order > 1:
911 tag_context = [new_lab] + tag_context[:-1]
912
913
914 em_probs = {}
915 for em in self.emission_dom:
916 em_probs[em] = self.emission_probability(em, new_lab)
917
918 sequence.append(random_selection(em_probs.items()))
919
920 if labels:
921 return zip(sequence, label_seq)
922 else:
923 return sequence
924
926 """
927 Computes the joint probability that the model assigns to a sequence
928 and its gold standard labels.
929
930 Probability is a log, because we'd get underflow otherwise.
931
932 """
933 from collections import deque
934 labeled_data = zip(sequence, labels)
935
936
937 history = deque([None] * (self.order-1))
938
939
940 joint_logprob = 0.0
941
942 for em,label in labeled_data:
943
944 joint_logprob += self.transition_log_probability(label, *history)
945 joint_logprob += self.emission_log_probability(em, label)
946
947 if self.order > 1:
948
949 history.pop()
950 history.appendleft(label)
951 return joint_logprob
952
954 """
955 Returns all possible ngrams of length n composed of elements
956 from this model's domain.
957
958 """
959 if n == 0:
960 return [[]]
961 elif n == 1:
962 return [[lab] for lab in self.label_dom]
963 else:
964 return sum([[[label]+copy.deepcopy(ngram) for label in self.label_dom] for ngram in self.get_all_ngrams(n-1)], [])
965
967 """
968 Creates a L{PrecomputedNgramModel} from this NgramModel.
969
970 """
971 trans_mat = self.get_transition_matrix()
972 return PrecomputedNgramModel(
973 order = self.order,
974 label_counts = self.label_counts,
975 emission_counts = self.emission_counts, \
976 estimator = self._estimator, \
977 backoff_model = self.backoff_model, \
978 label_dom = self.label_dom,
979 emission_dom = self.emission_dom,
980 transition_matrix = trans_mat)
981
983 """
984 Produces a picklable representation of model as a dict.
985 You can't just pickle the object directly because some of the
986 NLTK classes can't be pickled. You can pickle this dict and
987 reconstruct the model using NgramModel.from_picklable_dict(dict).
988
989 """
990 from jazzparser.utils.nltk.storage import object_to_dict
991 if self.backoff_model is None:
992 backoff = None
993 else:
994 backoff = self.backoff_model.to_picklable_dict()
995
996 return {
997 'order' : self.order,
998 'label_dom' : self.label_dom,
999 'emission_dom' : self.emission_dom,
1000 'label_counts' : object_to_dict(self.label_counts),
1001 'emission_counts' : object_to_dict(self.emission_counts),
1002 'estimator' : self._estimator,
1003 'backoff_model' : backoff,
1004 }
1005
1006 @staticmethod
1035
1036
1037
1039 """
1040 B{Removed.} Use L{normal_forward_probabilities} and take logs if
1041 you're happy with normalized probabilities. Otherwise,
1042 L{normal_forward_probabilities} needs to be made to return the
1043 sums it normalizes by.
1044
1045 """
1046 raise NotImplementedError, "deprecated"
1047
1049 """ B{Removed.} See note on L{forward_log_probabilities}. """
1050 raise NotImplementedError, "deprecated"
1051
1053 """ B{Removed.} See L{forward_log_probabilities} """
1054 raise NotImplementedError, "deprecated"
1055
1057 """ B{Removed.} See L{forward_log_probabilities} """
1058 raise NotImplementedError, "deprecated"
1059
1061 """
1062 B{Removed.} See L{forward_log_probabilities}. Use either
1063 L{normal_forward_backward_probabilities} or L{gamma_probabilities}.
1064
1065 """
1066 raise NotImplementedError, "deprecated"
1067
1069 """ B{Removed.} See L{forward_backward_log_probabilities} """
1070 raise NotImplementedError, "deprecated"
1071
1073 """
1074 Overrides parts of L{NgramModel} to provide exactly the same interface, but
1075 stores the precomputed transition matrix and uses this to provide
1076 transition probabilities. This makes using the model a lot faster if
1077 you're doing things like forward-backward computations, since it needs the
1078 full transition matrix anyway. This processing is effectively pushed to
1079 training time instead of testing time.
1080
1081 """
1083 transition_matrix = kwargs.pop('transition_matrix')
1084 NgramModel.__init__(self, *args, **kwargs)
1085
1086 self.transition_matrix = transition_matrix
1087
1088 self._indices = dict(
1089 [(tuple([self.label_dom[s] for s in ngram]), tuple(ngram))
1090 for ngram in _all_indices(self.order, len(self.label_dom))])
1091
1092 @staticmethod
1093 - def train(*args, **kwargs):
1094 """
1095 Just calls L{NgramModel}'s train method and converts the result to a
1096 PrecomputedNgramModel.
1097
1098 """
1099 model = NgramModel.train(*args, **kwargs)
1100 return model.precompute()
1101
1103 """
1104 Like the superclass, but read off from the precomputed matrix.
1105
1106 @see: NgramModel.transition_log_probability
1107
1108 """
1109 if None in ngram:
1110
1111
1112 return NgramModel.transition_probability(self, *ngram)
1113 else:
1114 return self.transition_matrix[self._indices[ngram]]
1115
1117 """
1118 Like the superclass, but read off from the precomputed matrix.
1119
1120 @see: NgramModel.transition_log_probability
1121
1122 """
1123 if None in ngram:
1124
1125
1126 return NgramModel.transition_log_probability(self, *ngram)
1127 else:
1128 return logprob(self.transition_matrix[self._indices[ngram]])
1129
1131 """
1132 Returns the precomputed transition matrix.
1133
1134 @see: NgramModel.get_transition_matrix
1135
1136 """
1137 if transpose:
1138
1139 if self._transition_matrix_transpose_cache is None:
1140
1141 mat = self.get_transition_matrix()
1142
1143 self._transition_matrix_transpose_cache = numpy.copy(mat.transpose())
1144 return self._transition_matrix_transpose_cache
1145 else:
1146
1147 return self.transition_matrix
1148
1150 from StringIO import StringIO
1151
1152 data = NgramModel.to_picklable_dict(self)
1153
1154
1155
1156 buff = StringIO()
1157 numpy.save(buff, self.transition_matrix)
1158 matrix_data = buff.getvalue()
1159 buff.close()
1160
1161 data['transition_matrix'] = matrix_data
1162 return data
1163
1164 @staticmethod
1166 from StringIO import StringIO
1167
1168 if 'transition_matrix' not in data:
1169
1170 raise NgramError, "could not load PrecomputedNgramModel: "\
1171 "no precomputed transition matrix has been stored"
1172
1173 matrix_data = data.pop('transition_matrix')
1174
1175 buff = StringIO(matrix_data)
1176 transition_matrix = numpy.load(buff)
1177 buff.close()
1178
1179 model = NgramModel.from_picklable_dict(data,
1180 cls=PrecomputedNgramModel,
1181
1182 transition_matrix=transition_matrix)
1183 return model
1184
1188