1 """Utilities for processing probability distributions.
2
3 """
4 """
5 ============================== License ========================================
6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
7
8 This file is part of The Jazz Parser.
9
10 The Jazz Parser is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 The Jazz Parser is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
22
23 ============================ End license ======================================
24
25 """
26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
27
28 -def batch_sizes(probabilities, batch_ratio, max_batch=0):
29 """
30 Given a list of probabilities, returns a list of integers that
31 represents the ordered sizes of batches (from high to low
32 probability) required so that the ratio between the highest and
33 lowest probabilities in the batch is at most batch_ratio.
34
35 If C{max_batch} is non-zero, a maximum of C{max_batch} items are included
36 in each batch.
37
38 """
39
40 probs = reversed(sorted(probabilities))
41 sizes = []
42 batch_top = None
43 batch_length = 1
44 for prob in probs:
45 if batch_top is None:
46 batch_top = prob
47 else:
48
49 if prob >= batch_ratio * batch_top and \
50 (not max_batch or batch_length < max_batch):
51
52 batch_length += 1
53 else:
54
55 sizes.append(batch_length)
56
57 batch_length = 1
58 batch_top = prob
59 sizes.append(batch_length)
60 return sizes
61
63 """
64 An alternative to L{batch_sizes} which processes many lists of
65 probabilities at once (i.e. one per word).
66
67 The lists returned contain the number
68 of values that should be returned in each batch to represent a
69 progressively widening probability beam, which is the same across
70 all the words. The main difference between this and applying
71 L{batch_sizes} to each word independently is that this may result
72 in some words having some batches empty, if the beam is not wide
73 enough to catch the next highest probability.
74
75 The one exception to this is the first batch, which will always
76 contain at least one value, even if this means effectively lowering
77 the beam for that one word.
78
79 Every batch will include at least one value on at least one word.
80
81 It is assumed that every word has at least one value.
82
83 If C{max_batch} is non-zero, a maximum of C{max_batch} items are included
84 in each batch for each word.
85
86 @type probabilities: list of lists of floats
87 @param probabilities: a list for each word of the probabilities
88 to batch up.
89 @type batch_ratio: float
90 @param batch_ratio: maximum ratio between the highest probability in a
91 particular batch and the lowest (over all words).
92 @rtype: list of lists of ints
93 @return: the list of sizes of each batch for each word.
94
95 """
96 words = len(probabilities)
97
98 queues = [list(reversed(sorted(probs))) for probs in probabilities]
99
100 batch_lists = [[] for i in range(words)]
101
102 first_batch = True
103
104
105 while sum([len(q) for q in queues]) > 0:
106
107 beam_top = max([q[0] for q in queues if len(q) > 0])
108 beam_bottom = batch_ratio * beam_top
109
110
111 for word in range(words):
112 vals_in_batch = len([prob for prob in queues[word] if prob >= beam_bottom])
113
114 if max_batch:
115 vals_in_batch = min(vals_in_batch, max_batch)
116 queues[word] = queues[word][vals_in_batch:]
117 batch_lists[word].append(vals_in_batch)
118
119
120 if first_batch:
121 first_batch = False
122 for word in range(words):
123 if batch_lists[word][0] == 0:
124
125
126
127 vals_in_batch = len([prob for prob in queues[word] if prob == queues[word][0]])
128 queues[word] = queues[word][vals_in_batch:]
129 batch_lists[word][0] = vals_in_batch
130 return batch_lists
131
133 """
134 Given a distribution in the form of (object,prob) pairs, picks an
135 object randomly with probabilities according to the distribution.
136
137 The probabilities should obviously sum to 1.0. If they don't, an
138 error might be raised, but it might go unnoticed. Make sure your
139 distributions are healthy.
140 Alternatively, set normalize=True and the probabilities will be
141 scaled into a real probability distribution.
142
143 The objects may be of any type.
144
145 """
146 import random
147 if normalize:
148 total_prob = sum(pair[1] for pair in obj_probs)
149
150 obj_probs = [(obj,prob/total_prob) for (obj,prob) in obj_probs]
151
152 rand_num = random.random()
153
154 cum_prob = 0.0
155 for obj,prob in obj_probs:
156 cum_prob += prob
157 if cum_prob >= rand_num:
158 return obj
159 raise ProbabilityError, "probability distribution should reach 1.0, but didn't get as far as %f" % rand_num
160
163