jazzparser.utils.probabilities

1 """Utilities for processing probability distributions. 2 3 """ 4 """ 5 ============================== License ======================================== 6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 7 8 This file is part of The Jazz Parser. 9 10 The Jazz Parser is free software: you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation, either version 3 of the License, or 13 (at your option) any later version. 14 15 The Jazz Parser is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 22 23 ============================ End license ====================================== 24 25 """ 26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 27

28 -def batch_sizes(probabilities, batch_ratio, max_batch=0):

29 """ 30 Given a list of probabilities, returns a list of integers that 31 represents the ordered sizes of batches (from high to low 32 probability) required so that the ratio between the highest and 33 lowest probabilities in the batch is at most batch_ratio. 34 35 If C{max_batch} is non-zero, a maximum of C{max_batch} items are included 36 in each batch. 37 38 """ 39 # Make sure we go through the probabilities in order, high to low 40 probs = reversed(sorted(probabilities)) 41 sizes = [] 42 batch_top = None 43 batch_length = 1 44 for prob in probs: 45 if batch_top is None: 46 batch_top = prob 47 else: 48 # Check the probability ratio and whether we've filled the max size 49 if prob >= batch_ratio * batch_top and \ 50 (not max_batch or batch_length < max_batch): 51 # This is a high enough probability to go in this batch 52 batch_length += 1 53 else: 54 # Start a new batch 55 sizes.append(batch_length) 56 # This is now the top one in the batch 57 batch_length = 1 58 batch_top = prob 59 sizes.append(batch_length) 60 return sizes

61

62 -def beamed_batch_sizes(probabilities, batch_ratio, max_batch=0):

63 """ 64 An alternative to L{batch_sizes} which processes many lists of 65 probabilities at once (i.e. one per word). 66 67 The lists returned contain the number 68 of values that should be returned in each batch to represent a 69 progressively widening probability beam, which is the same across 70 all the words. The main difference between this and applying 71 L{batch_sizes} to each word independently is that this may result 72 in some words having some batches empty, if the beam is not wide 73 enough to catch the next highest probability. 74 75 The one exception to this is the first batch, which will always 76 contain at least one value, even if this means effectively lowering 77 the beam for that one word. 78 79 Every batch will include at least one value on at least one word. 80 81 It is assumed that every word has at least one value. 82 83 If C{max_batch} is non-zero, a maximum of C{max_batch} items are included 84 in each batch for each word. 85 86 @type probabilities: list of lists of floats 87 @param probabilities: a list for each word of the probabilities 88 to batch up. 89 @type batch_ratio: float 90 @param batch_ratio: maximum ratio between the highest probability in a 91 particular batch and the lowest (over all words). 92 @rtype: list of lists of ints 93 @return: the list of sizes of each batch for each word. 94 95 """ 96 words = len(probabilities) 97 # Copy the input to use as a queue 98 queues = [list(reversed(sorted(probs))) for probs in probabilities] 99 # Build up a list of batch sizes 100 batch_lists = [[] for i in range(words)] 101 102 first_batch = True 103 104 # Keep making more batches until the queues are all empty 105 while sum([len(q) for q in queues]) > 0: 106 # Get the highest probability still waiting to be taken 107 beam_top = max([q[0] for q in queues if len(q) > 0]) 108 beam_bottom = batch_ratio * beam_top 109 # For each word, add all values that lie within the beam and 110 # remove them from the queue 111 for word in range(words): 112 vals_in_batch = len([prob for prob in queues[word] if prob >= beam_bottom]) 113 # Don't take more than max_batch at once (if given) 114 if max_batch: 115 vals_in_batch = min(vals_in_batch, max_batch) 116 queues[word] = queues[word][vals_in_batch:] 117 batch_lists[word].append(vals_in_batch) 118 119 # If this is the first batch, check every word got at least one 120 if first_batch: 121 first_batch = False 122 for word in range(words): 123 if batch_lists[word][0] == 0: 124 # Nothing was assigned to this word - give it many 125 # values as have the highest probability (probably 126 # just one) 127 vals_in_batch = len([prob for prob in queues[word] if prob == queues[word][0]]) 128 queues[word] = queues[word][vals_in_batch:] 129 batch_lists[word][0] = vals_in_batch 130 return batch_lists

131

132 -def random_selection(obj_probs, normalize=False):

133 """ 134 Given a distribution in the form of (object,prob) pairs, picks an 135 object randomly with probabilities according to the distribution. 136 137 The probabilities should obviously sum to 1.0. If they don't, an 138 error might be raised, but it might go unnoticed. Make sure your 139 distributions are healthy. 140 Alternatively, set normalize=True and the probabilities will be 141 scaled into a real probability distribution. 142 143 The objects may be of any type. 144 145 """ 146 import random 147 if normalize: 148 total_prob = sum(pair[1] for pair in obj_probs) 149 # Rescale probs so they sum to 1.0 150 obj_probs = [(obj,prob/total_prob) for (obj,prob) in obj_probs] 151 # Pick a number between 0 and 1 152 rand_num = random.random() 153 # Step through the cumulative probability dist until we get to this number 154 cum_prob = 0.0 155 for obj,prob in obj_probs: 156 cum_prob += prob 157 if cum_prob >= rand_num: 158 return obj 159 raise ProbabilityError, "probability distribution should reach 1.0, but didn't get as far as %f" % rand_num

160

161 -class ProbabilityError(Exception):

162 pass

163

Source Code for Module jazzparser.utils.probabilities