jazzparser.parsers.pcfg.parser

1 """Probabilistic (PCFG) extension to the CKY parser implementation. 2 3 This implements the CCG equivalent of a basic PCFG parser. 4 It is modelling on Julia Hockenmaier's original basic model of 2001. 5 6 @note: This used to have to be used with the PCFG tagger, which took care of 7 assigning lexical probabilities, using the same model as the parser. This has 8 now changed. You can use this with any tagger to limit the choice of available 9 signs and the probabilities used will be taken by the parser from the 10 parsing model. 11 12 """ 13 """ 14 ============================== License ======================================== 15 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 16 17 This file is part of The Jazz Parser. 18 19 The Jazz Parser is free software: you can redistribute it and/or modify 20 it under the terms of the GNU General Public License as published by 21 the Free Software Foundation, either version 3 of the License, or 22 (at your option) any later version. 23 24 The Jazz Parser is distributed in the hope that it will be useful, 25 but WITHOUT ANY WARRANTY; without even the implied warranty of 26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 GNU General Public License for more details. 28 29 You should have received a copy of the GNU General Public License 30 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 
31 32 ============================ End license ====================================== 33 34 """ 35 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 36 37 from jazzparser.grammar import Grammar 38 from jazzparser.utils.input import assign_durations 39 from jazzparser.utils.base import filter_latex 40 from jazzparser.utils.options import ModuleOption, zero_to_one_float 41 from jazzparser.utils.strings import str_to_bool 42 from jazzparser.utils.nltk.probability import logprob 43 from .chart import PcfgChart 44 from jazzparser.parsers.cky.parser import CkyParser 45 from jazzparser.parsers.cky.tools import ChartTool 46 from jazzparser.parsers import ParserInitializationError, ParseError 47 from jazzparser.data import Chord 48 from .chart import ProbabilisticSignHashSet 49 from .tools import ProbabilisticResultListTool, ProbabilityTool, \ 50 ProbabilisticChartTool, ProbabilisticDerivationTraceTool 51 52 import sys, re 53 import logging 54 from scipy.misc import comb 55 56 from jazzparser import settings 57 58 # Get the logger from the logging system 59 logger = logging.getLogger("main_logger")

class PcfgParser(CkyParser):
    """
    CKY parser extended with probabilities taken from a trained PCFG model.

    On top of the behaviour inherited from C{CkyParser}, this parser:
     - builds a L{PcfgChart}, passing it the beam C{threshold}, the
       C{maxarc} limit and the loaded model;
     - attaches an inside probability and an overall probability (both as
       log probabilities) to every lexical sign added to the chart;
     - returns the results of L{parse} ordered by decreasing probability.

    The lexical probabilities come either from the tagger (if it declares
    C{LEXICAL_PROBABILITY} and C{nolex} is not set) or from the model.

    """
    shell_tools = CkyParser.shell_tools + [
        ProbabilisticResultListTool(),
        ProbabilityTool(),
        ProbabilisticChartTool(),
        ProbabilisticDerivationTraceTool(),
    ]

    PARSER_OPTIONS = CkyParser.PARSER_OPTIONS + [
        ModuleOption(
            'threshold',
            filter=zero_to_one_float,
            help_text=(
                "Ratio between the highest probability on an arc "
                "and the probability of a new sign below which the new "
                "sign will be ignored. (Lower throws away more and runs faster.)"
            ),
            usage=(
                "threshold=X, where X is a float between 0 and 1 (default %s)."
                % settings.PCFG_PARSER.DEFAULT_THRESHOLD
            ),
            default=settings.PCFG_PARSER.DEFAULT_THRESHOLD
        ),
        ModuleOption(
            'maxarc',
            filter=int,
            help_text=(
                "An absolute maximum on the number of signs on an "
                "arc. If an arc gets more signs than this even after the "
                "beam is applied, the lowest probability signs will just "
                "be dropped. Set to 0 to enforce no maximum at all."
            ),
            usage=(
                "maxarc=X, where X is an integer (default %d)."
                % settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE
            ),
            default=settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE
        ),
        ModuleOption(
            'model',
            filter=str,
            help_text="Name of a trained PCFG model to use for parsing.",
            usage="model=X, where X is the name of the model.",
            required=True
        ),
        ModuleOption(
            'partition',
            filter=int,
            help_text=(
                "If given, the numbered partition of the partitioned "
                "model will be used. (This generally involves appending the "
                "partition number to the model name.)"
            ),
            usage="partition=P, where P is an int",
            default=None
        ),
        ModuleOption(
            'nolex',
            filter=str_to_bool,
            help_text=(
                "Ignore lexical probabilities in model and force it "
                "not to be a lexical model, even if it was trained with "
                "lexical probabilities. Some input types force nolex to be "
                "true. In these cases, this option will be overridden. "
                "If the tagger is able to supply lexical probabilities, "
                "these will be used instead of the model's probabilities, but "
                "only if nolex=False"
            ),
            usage="nolex=B, where B is 'true' or 'false'",
            default=False,
        ),
    ]

    def _create_chart(self, *args, **kwargs):
        """
        Build the chart used for parsing and store it in C{self.chart}.

        The positional and keyword arguments are forwarded to L{PcfgChart}
        after the grammar; the C{threshold}, C{maxarc} and C{model} keyword
        arguments are always overwritten with this parser's own settings.

        @return: the newly created L{PcfgChart}

        """
        kwargs['threshold'] = self.options['threshold']
        kwargs['maxarc'] = self.options['maxarc']
        kwargs['model'] = self.model
        self.chart = PcfgChart(self.grammar, *args, **kwargs)
        return self.chart

    def _add_signs(self, offset):
        """
        Add lexical signs to the chart, annotating each with probabilities.

        Delegates to C{CkyParser._add_signs}, supplying a callback that sets
        C{inside_probability} and C{probability} (log probabilities) on
        every sign.

        @param offset: passed through unchanged to the superclass method
        @return: whatever the superclass method returns
        @raise ParseError: if the model's lexical probabilities are needed
            but the words are neither a list nor a string

        """
        def prob_adder(start, end, signtup, words):
            sign, tag, tag_prob = signtup
            if self.use_tagger_probs:
                # Use the tagger to get lexical probabilities
                lex_prob = self.tagger.lexical_probability(start, end, tag)
            else:
                # We might get multiple words here: use the first
                # This is not really a satisfactory solution: better would be
                # to get the tagger to tell us which word to use
                if isinstance(words, list):
                    word = words[0]
                elif not isinstance(words, basestring):
                    # The word is not a string, so we probably shouldn't be
                    # trying to get a lexical probability for it at all
                    raise ParseError(
                        "PCFG model is trying to assign lexical "
                        "probabilities to words, but the words aren't strings. "
                        "Maybe you should have disabled lexical probs wtih "
                        "parser option 'nolex'"
                    )
                else:
                    word = words
                # Consult the model to get the lexical probability of this sign
                lex_prob = self.model.inside_probability('leaf', sign, word)
            # Triangular number: nodes in the tree for multiword categories
            # This has the effect of penalizing multiword categories
            # proportionally to the number of tree nodes deriving the
            # categories they're competing with derived from single-word cats
            # NOTE(review): this penalty is applied to tagger-supplied
            # probabilities as well as model ones -- confirm this matches
            # the intended layout of the original code.
            tree_size = comb(end - start + 1, 2)
            lex_prob = lex_prob ** tree_size
            # Add the probabilities to the category
            sign.inside_probability = logprob(lex_prob)
            sign.probability = (
                logprob(self.model.outside_probability(sign))
                + sign.inside_probability
            )

        # Call the CkyParser's method to get the basic tuples
        return super(PcfgParser, self)._add_signs(offset, prob_adder=prob_adder)

    def __init__(self, *args, **kwargs):
        """
        Initialise the parser, load the PCFG model and decide where lexical
        probabilities will come from.

        All arguments are passed on to C{CkyParser.__init__}.

        @raise ParserInitializationError: if the grammar's formalism does
            not have a C{PcfgParser} attribute

        """
        super(PcfgParser, self).__init__(*args, **kwargs)
        # Check that the formalism in use provides what we need to use this parser
        f = self.grammar.formalism
        if not hasattr(f, 'PcfgParser'):
            raise ParserInitializationError(
                "PcfgParser is not compatible "
                "with the formalism %s" % f.get_name()
            )

        # Work out which model to load, taking the partition into account
        if self.options['partition'] is not None:
            model_name = type(self).partition_model_name(
                self.options['model'],
                self.options['partition']
            )
        else:
            model_name = self.options['model']
        # Load the model under the resolved name, so that the 'partition'
        # option really selects the partitioned model and the name logged
        # below is the name of the model that was loaded.
        self.model = f.PcfgModel.load_model(model_name)
        self.logger.info("Parsing model: %s" % model_name)

        self.use_tagger_probs = False
        self.model.lexical = True
        lexical_input = isinstance(self.tagger.wrapped_input,
                                   tuple(self.model.LEX_INPUT_TYPES))
        if not lexical_input and not self.tagger.LEXICAL_PROBABILITY:
            # Model has to be non-lexical, since it's not an allowed lexical type
            self.model.lexical = False
            if not self.options['nolex']:
                # The user wasn't expecting this: warn them
                self.logger.warn(
                    "Could not use a lexical PCFG model with "
                    "input type '%s'" % type(self.tagger.wrapped_input).__name__
                )
        elif self.options['nolex']:
            # Force the model to be non-lexical
            self.model.lexical = False
        elif self.tagger.LEXICAL_PROBABILITY:
            # The tagger can supply us with probabilities instead of the model's
            self.use_tagger_probs = True

    @staticmethod
    def partition_model_name(model_name, partition_number):
        """
        The model name to use when the given partition number is requested.
        The default implementation simply appends the number to the model
        name. Subclasses may override this if they want to do something
        different.

        @param model_name: base name of the model
        @param partition_number: integer partition index
        @return: C{model_name} immediately followed by the partition number

        """
        return "%s%d" % (model_name, partition_number)

    def parse(self, *args, **kwargs):
        """
        Performs a full parse and returns the results ranked by
        probability.

        All arguments are passed on to C{CkyParser.parse}.

        @return: list of the parses, highest C{probability} first

        """
        parses = super(PcfgParser, self).parse(*args, **kwargs)
        # Rank the parses
        # We can't use chart.ranked_parses because the parse might return parses
        # not from the chart (as in the case of backoff)
        ranked = sorted(parses, key=lambda s: s.probability)
        # Reverse an ascending sort (rather than sorting with reverse=True)
        # so that parses with equal probability keep the order they have
        # always been returned in.
        ranked.reverse()
        return ranked

214

Source Code for Module jazzparser.parsers.pcfg.parser