Package jazzparser :: Package parsers :: Package pcfg :: Module parser
[hide private]
[frames | no frames]

Source Code for Module jazzparser.parsers.pcfg.parser

  1  """Probabilistic (PCFG) extension to the CKY parser implementation. 
  2   
  3  This implements the CCG equivalent of a basic PCFG parser. 
  4  It is modelled on Julia Hockenmaier's original basic model of 2001. 
  5   
  6  @note: This used to have to be used with the PCFG tagger, which took care of  
  7  assigning lexical probabilities, using the same model as the parser. This has  
  8  now changed. You can use this with any tagger to limit the choice of available  
  9  signs and the probabilities used will be taken by the parser from the  
 10  parsing model. 
 11   
 12  """ 
 13  """ 
 14  ============================== License ======================================== 
 15   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 16    
 17   This file is part of The Jazz Parser. 
 18    
 19   The Jazz Parser is free software: you can redistribute it and/or modify 
 20   it under the terms of the GNU General Public License as published by 
 21   the Free Software Foundation, either version 3 of the License, or 
 22   (at your option) any later version. 
 23    
 24   The Jazz Parser is distributed in the hope that it will be useful, 
 25   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 27   GNU General Public License for more details. 
 28    
 29   You should have received a copy of the GNU General Public License 
 30   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 31   
 32  ============================ End license ====================================== 
 33   
 34  """ 
 35  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"  
 36   
 37  from jazzparser.grammar import Grammar 
 38  from jazzparser.utils.input import assign_durations 
 39  from jazzparser.utils.base import filter_latex 
 40  from jazzparser.utils.options import ModuleOption, zero_to_one_float 
 41  from jazzparser.utils.strings import str_to_bool 
 42  from jazzparser.utils.nltk.probability import logprob 
 43  from .chart import PcfgChart 
 44  from jazzparser.parsers.cky.parser import CkyParser 
 45  from jazzparser.parsers.cky.tools import ChartTool 
 46  from jazzparser.parsers import ParserInitializationError, ParseError 
 47  from jazzparser.data import Chord 
 48  from .chart import ProbabilisticSignHashSet 
 49  from .tools import ProbabilisticResultListTool, ProbabilityTool, \ 
 50      ProbabilisticChartTool, ProbabilisticDerivationTraceTool 
 51   
 52  import sys, re 
 53  import logging 
 54  from scipy.misc import comb 
 55   
 56  from jazzparser import settings 
 57   
 58  # Get the logger from the logging system 
 59  logger = logging.getLogger("main_logger") 
class PcfgParser(CkyParser):
    """
    CKY parser that scores signs using a trained PCFG model.

    Lexical signs added to the chart are given an inside probability
    (taken from the tagger or from the model, see L{__init__}) and a
    combined probability that also includes the model's outside
    probability. The results of L{parse} are ranked by that probability.

    """
    shell_tools = CkyParser.shell_tools + [
        ProbabilisticResultListTool(),
        ProbabilityTool(),
        ProbabilisticChartTool(),
        ProbabilisticDerivationTraceTool(),
    ]

    PARSER_OPTIONS = CkyParser.PARSER_OPTIONS + [
        ModuleOption('threshold', filter=zero_to_one_float,
            help_text="Ratio between the highest probability on an arc "
                "and the probability of a new sign below which the new "
                "sign will be ignored. (Lower throws away more and runs faster.)",
            usage="threshold=X, where X is a float between 0 and 1 (default %s)."
                % settings.PCFG_PARSER.DEFAULT_THRESHOLD,
            default=settings.PCFG_PARSER.DEFAULT_THRESHOLD
        ),
        ModuleOption('maxarc', filter=int,
            help_text="An absolute maximum on the number of signs on an "
                "arc. If an arc gets more signs than this even after the "
                "beam is applied, the lowest probability signs will just "
                "be dropped. Set to 0 to enforce no maximum at all.",
            usage="maxarc=X, where X is an integer (default %d)."
                % settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE,
            default=settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE
        ),
        ModuleOption('model', filter=str,
            help_text="Name of a trained PCFG model to use for parsing.",
            usage="model=X, where X is the name of the model.",
            required=True
        ),
        ModuleOption('partition', filter=int,
            help_text="If given, the numbered partition of the partitioned "
                "model will be used. (This generally involves appending the "
                "partition number to the model name.)",
            usage="partition=P, where P is an int",
            default=None
        ),
        ModuleOption('nolex', filter=str_to_bool,
            help_text="Ignore lexical probabilities in model and force it "
                "not to be a lexical model, even if it was trained with "
                "lexical probabilities. Some input types force nolex to be "
                "true. In these cases, this option will be overridden. \n"
                "If the tagger is able to supply lexical probabilities, "
                "these will be used instead of the model's probabilities, but "
                "only if nolex=False",
            usage="nolex=B, where B is 'true' or 'false'",
            default=False,
        ),
    ]

    def _create_chart(self, *args, **kwargs):
        """
        Build the L{PcfgChart} used for parsing.

        The 'threshold' and 'maxarc' parser options and the loaded model
        are passed to the chart as keyword arguments (overriding any
        values given by the caller under those names). The chart is
        stored in C{self.chart} and returned.

        """
        kwargs['threshold'] = self.options['threshold']
        kwargs['maxarc'] = self.options['maxarc']
        kwargs['model'] = self.model
        self.chart = PcfgChart(self.grammar, *args, **kwargs)
        return self.chart

    def _add_signs(self, offset):
        """
        Add lexical signs to the chart, attaching probabilities to each.

        Delegates to L{CkyParser._add_signs}, supplying a callback that
        sets C{inside_probability} and C{probability} (both log
        probabilities) on every sign.

        @raise ParseError: if the model is asked for a lexical probability
            of a word that is neither a string nor a list.

        """
        # Use our PCFG model to get lexical probabilities for all signs
        def prob_adder(start, end, signtup, words):
            sign, tag, tag_prob = signtup
            if self.use_tagger_probs:
                # Use the tagger to get lexical probabilities
                lex_prob = self.tagger.lexical_probability(start, end, tag)
            else:
                # We might get multiple words here: use the first
                # This is not really a satisfactory solution: better would be
                #  to get the tagger to tell us which word to use
                if isinstance(words, list):
                    word = words[0]
                elif not isinstance(words, basestring):
                    # Check the word is a string
                    # If not, we probably shouldn't be trying to get a probability
                    raise ParseError(
                        "PCFG model is trying to assign lexical "
                        "probabilities to words, but the words aren't strings. "
                        "Maybe you should have disabled lexical probs wtih "
                        "parser option 'nolex'")
                else:
                    word = words
                # Consult the model to get the lexical probability of this sign
                lex_prob = self.model.inside_probability('leaf', sign, word)
            # NOTE(review): the nesting of the next two statements was
            #  reconstructed from a flattened listing; they are assumed to
            #  apply to tagger and model probabilities alike - confirm.
            # Triangular number: nodes in the tree for multiword categories
            # This has the effect of penalizing multiword categories
            #  proportionally to the number of tree nodes deriving the
            #  categories they're competing with derived from single-word cats
            tree_size = comb(end-start+1, 2)
            lex_prob = lex_prob ** tree_size
            # Add the probabilities to the category
            sign.inside_probability = logprob(lex_prob)
            sign.probability = logprob(self.model.outside_probability(sign)) \
                                    + sign.inside_probability

        # Call the CkyParser's method to get the basic tuples
        vals = super(PcfgParser, self)._add_signs(offset, prob_adder=prob_adder)
        return vals

    def __init__(self, *args, **kwargs):
        """
        Initialize the parser and load the PCFG model.

        Arguments are passed straight through to L{CkyParser.__init__}.
        After the model is loaded, this decides whether it is treated as
        lexical (C{self.model.lexical}) and whether lexical probabilities
        come from the tagger rather than the model
        (C{self.use_tagger_probs}).

        @raise ParserInitializationError: if the grammar's formalism has
            no C{PcfgParser} attribute.

        """
        super(PcfgParser, self).__init__(*args, **kwargs)
        # Check that the formalism in use provides what we need to use this parser
        f = self.grammar.formalism
        if not hasattr(f, 'PcfgParser'):
            raise ParserInitializationError(
                "PcfgParser is not compatible "
                "with the formalism %s" % f.get_name())
        # Load the PCFG probabilistic model
        if self.options['partition'] is not None:
            model_name = type(self).partition_model_name(self.options['model'],
                                                    self.options['partition'])
        else:
            model_name = self.options['model']
        # Load by the resolved name so that a requested partition is
        #  actually used (and the log line below names the loaded model)
        self.model = self.grammar.formalism.PcfgModel.load_model(model_name)
        self.logger.info("Parsing model: %s" % model_name)

        self.use_tagger_probs = False
        self.model.lexical = True
        if not isinstance(self.tagger.wrapped_input, tuple(self.model.LEX_INPUT_TYPES)) \
                and not self.tagger.LEXICAL_PROBABILITY:
            # Model has to be non-lexical, since it's not an allowed lexical type
            self.model.lexical = False
            if not self.options['nolex']:
                # The user wasn't expecting this: warn them
                self.logger.warn("Could not use a lexical PCFG model with "
                    "input type '%s'" % type(self.tagger.wrapped_input).__name__)
        elif self.options['nolex']:
            # Force the model to be non-lexical
            self.model.lexical = False
        elif self.tagger.LEXICAL_PROBABILITY:
            # The tagger can supply us with probabilities instead of the model's
            self.use_tagger_probs = True

    @staticmethod
    def partition_model_name(model_name, partition_number):
        """
        The model name to use when the given partition number is requested.
        The default implementation simply appends the number to the model
        name. Subclasses may override this if they want to do something
        different.

        """
        return "%s%d" % (model_name, partition_number)

    def parse(self, *args, **kwargs):
        """
        Performs a full parse and returns the results ranked by
        probability (highest first).

        """
        parses = super(PcfgParser, self).parse(*args, **kwargs)
        # Rank the parses
        # We can't use chart.ranked_parses because the parse might return parses
        #  not from the chart (as in the case of backoff)
        return list(reversed(sorted(parses, key=lambda s:s.probability)))
214