1 """Probabilistic (PCFG) extension to the CKY parser implementation.
2
3 This implements the CCG equivalent of a basic PCFG parser.
4 It is modelled on Julia Hockenmaier's original basic model of 2001.
5
6 @note: This used to have to be used with the PCFG tagger, which took care of
7 assigning lexical probabilities, using the same model as the parser. This has
8 now changed. You can use this with any tagger to limit the choice of available
9 signs and the probabilities used will be taken by the parser from the
10 parsing model.
11
12 """
13 """
14 ============================== License ========================================
15 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
16
17 This file is part of The Jazz Parser.
18
19 The Jazz Parser is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 The Jazz Parser is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
31
32 ============================ End license ======================================
33
34 """
35 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
36
37 from jazzparser.grammar import Grammar
38 from jazzparser.utils.input import assign_durations
39 from jazzparser.utils.base import filter_latex
40 from jazzparser.utils.options import ModuleOption, zero_to_one_float
41 from jazzparser.utils.strings import str_to_bool
42 from jazzparser.utils.nltk.probability import logprob
43 from .chart import PcfgChart
44 from jazzparser.parsers.cky.parser import CkyParser
45 from jazzparser.parsers.cky.tools import ChartTool
46 from jazzparser.parsers import ParserInitializationError, ParseError
47 from jazzparser.data import Chord
48 from .chart import ProbabilisticSignHashSet
49 from .tools import ProbabilisticResultListTool, ProbabilityTool, \
50 ProbabilisticChartTool, ProbabilisticDerivationTraceTool
51
52 import sys, re
53 import logging
54 from scipy.misc import comb
55
56 from jazzparser import settings
57
58
59 logger = logging.getLogger("main_logger")
62 """
63 """
# Shell tools offered by this parser: every tool CkyParser provides,
# followed by the probability-aware tools imported from .tools.
shell_tools = CkyParser.shell_tools + [
    ProbabilisticResultListTool(),
    ProbabilityTool(),
    ProbabilisticChartTool(),
    ProbabilisticDerivationTraceTool(),
]
70
# Options accepted by this parser: everything CkyParser accepts, followed
# by the PCFG-specific options declared below.
PARSER_OPTIONS = CkyParser.PARSER_OPTIONS + [
    # Beam threshold, validated by zero_to_one_float
    ModuleOption(
        'threshold',
        filter=zero_to_one_float,
        help_text=(
            "Ratio between the highest probability on an arc "
            "and the probability of a new sign below which the new "
            "sign will be ignored. (Lower throws away more and runs faster.)"
        ),
        usage=(
            "threshold=X, where X is a float between 0 and 1 (default %s)."
            % settings.PCFG_PARSER.DEFAULT_THRESHOLD
        ),
        default=settings.PCFG_PARSER.DEFAULT_THRESHOLD
    ),
    # Hard cap on the number of signs kept on one arc (0 means no cap)
    ModuleOption(
        'maxarc',
        filter=int,
        help_text=(
            "An absolute maximum on the number of signs on an "
            "arc. If an arc gets more signs than this even after the "
            "beam is applied, the lowest probability signs will just "
            "be dropped. Set to 0 to enforce no maximum at all."
        ),
        usage=(
            "maxarc=X, where X is an integer (default %d)."
            % settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE
        ),
        default=settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE
    ),
    # Name of the trained model; the only required option in this list
    ModuleOption(
        'model',
        filter=str,
        help_text="Name of a trained PCFG model to use for parsing.",
        usage="model=X, where X is the name of the model.",
        required=True
    ),
    # Optional partition number of a partitioned model
    ModuleOption(
        'partition',
        filter=int,
        help_text=(
            "If given, the numbered partition of the partitioned "
            "model will be used. (This generally involves appending the "
            "partition number to the model name.)"
        ),
        usage="partition=P, where P is an int",
        default=None
    ),
    # Switch to disregard the model's lexical probabilities
    ModuleOption(
        'nolex',
        filter=str_to_bool,
        help_text=(
            "Ignore lexical probabilities in model and force it "
            "not to be a lexical model, even if it was trained with "
            "lexical probabilities. Some input types force nolex to be "
            "true. In these cases, this option will be overridden. "
            "If the tagger is able to supply lexical probabilities, "
            "these will be used instead of the model's probabilities, but "
            "only if nolex=False"
        ),
        usage="nolex=B, where B is 'true' or 'false'",
        default=False,
    ),
]
113
120
122
def prob_adder(start, end, signtup, words):
    """
    Set the inside and overall log probabilities on a lexical sign.

    This is a closure: C{self} is not a parameter but is taken from the
    enclosing scope. It is handed to the superclass's C{_add_signs} as
    its C{prob_adder} argument.

    @param start: start point of the span the sign covers.
    @param end: end point of the span the sign covers.
    @param signtup: C{(sign, tag, tag_prob)} triple. C{sign} is modified
        in place; C{tag_prob} is not used here.
    @param words: the input covered by the span: either a list, of
        which only the first item is used, or a single string. Only
        consulted when the tagger's probabilities are not being used.
    @raise ParseError: if the model has to supply the lexical
        probability but C{words} is neither a list nor a string.

    """
    sign, tag, _tag_prob = signtup

    if self.use_tagger_probs:
        # Let the tagger supply the lexical probability for this span
        lex_prob = self.tagger.lexical_probability(start, end, tag)
    else:
        # Get the lexical probability from the parsing model instead,
        # which needs a single word to condition on
        if isinstance(words, list):
            word = words[0]
        elif isinstance(words, basestring):
            word = words
        else:
            raise ParseError("PCFG model is trying to assign lexical "
                "probabilities to words, but the words aren't strings. "
                "Maybe you should have disabled lexical probs with "
                "parser option 'nolex'")
        lex_prob = self.model.inside_probability('leaf', sign, word)

    # Raise the probability to the power C(end-start+1, 2), so a sign
    # spanning a single unit (end-start == 1) is left unchanged.
    # NOTE(review): presumably this scales the probability for signs that
    # span several units -- confirm against the model definition.
    tree_size = comb(end-start+1, 2)
    lex_prob = lex_prob ** tree_size

    # Both values are stored as log probabilities, so the overall
    # probability is the sum of the outside and inside parts
    sign.inside_probability = logprob(lex_prob)
    sign.probability = logprob(self.model.outside_probability(sign)) \
                        + sign.inside_probability
155
156 vals = super(PcfgParser, self)._add_signs(offset, prob_adder=prob_adder)
157 return vals
158
191
192 @staticmethod
194 """
195 The model name to use when the given partition number is requested.
196 The default implementation simply appends the number to the model
197 name. Subclasses may override this if they want to do something
198 different.
199
200 """
201 return "%s%d" % (model_name, partition_number)
202
def parse(self, *args, **kwargs):
    """
    Run the superclass's full parse and rank what it returns.

    All positional and keyword arguments are forwarded unchanged to the
    superclass's C{parse}.

    @return: list of the resulting parses, ordered from the highest
        C{probability} to the lowest.

    """
    ranked = sorted(
        super(PcfgParser, self).parse(*args, **kwargs),
        key=lambda result: result.probability)
    # The sort is ascending: flip it so the most probable parse is first
    ranked.reverse()
    return ranked
214