1 """Probabilistic models for the PCFG parser.
2
The PCFG parser needs to be able to access certain probabilities to
parse. The interface contained in this module
provides access to these probabilities, using a model previously
6 trained on training data. The model implementation itself is provided by
7 the formalism, since it needs to manipulate categories.
8
9 """
10 """
11 ============================== License ========================================
12 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
13
14 This file is part of The Jazz Parser.
15
16 The Jazz Parser is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
20
21 The Jazz Parser is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
28
29 ============================ End license ======================================
30
31 """
32 __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"
33
34 import os
35 import cPickle as pickle
36 from jazzparser.taggers import Tagger
37 from jazzparser import settings
38 from jazzparser.utils.options import ModuleOption
39 from jazzparser.utils.base import abstractmethod
40
41 FILE_EXTENSION = "mdl"
44 """
45 A trainable model used by a pcfg parser.
46
47 """
48
# Label for the kind of model. It is written into the generated
# description text on the "Model type" line.
MODEL_TYPE = None

# NOTE(review): presumably the option definitions consulted when the
# training options are processed -- confirm against
# process_training_options().
TRAINING_OPTIONS = []

# NOTE(review): not used in the visible part of this file; presumably the
# input types accepted by the model's lexical component -- confirm.
LEX_INPUT_TYPES = []
54
def __init__(self, model_name, overwrite=False, options=None,
             description=None, grammar=None):
    """
    Creates an empty, untrained model. To load a previously
    stored model, use from_file().

    Optionally stores some custom descriptive text. This will be
    included in the descriptive text that gets stored along with
    the model.

    @type model_name: string
    @param model_name: name of the model; the model's filename is
        derived from it
    @type overwrite: bool
    @param overwrite: if true, any file already stored under this
        model's filename is removed
    @type options: dict
    @param options: raw training options. They are stored unprocessed
        and only processed the first time C{options} is read. Defaults
        to a fresh empty dictionary.
    @type description: string
    @param description: custom text appended to the generated description
    @param grammar: stored on the instance as C{grammar}

    """
    self.model_name = model_name
    # Remove the old file if there is one, but only on explicit request.
    # (os.path.exists: the os module itself has no exists() function.)
    if overwrite and os.path.exists(self._filename):
        os.remove(self._filename)
    # Left as None until the options are first needed
    self._options = None
    # A new dict per instance, so that no default is shared between models
    self._options_dict = {} if options is None else options
    self._generate_description()
    self.model_description = description
    self.grammar = grammar
76
77 @classmethod
82 _filename = property(__get_my_filename)
83
92
94 """
95 Instead of processing training options when instantiating (which makes
96 it impossible to have required options, since we're not always training
97 when instantiating), we process the training options the first time
98 they're needed.
99
100 If you want to do this ahead of time to verify the validity of the
101 values, call L{process_training_options}.
102
103 """
104 if self._options is None:
105 self.process_training_options()
106 return self._options
107 options = property(_get_options)
108
109 @classmethod
112
113 @classmethod
115 """ Returns a list of the names of available models. """
116 model_dir = cls._get_model_dir()
117 if not os.path.exists(model_dir):
118 return []
119 model_ext = ".%s" % FILE_EXTENSION
120 names = [name.rpartition(model_ext) for name in os.listdir(model_dir)]
121 return [name for name,ext,right in names if ext == model_ext and len(right) == 0]
122
124 """ Saves the model data to a file. """
125 data = {
126 'data' : self._get_model_data(),
127 'desc' : self._description,
128 'model_desc' : self.model_description,
129 }
130 data = pickle.dumps(data, 2)
131 filename = self._filename
132
133 filedir = os.path.dirname(filename)
134 if not os.path.exists(filedir):
135 os.mkdir(filedir)
136 f = open(filename, 'w')
137 f.write(data)
138 f.close()
139
141 """
Removes all the model's data. It is assumed that the model
will not be used at all after this has been called.
144
145 """
146 fn = self._filename
147 if os.path.exists(fn):
148 os.remove(fn)
149
150 for filename in self.get_extra_filenames():
151 if os.path.exists(filename):
152 os.remove(filename)
153
154 @classmethod
156 filename = cls.__get_filename(model_name)
157
158 if os.path.exists(filename):
159 f = open(filename, 'r')
160 model_data = f.read()
161 model_data = pickle.loads(model_data)
162 f.close()
163 else:
164 raise ModelLoadError, "the model '%s' has not been trained" % model_name
165 obj = cls._load_model(model_name, model_data['data'])
166
167 obj._description = model_data['desc']
168 obj.model_description = model_data['model_desc']
169 return obj
170
172 """
173 Don't override this.
174 You can add your own information into the
175 descriptive text (per subclass, for example) by calling
176 __init__ with the description kwarg, or by setting the
177 model_description attribute. You might, for example, want to
178 do this at training time.
179
180 """
181 from datetime import datetime
182
183 desc = """\
184 Model type: %(type)s
185 Model name: %(name)s
186 Created: %(creation)s\
187 """ % \
188 {
189 'type' : self.MODEL_TYPE,
190 'name' : self.model_name,
191 'creation' : datetime.now().strftime('%d %b %Y %H:%M'),
192 }
193 self._description = desc
194
196 if self.model_description is not None:
197 model_desc = "\n\n%s" % self.model_description
198 else:
199 model_desc = ""
200 return "%s%s" % (self._description,model_desc)
201 description = property(__get_description)
202
def generate(self, logger=None, max_depth=None):
    """
    Generate a surface form from the PCFG model.

    Providing generation is optional for a pcfg model. This
    implementation generates nothing and ignores both arguments, so
    the result is always C{None}.

    @param logger: not used by this implementation
    @param max_depth: not used by this implementation
    @return: C{None}

    """
    return None
211
212
213 @classmethod
215 """
216 Subclasses should implement this method to load up the model
217 data given in the argument data. They should return an
218 instance of themselves. The data will be in the form of a
219 dictionary, as returned by the class' _get_model_data().
220
221 A default implementation that just uses simple pickling is
222 provided.
223
224 """
225 obj = data
226 if not isinstance(obj, cls):
227 raise ModelLoadError, "loaded file, but got wrong type of object "\
228 "out (%s)" % type(obj).__name__
229 return obj
230
232 """
233 Subclasses should implement this method to return the raw data
234 of the model in a form that can be pickled and written
235 out to a file.
236
237 A default implementation to complement the implementation of
238 _load_model is provided.
239
240 """
241 return self
242
243 @staticmethod
def train(name, training_data, options, grammar=None, logger=None):
    """
    Train a new model on the data in the given list of sequences.

    This implementation does no training: it unconditionally raises,
    and none of the arguments are inspected.

    @raise NotImplementedError: always

    """
    # NOTE(review): the message names "TaggerModel" although this module
    # describes PCFG models -- confirm whether that is intended.
    message = "called train() on abstract TaggerModel"
    raise NotImplementedError(message)
249
250 @abstractmethod
252 """
253 Probability of a (non-leaf) subtree, computed from the probability
254 of its expansions and the inner probabilities already associated
255 with its components. The result is the inside probability of the
256 subtree.
257
258 There are several different cases. It may be a unary expansion, in
259 which case C{expansion='unary'} and C{right=None}. It may be a
260 right-head expansion: C{expansion='right'} and both C{left} and
261 C{right} daughters are given. Or it may be a left-head expansion:
262 C{expansion='left'} and both daughters are given.
263
264 """
265 return
266
267 @abstractmethod
269 """
270 Outside probability of a subtree. This is approximated in these models
271 as the prior probability of the parent of the tree.
272
273 """
274 return
275
278
281
284
287