1 """Base classes for in-house statistical models.
2
3 Supertagger components that use statistical models that are implemented
4 within the Jazz Parser should use the base classes provided here.
5 They then only need to implement a model class, with a training method,
6 and the usual tagger interface.
7
8 """
9 """
10 ============================== License ========================================
11 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
12
13 This file is part of The Jazz Parser.
14
15 The Jazz Parser is free software: you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation, either version 3 of the License, or
18 (at your option) any later version.
19
20 The Jazz Parser is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
24
25 You should have received a copy of the GNU General Public License
26 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
27
28 ============================ End license ======================================
29
30 """
31 __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"
32
33 import os
34 import cPickle as pickle
35 from jazzparser.taggers import Tagger
36 from jazzparser import settings
37 from jazzparser.utils.options import ModuleOption
38
39 FILE_EXTENSION = "mdl"
42 """
43 Base class for corpus-trained supertagging models. Provides
44 interface and common methods for model classes, which must each
45 provide their own training methods and the usual tagger methods
46 (get_signs, etc).
47
48 The main thing this provides is stuff for storing and retrieving
49 models.
50
51 """
52 COMPATIBLE_FORMALISMS = [ 'music_keyspan', 'music_halfspan' ]
53
54 TAGGER_OPTIONS = [
55 ModuleOption('model', filter=str,
56 help_text="Model name. This model must have been previously trained. Required",
57 usage="model=X, where X is the name of a trained model",
58 required=True),
59 ModuleOption('partition', filter=int,
60 help_text="If given, a partitioned version of the model will "\
61 "be used, taking the model name as the base name. The "\
62 "partitioned models must have been trained separately.",
63 usage="partition=P, where P is an int",
64 default=None),
65 ModuleOption('batch', filter=float,
66 help_text="Probability ratio between one tag and the next "\
67 "that allows the second to be returned in the same batch.",
68 usage="batch=X, where X is a floating point value between 0 and 1",
69 default=0.8),
70 ModuleOption('max_batch', filter=int,
71 help_text="Maximum number of tags to include in a single batch, "\
72 "regardless of whether they fall within the beam ratio (see "\
73 "batch). 0 (default) means no limit.",
74 usage="max_batch=X, where X is an int",
75 default=0),
76 ModuleOption('best', filter=lambda x: x.lower() != "false",
77 help_text="If true, only the highest probability sign will "\
78 "be used for each word.",
79 usage="best=X, where X is 'True' or 'False'",
80 default=False),
81 ]
82
83 MODEL_CLASS = None
84
103
104
105 @staticmethod
107 """
108 The model name to use when the given partition number is requested.
109 The default implementation simply appends the number to the model
110 name. Subclasses may override this if they want to do something
111 different.
112
113 """
114 return "%s%d" % (model_name, partition_number)
115
117 """
118 A trainable model used by a ModelTagger.
119
120 """
121
122 MODEL_TYPE = None
123
124 TRAINING_OPTIONS = []
125
def __init__(self, model_name, overwrite=False, options=None, description=None):
    """
    Creates an empty, untrained model. To load a previously
    stored model, use from_file().

    Optionally stores some custom descriptive text. This will be
    included in the descriptive text that gets stored along with
    the model.

    @param model_name: name of the model; stored as C{model_name}.
    @param overwrite: if true and a file already exists at this
        model's filename, that file is deleted.
    @param options: dictionary of raw training options. It is stored
        unprocessed; processing is deferred until the C{options}
        property is first read or L{process_training_options} is
        called. If omitted, a new empty dictionary is used for this
        instance.
    @param description: custom descriptive text; stored as
        C{model_description}. May be None.

    """
    # Set the name before reading _filename: _filename is a property and
    # presumably derives the path from the model name (getter not shown
    # here -- confirm).
    self.model_name = model_name
    # os.path.exists is the correct call (the os module has no exists()).
    if overwrite and os.path.exists(self._filename):
        # Get rid of the stale model file
        os.remove(self._filename)
    # Processed options are built lazily from the raw dictionary
    self._options = None
    if options is None:
        # Use a fresh dict per instance, never a shared default object
        options = {}
    self._options_dict = options
    self._generate_description()
    self.model_description = description
145
146 @classmethod
151 _filename = property(__get_my_filename)
152
161
163 """
164 Instead of processing training options when instantiating (which makes
165 it impossible to have required options, since we're not always training
166 when instantiating), we process the training options the first time
167 they're needed.
168
169 If you want to do this ahead of time to verify the validity of the
170 values, call L{process_training_options}.
171
172 """
173 if self._options is None:
174 self.process_training_options()
175 return self._options
176 options = property(_get_options)
177
179 """
180 Should return a list of all the files that are stored along
181 with the main model file (not including the main file).
182
183 By default this is an empty list, but some subclasses may want
184 to put some names in this list. These should just be filenames,
185 not full paths. The files are assumed to be in the model type's
186 directory.
187
188 """
189 return []
190
191 @classmethod
196
197 @classmethod
199 """
200 Returns a list of the names of available models.
201
202 """
203 model_dir = cls._get_model_dir()
204 if not os.path.exists(model_dir):
205 return []
206 model_ext = ".%s" % FILE_EXTENSION
207 names = [name.rpartition(model_ext) for name in os.listdir(model_dir)]
208 return [name for name,ext,right in names if ext == model_ext and len(right) == 0]
209
211 """
212 Saves the model data to a file.
213 """
214 data = {
215 'data' : self._get_model_data(),
216 'desc' : self._description,
217 'model_desc' : self.model_description,
218 }
219 data = pickle.dumps(data, 2)
220 filename = self._filename
221
222 filedir = os.path.dirname(filename)
223 if not os.path.exists(filedir):
224 os.mkdir(filedir)
225 f = open(filename, 'w')
226 f.write(data)
227 f.close()
228
230 """
231 Removes all the model's data. It is assumed that the tagger
232 will not be used at all after this has been called.
233
234 """
235 fn = self._filename
236 if os.path.exists(fn):
237 os.remove(fn)
238
239 for filename in self.get_extra_filenames():
240 if os.path.exists(filename):
241 os.remove(filename)
242
243 @classmethod
245 filename = cls.__get_filename(model_name)
246
247 if os.path.exists(filename):
248 f = open(filename, 'r')
249 model_data = f.read()
250 model_data = pickle.loads(model_data)
251 f.close()
252 else:
253 raise ModelLoadError, "the model '%s' has not been trained" % model_name
254 obj = cls._load_model(model_data['data'])
255
256 obj._description = model_data['desc']
257 obj.model_description = model_data['model_desc']
258 return obj
259
261 """
262 Don't override this.
263 You can add your own information into the
264 descriptive text (per subclass, for example) by calling
265 __init__ with the description kwarg, or by setting the
266 model_description attribute. You might, for example, want to
267 do this at training time.
268
269 """
270 from datetime import datetime
271
272 desc = """\
273 Model type: %(type)s
274 Model name: %(name)s
275 Created: %(creation)s\
276 """ % \
277 {
278 'type' : self.MODEL_TYPE,
279 'name' : self.model_name,
280 'creation' : datetime.now().strftime('%d %b %Y %H:%M'),
281 }
282 self._description = desc
283
285 if self.model_description is not None:
286 model_desc = "\n\n%s" % self.model_description
287 else:
288 model_desc = ""
289 return "%s%s" % (self._description,model_desc)
290 description = property(__get_description)
291
292
293 @classmethod
295 """
296 Subclasses should implement this method to load up the model
297 data given in the argument data. They should return an
298 instance of themselves. The data will be in the form of a
299 dictionary, as returned by the class' _get_model_data().
300
301 A default implementation that just uses simple pickling is
302 provided. It assumes that the class can be instantiated using
303 no arguments.
304
305 """
306 obj = data
307 if not isinstance(obj, cls):
308 raise ModelLoadError, "loaded file, but got wrong type of object out (%s)" % type(obj).__name__
309 return obj
310
312 """
313 Subclasses should implement this method to return the raw data
314 of the model in a form that can be pickled and written
315 out to a file.
316
317 A default implementation to complement the implementation of
318 _load_model is provided.
319
320 *** IMPORTANT: ***
321 Some implementations perform part of the model storage in their
322 _get_model_data method, so you shouldn't use this just to
323 get the data if you don't plan to store it. (Not sure why you'd
324 want the raw data anyway and this is a private method - just
325 warning you!)
326
327 """
328 return self
329
def train(self, sequence_index, grammar=None, logger=None):
    """
    Trains the loaded model using the data in the list of sequences.

    This base implementation is abstract: it ignores all of its
    arguments and always raises an error. Subclasses must override
    it to provide their own training procedure.

    @param sequence_index: the training data (not used here).
    @param grammar: optional; not used here. Presumably the grammar
        the model is trained for -- confirm against subclasses.
    @param logger: optional; not used here. Presumably a logger for
        reporting training progress -- confirm against subclasses.
    @raise NotImplementedError: always.

    """
    # Call form of raise: same behaviour on Python 2 and also valid
    # syntax on Python 3
    raise NotImplementedError("called train() on abstract TaggerModel")
335
338
341
343 """
344 For errors encountered while tagging using a model. Usually
345 indicates something's wrong with the model or the way it's being
346 used.
347
348 """
349 pass
350