Package jazzparser :: Package taggers :: Module models
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.taggers.models

  1  """Base classes for in-house statistical models. 
  2   
  3  Supertagger components that use statistical models that are implemented  
  4  within the Jazz Parser should use the base classes provided here. 
  5  They then only need to implement a model class, with a training method,  
  6  and the usual tagger interface. 
  7   
  8  """ 
  9  """ 
 10  ============================== License ======================================== 
 11   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 12    
 13   This file is part of The Jazz Parser. 
 14    
 15   The Jazz Parser is free software: you can redistribute it and/or modify 
 16   it under the terms of the GNU General Public License as published by 
 17   the Free Software Foundation, either version 3 of the License, or 
 18   (at your option) any later version. 
 19    
 20   The Jazz Parser is distributed in the hope that it will be useful, 
 21   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 23   GNU General Public License for more details. 
 24    
 25   You should have received a copy of the GNU General Public License 
 26   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 27   
 28  ============================ End license ====================================== 
 29   
 30  """ 
 31  __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"  
 32   
 33  import os 
 34  import cPickle as pickle 
 35  from jazzparser.taggers import Tagger 
 36  from jazzparser import settings 
 37  from jazzparser.utils.options import ModuleOption 
 38   
# Extension (without the leading dot) appended to a model's name to form
# the filename of its stored model file.
FILE_EXTENSION = "mdl"
class ModelTagger(Tagger):
    """
    Common base for supertaggers that are backed by a corpus-trained model.

    Concrete taggers subclass this, point MODEL_CLASS at their own
    TaggerModel subclass and supply the usual tagger methods (get_signs,
    etc). What this class contributes is the shared set of tagger options
    and the loading of the stored model named by those options.

    """
    COMPATIBLE_FORMALISMS = ['music_keyspan', 'music_halfspan']
    # A subclass that overrides TAGGER_OPTIONS should still include these
    TAGGER_OPTIONS = [
        ModuleOption(
            'model',
            filter=str,
            help_text="Model name. This model must have been previously trained. Required",
            usage="model=X, where X is the name of a trained model",
            required=True),
        ModuleOption(
            'partition',
            filter=int,
            help_text=("If given, a partitioned version of the model will "
                       "be used, taking the model name as the base name. The "
                       "partitioned models must have been trained separately."),
            usage="partition=P, where P is an int",
            default=None),
        ModuleOption(
            'batch',
            filter=float,
            help_text=("Probability ratio between one tag and the next "
                       "that allows the second to be returned in the same batch."),
            usage="batch=X, where X is a floating point value between 0 and 1",
            default=0.8),
        ModuleOption(
            'max_batch',
            filter=int,
            help_text=("Maximum number of tags to include in a single batch, "
                       "regardless of whether they fall within the beam ratio (see "
                       "batch). 0 (default) means no limit."),
            usage="max_batch=X, where X is an int",
            default=0),
        ModuleOption(
            'best',
            # Anything other than (case-insensitive) "false" counts as true
            filter=lambda x: x.lower() != "false",
            help_text=("If true, only the highest probability sign will "
                       "be used for each word."),
            usage="best=X, where X is 'True' or 'False'",
            default=False),
    ]
    # The TaggerModel subclass whose stored models this tagger loads.
    # Every concrete subclass has to set this.
    MODEL_CLASS = None

    def __init__(self, *args, **kwargs):
        """
        Initializes the underlying Tagger, then loads the trained model
        selected by the 'model' (and optionally 'partition') options.

        Raises NotImplementedError if the subclass has left MODEL_CLASS
        unset.

        """
        super(ModelTagger, self).__init__(*args, **kwargs)
        tagger_cls = type(self)
        # Refuse to go any further with an incompletely defined subclass
        if tagger_cls.MODEL_CLASS is None:
            raise NotImplementedError(
                "ModelTagger subclass %s does not define a model class"
                % tagger_cls.__name__)

        # Work out which stored model to use: the plain name, or the
        # partition-specific variant if a partition was requested
        partition = self.options['partition']
        if partition is None:
            self.model_name = self.options['model']
        else:
            self.model_name = tagger_cls.partition_model_name(
                self.options['model'], partition)
        self.logger.info("Tagging model: %s" % self.model_name)

        # The model class takes care of reading the trained data from disk
        self.model = tagger_cls.MODEL_CLASS.load_model(self.model_name)

        self.batch_ratio = self.options['batch']
        self.best_only = self.options['best']
        # After calling this, subclasses should perform tagging on the input

    @staticmethod
    def partition_model_name(model_name, partition_number):
        """
        Builds the name of the model to load for a given partition.

        This default just sticks the partition number on the end of the
        base model name. Subclasses with a different naming scheme can
        override it.

        """
        return "%s%d" % (model_name, partition_number)
115
class TaggerModel(object):
    """
    A trainable model used by a ModelTagger.

    This base class takes care of naming, saving, loading, listing and
    deleting stored models. Subclasses must set MODEL_TYPE and implement
    train(); they may also override _load_model() and _get_model_data()
    if plain pickling of the instance is not appropriate.

    """
    # Subclasses should set the model type, which distinguishes its models from others
    MODEL_TYPE = None
    # Options can get passed into a model when instantiated for training
    TRAINING_OPTIONS = []

    def __init__(self, model_name, overwrite=False, options=None, description=None):
        """
        Creates an empty, untrained model. To load a previously
        stored model, use load_model().

        @param model_name: name under which the model is (or will be) stored
        @param overwrite: if true, any model file already stored under this
            name is removed
        @param options: dictionary of raw training option values; they are
            not validated until first needed (see L{options}). Defaults to
            a fresh empty dictionary
        @param description: optional custom descriptive text. This will be
            included in the descriptive text that gets stored along with
            the model

        """
        self.model_name = model_name
        # Remove the old file if we're asked to overwrite
        if overwrite and os.path.exists(self._filename):
            os.remove(self._filename)
        self._options = None
        # A new dict per instance: a shared default would leak option
        # values from one model into the next
        self._options_dict = {} if options is None else options
        self._generate_description()
        self.model_description = description

    @classmethod
    def __get_filename(cls, model_name):
        """Full path of the main model file for the named model."""
        return os.path.join(cls._get_model_dir(), "%s.%s" % (model_name, FILE_EXTENSION))

    def __get_my_filename(self):
        """Full path of this model's main model file."""
        return type(self).__get_filename(self.model_name)
    _filename = property(__get_my_filename)

    def process_training_options(self):
        """
        Verifies and processes the training option values. Access them in
        self.options.

        """
        self._options = ModuleOption.process_option_dict(self._options_dict,
                                                         self.TRAINING_OPTIONS)

    def _get_options(self):
        """
        Instead of processing training options when instantiating (which makes
        it impossible to have required options, since we're not always training
        when instantiating), we process the training options the first time
        they're needed.

        If you want to do this ahead of time to verify the validity of the
        values, call L{process_training_options}.

        """
        if self._options is None:
            self.process_training_options()
        return self._options
    options = property(_get_options)

    def get_extra_filenames(self):
        """
        Should return a list of all the files that are stored along
        with the main model file (not including the main file).

        By default this is an empty list, but some subclasses may want
        to put some names in this list. These should just be filenames,
        not full paths. The files are assumed to be in the model type's
        directory.

        """
        return []

    @classmethod
    def _get_model_dir(cls):
        """
        Returns the directory holding all stored models of this model type.

        Raises NotImplementedError if the class has not set MODEL_TYPE.

        """
        if cls.MODEL_TYPE is None:
            raise NotImplementedError(
                "cannot load model: %s has not set a model type name" % cls.__name__)
        return os.path.join(settings.MODEL_DATA_DIR, cls.MODEL_TYPE)

    @classmethod
    def list_models(cls):
        """
        Returns a list of the names of available models.

        """
        model_dir = cls._get_model_dir()
        if not os.path.exists(model_dir):
            return []
        model_ext = ".%s" % FILE_EXTENSION
        # Keep only the files ending in the model extension and strip it off
        return [name[:-len(model_ext)]
                for name in os.listdir(model_dir)
                if name.endswith(model_ext)]

    def save(self):
        """
        Saves the model data to a file, creating the model type's
        directory first if it doesn't exist yet.

        """
        data = pickle.dumps({
            'data': self._get_model_data(),
            'desc': self._description,
            'model_desc': self.model_description,
        }, 2)
        filename = self._filename
        # Check the directory exists (including any missing parents)
        filedir = os.path.dirname(filename)
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        # Pickle protocol 2 is a binary format: the file must be opened in
        # binary mode or the data may get corrupted
        with open(filename, 'wb') as f:
            f.write(data)

    def delete(self):
        """
        Removes all the model's data. It is assumed that the tagger
        will not be used at all after this has been called.

        """
        fn = self._filename
        if os.path.exists(fn):
            os.remove(fn)
        # Get rid of any extra files that the model creates
        extra_filenames = self.get_extra_filenames()
        if extra_filenames:
            # Extra files are given as bare filenames living in the model
            # type's directory, so resolve them against it
            model_dir = self._get_model_dir()
            for filename in extra_filenames:
                path = os.path.join(model_dir, filename)
                if os.path.exists(path):
                    os.remove(path)

    @classmethod
    def load_model(cls, model_name):
        """
        Loads the previously stored model with the given name and returns
        it as an instance of this class.

        Raises ModelLoadError if no model has been stored under that name.

        """
        filename = cls.__get_filename(model_name)
        if not os.path.exists(filename):
            raise ModelLoadError("the model '%s' has not been trained" % model_name)
        # Load the model from a file
        # NOTE(review): unpickling executes arbitrary code from the file,
        # so model files must only ever come from a trusted source
        with open(filename, 'rb') as f:
            model_data = pickle.loads(f.read())
        obj = cls._load_model(model_data['data'])
        # Load the descriptive text (stored for every model type)
        obj._description = model_data['desc']
        obj.model_description = model_data['model_desc']
        return obj

    def _generate_description(self):
        """
        Don't override this.
        You can add your own information into the
        descriptive text (per subclass, for example) by calling
        __init__ with the description kwarg, or by setting the
        model_description attribute. You might, for example, want to
        do this at training time.

        """
        from datetime import datetime

        self._description = (
            "Model type: %(type)s\n"
            "Model name: %(name)s\n"
            "Created: %(creation)s"
        ) % {
            'type': self.MODEL_TYPE,
            'name': self.model_name,
            'creation': datetime.now().strftime('%d %b %Y %H:%M'),
        }

    def __get_description(self):
        """
        The generated description, followed by the custom model
        description (if one has been set) after a blank line.

        """
        if self.model_description is not None:
            model_desc = "\n\n%s" % self.model_description
        else:
            model_desc = ""
        return "%s%s" % (self._description, model_desc)
    description = property(__get_description)

    ############### Abstract methods #################
    @classmethod
    def _load_model(cls, data):
        """
        Subclasses should implement this method to load up the model
        data given in the argument data. They should return an
        instance of themselves. The data will be in the form returned
        by the class' _get_model_data().

        A default implementation that just uses simple pickling is
        provided: the unpickled data is expected to be the model
        instance itself. Raises ModelLoadError if it is not an instance
        of this class.

        """
        obj = data
        if not isinstance(obj, cls):
            raise ModelLoadError(
                "loaded file, but got wrong type of object out (%s)" % type(obj).__name__)
        return obj

    def _get_model_data(self):
        """
        Subclasses should implement this method to return the raw data
        of the model in a form that can be pickled and written
        out to a file.

        A default implementation to complement the implementation of
        _load_model is provided.

        *** IMPORTANT: ***
        Some implementations perform part of the model storage in their
        _get_model_data method, so you shouldn't use this just to
        get the data if you don't plan to store it. (Not sure why you'd
        want the raw data anyway and this is a private method - just
        warning you!)

        """
        return self

    def train(self, sequence_index, grammar=None, logger=None):
        """
        Trains the loaded model using the data in the list of sequences.

        Abstract: this base implementation always raises
        NotImplementedError.

        """
        raise NotImplementedError("called train() on abstract TaggerModel")
335
class ModelLoadError(Exception):
    """
    Raised when a stored model cannot be loaded, e.g. because no model
    has been trained under the requested name or the stored data is not
    of the expected type.

    """
338
class ModelSaveError(Exception):
    """
    For errors encountered while saving a model.

    """
341
class TaggingModelError(Exception):
    """
    Signals a problem that came up while tagging with a model. This
    usually means that the model itself is faulty or that it is being
    used in the wrong way.

    """
350