Package jazzparser :: Package parsers :: Package pcfg :: Module model
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.parsers.pcfg.model

  1  """Probabilistic models for the PCFG parser. 
  2   
  3  The PCFG parser needs to be able to access certain probabilities to  
  4  parse. The interface contained in this module  
  5  provides access to these probabilities, using a model previously  
  6  trained on training data. The model implementation itself is provided by  
  7  the formalism, since it needs to manipulate categories. 
  8   
  9  """ 
 10  """ 
 11  ============================== License ======================================== 
 12   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 13    
 14   This file is part of The Jazz Parser. 
 15    
 16   The Jazz Parser is free software: you can redistribute it and/or modify 
 17   it under the terms of the GNU General Public License as published by 
 18   the Free Software Foundation, either version 3 of the License, or 
 19   (at your option) any later version. 
 20    
 21   The Jazz Parser is distributed in the hope that it will be useful, 
 22   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 23   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 24   GNU General Public License for more details. 
 25    
 26   You should have received a copy of the GNU General Public License 
 27   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 28   
 29  ============================ End license ====================================== 
 30   
 31  """ 
 32  __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"  
 33   
 34  import os 
 35  import cPickle as pickle 
 36  from jazzparser.taggers import Tagger 
 37  from jazzparser import settings 
 38  from jazzparser.utils.options import ModuleOption 
 39  from jazzparser.utils.base import abstractmethod 
 40   
 41  FILE_EXTENSION = "mdl" 
class PcfgModel(object):
    """
    A trainable model used by a pcfg parser.

    Subclasses set L{MODEL_TYPE} and provide implementations of L{train},
    L{inside_probability} and L{outside_probability}. A model is stored on
    disk as a single pickled file called C{<model_name>.<FILE_EXTENSION>},
    in the directory C{settings.PCFG_MODEL_DATA_DIR/<MODEL_TYPE>}.

    """
    # Subclasses should set the model type, which distinguishes its models from others
    MODEL_TYPE = None
    # Options can get passed into a model when instantiated for training
    TRAINING_OPTIONS = []
    # Input types for which the model may be used lexically
    LEX_INPUT_TYPES = []

    def __init__(self, model_name, overwrite=False, options=None,
                 description=None, grammar=None):
        """
        Creates an empty, untrained model. To load a previously
        stored model, use L{load_model}.

        Optionally stores some custom descriptive text. This will be
        included in the descriptive text that gets stored along with
        the model.

        @param model_name: name of the model; determines its filename.
        @param overwrite: if true, any model file already stored under
            this name is removed.
        @param options: dictionary of raw training option values. They are
            not processed until L{options} is first read (or
            L{process_training_options} is called). Defaults to an empty
            dictionary.
        @param description: custom descriptive text, stored as
            C{model_description}.
        @param grammar: stored as C{grammar}; not otherwise used by this
            base class.

        """
        self.model_name = model_name
        # Remove the old file if we're asked to overwrite
        if overwrite and os.path.exists(self._filename):
            os.remove(self._filename)
        self._options = None
        # Use a fresh dict per instance: a mutable default argument would be
        # shared by every model created without explicit options
        self._options_dict = {} if options is None else options
        self._generate_description()
        self.model_description = description
        self.grammar = grammar

    @classmethod
    def __get_filename(cls, model_name):
        """ Full path of the file that stores the model called model_name. """
        return os.path.join(cls._get_model_dir(), "%s.%s" % (model_name, FILE_EXTENSION))

    def __get_my_filename(self):
        """ Full path of the file that stores this model. """
        return type(self).__get_filename(self.model_name)
    _filename = property(__get_my_filename)

    def process_training_options(self):
        """
        Verifies and processes the training option values. Access them in
        self.options.

        The raw values given at instantiation are passed, together with
        C{TRAINING_OPTIONS}, to C{ModuleOption.process_option_dict} and the
        result is kept for later reads of L{options}.

        """
        self._options = ModuleOption.process_option_dict(self._options_dict,
                                                         self.TRAINING_OPTIONS)

    def _get_options(self):
        """
        Instead of processing training options when instantiating (which makes
        it impossible to have required options, since we're not always training
        when instantiating), we process the training options the first time
        they're needed.

        If you want to do this ahead of time to verify the validity of the
        values, call L{process_training_options}.

        """
        if self._options is None:
            self.process_training_options()
        return self._options
    options = property(_get_options)

    @classmethod
    def _get_model_dir(cls):
        """ Directory in which models of this class's MODEL_TYPE are stored. """
        return os.path.join(settings.PCFG_MODEL_DATA_DIR, cls.MODEL_TYPE)

    @classmethod
    def list_models(cls):
        """
        Returns a list of the names of available models.

        These are the names of the files in the model directory that end in
        the model file extension, with that extension removed. If the
        directory doesn't exist, the list is empty.

        """
        model_dir = cls._get_model_dir()
        if not os.path.exists(model_dir):
            return []
        model_ext = ".%s" % FILE_EXTENSION
        return [filename[:-len(model_ext)]
                for filename in os.listdir(model_dir)
                if filename.endswith(model_ext)]

    def save(self):
        """
        Saves the model data to a file.

        The file contains a pickled dictionary with the model data (as
        returned by L{_get_model_data}), the generated description and the
        custom model description. The model directory is created if
        it doesn't exist yet.

        """
        # Pickle before touching the file, so that a pickling error can't
        # leave a truncated model file behind
        data = pickle.dumps({
            'data': self._get_model_data(),
            'desc': self._description,
            'model_desc': self.model_description,
        }, 2)
        filename = self._filename
        # Check the directory exists
        filedir = os.path.dirname(filename)
        if not os.path.exists(filedir):
            # makedirs, not mkdir: the parent data directory may be missing too
            os.makedirs(filedir)
        # Pickle protocol 2 is a binary format, so write in binary mode
        with open(filename, 'wb') as model_file:
            model_file.write(data)

    def get_extra_filenames(self):
        """
        Returns the paths of any files, other than the main model file,
        that the model creates. L{delete} removes these as well.

        The base class creates no extra files. Subclasses that do should
        override this.

        """
        return []

    def delete(self):
        """
        Removes all the model's data. It is assumed that the model
        will not be used at all after this has been called.

        """
        fn = self._filename
        if os.path.exists(fn):
            os.remove(fn)
        # Get rid of any extra files that the model creates
        for filename in self.get_extra_filenames():
            if os.path.exists(filename):
                os.remove(filename)

    @classmethod
    def load_model(cls, model_name):
        """
        Loads the previously stored model with the given name.

        @param model_name: name the model was saved under.
        @return: the model instance built by L{_load_model}, with its
            descriptive text restored from the file.
        @raise ModelLoadError: if there is no stored file for this name.

        """
        filename = cls.__get_filename(model_name)
        if not os.path.exists(filename):
            raise ModelLoadError("the model '%s' has not been trained" % model_name)
        # Load the model from a file
        # NOTE: unpickling can execute arbitrary code. Only load model files
        # that come from a trusted source.
        with open(filename, 'rb') as model_file:
            model_data = pickle.loads(model_file.read())
        obj = cls._load_model(model_name, model_data['data'])
        # Load the descriptive text (stored for every model type)
        obj._description = model_data['desc']
        obj.model_description = model_data['model_desc']
        return obj

    def _generate_description(self):
        """
        Don't override this.
        You can add your own information into the
        descriptive text (per subclass, for example) by calling
        __init__ with the description kwarg, or by setting the
        model_description attribute. You might, for example, want to
        do this at training time.

        Sets C{_description} to a text giving the model type, the model
        name and the current date and time.

        """
        from datetime import datetime

        desc = """\
Model type: %(type)s
Model name: %(name)s
Created: %(creation)s\
""" % \
            {
                'type': self.MODEL_TYPE,
                'name': self.model_name,
                'creation': datetime.now().strftime('%d %b %Y %H:%M'),
            }
        self._description = desc

    def __get_description(self):
        """
        The generated description, followed (after a blank line) by the
        custom model description if there is one.

        """
        if self.model_description is not None:
            model_desc = "\n\n%s" % self.model_description
        else:
            model_desc = ""
        return "%s%s" % (self._description, model_desc)
    description = property(__get_description)

    def generate(self, logger=None, max_depth=None):
        """
        Generate a surface form from the PCFG model. A pcfg model might
        not provide an implementation of this, in which case it will
        always return None.

        """
        return

    ############### Abstract methods #################
    @classmethod
    def _load_model(cls, name, data):
        """
        Subclasses should implement this method to load up the model
        data given in the argument data. They should return an
        instance of themselves. The data will be in the form of a
        dictionary, as returned by the class' _get_model_data().

        A default implementation that just uses simple pickling is
        provided: it expects the data to be the model instance itself
        and raises a ModelLoadError if it isn't an instance of this class.

        """
        obj = data
        if not isinstance(obj, cls):
            raise ModelLoadError("loaded file, but got wrong type of object "
                                 "out (%s)" % type(obj).__name__)
        return obj

    def _get_model_data(self):
        """
        Subclasses should implement this method to return the raw data
        of the model in a form that can be pickled and written
        out to a file.

        A default implementation to complement the implementation of
        _load_model is provided: it returns the model instance itself.

        """
        return self

    @staticmethod
    def train(name, training_data, options, grammar=None, logger=None):
        """
        Trains a new model using the data in the list of sequences.

        The base class provides no implementation and always raises
        NotImplementedError.

        """
        raise NotImplementedError("called train() on abstract PcfgModel")

    @abstractmethod
    def inside_probability(self, expansion, parent, left, right=None):
        """
        Probability of a (non-leaf) subtree, computed from the probability
        of its expansions and the inner probabilities already associated
        with its components. The result is the inside probability of the
        subtree.

        There are several different cases. It may be a unary expansion, in
        which case C{expansion='unary'} and C{right=None}. It may be a
        right-head expansion: C{expansion='right'} and both C{left} and
        C{right} daughters are given. Or it may be a left-head expansion:
        C{expansion='left'} and both daughters are given.

        """
        return

    @abstractmethod
    def outside_probability(self, parent):
        """
        Outside probability of a subtree. This is approximated in these models
        as the prior probability of the parent of the tree.

        """
        return
275
class ModelLoadError(Exception):
    """
    Raised when a stored model cannot be loaded: either no file exists for
    the requested model name, or the stored data is not of the expected type.

    """
278
class ModelSaveError(Exception):
    """
    Error type for failures while saving a model. Nothing in this module
    raises it; presumably it is raised by model implementations.

    """
281
class ModelError(Exception):
    """
    General error type for model problems. Nothing in this module raises
    it; presumably it is raised by model implementations.

    """
284
class ModelTrainingError(Exception):
    """
    Error type for failures while training a model. Nothing in this module
    raises it; presumably it is raised by implementations of train().

    """
287