jazzparser.parsers.pcfg.model

"""
A trainable model used by a pcfg parser.

"""
# Subclasses should set the model type, which distinguishes its models from others.
# It is used as the name of the model directory (see _get_model_dir) and is
# included in the generated description text.
MODEL_TYPE = None
# Options can get passed into a model when instantiated for training.
# This list is handed to ModuleOption.process_option_dict by
# process_training_options.
TRAINING_OPTIONS = []
# Input types for which the model may be used lexically
LEX_INPUT_TYPES = []

def __init__(self, model_name, overwrite=False, options=None,
             description=None, grammar=None):
    """
    Creates an empty, untrained model. To load a previously
    stored model, use L{load_model}.

    Optionally stores some custom descriptive text. This will be
    included in the descriptive text that gets stored along with
    the model.

    @param model_name: name of the model; used to build its filename
    @param overwrite: if true, any existing file for this model name is
        removed straight away
    @param options: dictionary of raw training options. These are not
        processed here, but the first time C{self.options} is read (or
        when L{process_training_options} is called). Defaults to an
        empty dictionary
    @param description: custom descriptive text (may be None)
    @param grammar: stored on the instance as C{self.grammar}

    """
    self.model_name = model_name
    # Remove the old file if we're asked to overwrite
    if overwrite and os.path.exists(self._filename):
        os.remove(self._filename)
    # Processed options are computed lazily from the raw dictionary
    self._options = None
    # Use a fresh dict per instance rather than a shared mutable default
    self._options_dict = {} if options is None else options
    self._generate_description()
    self.model_description = description
    self.grammar = grammar

@classmethod
def __get_filename(cls, model_name):
    """
    Builds the path of the file used to store the model with the given
    name: the class' model directory joined with
    "<model_name>.<FILE_EXTENSION>".

    """
    model_dir = cls._get_model_dir()
    basename = "%s.%s" % (model_name, FILE_EXTENSION)
    return os.path.join(model_dir, basename)

def __get_my_filename(self):
    """
    Returns the path of the file for this instance's model, as built by
    the class-level filename helper from C{self.model_name}.

    """
    model_cls = type(self)
    return model_cls.__get_filename(self.model_name)

# Read-only property giving the path of this model's file
_filename = property(__get_my_filename)

def process_training_options(self):
    """
    Verifies and processes the values of the training options that were
    given when the model was instantiated. The result is stored on the
    instance and can be read from C{self.options}.

    """
    raw_options = self._options_dict
    option_specs = self.TRAINING_OPTIONS
    self._options = ModuleOption.process_option_dict(raw_options, option_specs)

92

def _get_options(self):
    """
    Returns the processed training options, processing them on first
    access.

    The options are not processed at instantiation time, because a model
    is not always being trained when it is instantiated, and processing
    then would make required options impossible. Instead they are
    processed the first time they are needed.

    To verify the validity of the values ahead of time, call
    L{process_training_options}.

    """
    if self._options is not None:
        # Already processed on an earlier access
        return self._options
    self.process_training_options()
    return self._options

# Read-only property: the processed training options (see _get_options)
options = property(_get_options)

@classmethod
def _get_model_dir(cls):
    """
    Returns the directory in which models of this class are stored: the
    subdirectory of C{settings.PCFG_MODEL_DATA_DIR} named after the
    class' C{MODEL_TYPE}.

    """
    base_dir = settings.PCFG_MODEL_DATA_DIR
    return os.path.join(base_dir, cls.MODEL_TYPE)

@classmethod
def list_models(cls):
    """
    Returns a list of the names of available models: every entry in the
    class' model directory that ends with the model file extension, with
    that extension stripped. Returns an empty list if the directory does
    not exist.

    """
    directory = cls._get_model_dir()
    if not os.path.exists(directory):
        return []
    suffix = ".%s" % FILE_EXTENSION
    found = []
    for entry in os.listdir(directory):
        # Only files ending in the extension are models; the name is what precedes it
        if entry.endswith(suffix):
            found.append(entry[:-len(suffix)])
    return found

122

def save(self):
    """
    Saves the model data to a file.

    The model's file (C{self._filename}) receives a pickled dictionary
    with the keys C{'data'} (result of C{_get_model_data()}), C{'desc'}
    (the generated description) and C{'model_desc'} (the custom
    description). The containing directory is created if it does not
    exist yet.

    """
    payload = pickle.dumps({
        'data' : self._get_model_data(),
        'desc' : self._description,
        'model_desc' : self.model_description,
    }, 2)
    filename = self._filename
    # Check the directory exists, creating any missing parents as well
    filedir = os.path.dirname(filename)
    if filedir and not os.path.exists(filedir):
        os.makedirs(filedir)
    # Pickle protocol 2 is a binary format, so the file must be opened in
    # binary mode. The with block closes it even if the write fails.
    with open(filename, 'wb') as f:
        f.write(payload)

139

def delete(self):
    """
    Removes all the model's data: the model's own file and any extra
    files reported by C{get_extra_filenames()}. Files that do not exist
    are skipped. It is assumed that the model will not be used at all
    after this has been called.

    """
    def _discard(path):
        # Existence is checked right before each removal
        if os.path.exists(path):
            os.remove(path)

    _discard(self._filename)
    # Get rid of any extra files that the model creates
    for extra in self.get_extra_filenames():
        _discard(extra)

@classmethod
def load_model(cls, model_name):
    """
    Loads a previously saved model by name.

    Reads and unpickles the model's file, builds the model object with
    C{_load_model()} from the stored C{'data'} entry and restores the
    descriptive text (stored for every model type) onto it.

    @param model_name: name of the model to load
    @return: the loaded model object
    @raise ModelLoadError: if no file exists for the given model name

    """
    filename = cls.__get_filename(model_name)
    if not os.path.exists(filename):
        raise ModelLoadError("the model '%s' has not been trained" % model_name)
    # Load the model from a file. The data is a binary pickle, so read it
    # in binary mode. The with block closes the file even if unpickling fails.
    with open(filename, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code, so only load
        # model files from trusted sources
        model_data = pickle.loads(f.read())
    obj = cls._load_model(model_name, model_data['data'])
    # Load the descriptive text (stored for every model type)
    obj._description = model_data['desc']
    obj.model_description = model_data['model_desc']
    return obj

170

def _generate_description(self):
    """
    Builds the standard descriptive text for the model (model type,
    model name and creation time) and stores it in C{self._description}.

    Don't override this.
    You can add your own information into the
    descriptive text (per subclass, for example) by calling
    __init__ with the description kwarg, or by setting the
    model_description attribute. You might, for example, want to
    do this at training time.

    """
    from datetime import datetime

    # The creation time is the moment this method is called
    desc = """\
Model type: %(type)s
Model name: %(name)s
Created: %(creation)s\
""" % \
        {
            'type' : self.MODEL_TYPE,
            'name' : self.model_name,
            'creation' : datetime.now().strftime('%d %b %Y %H:%M'),
        }
    self._description = desc

194

def __get_description(self):
    """
    Returns the full descriptive text of the model: the generated
    description, followed by the custom model description (separated by
    a blank line) if one has been set.

    """
    custom = self.model_description
    if custom is None:
        suffix = ""
    else:
        suffix = "\n\n%s" % custom
    return "%s%s" % (self._description, suffix)

# Read-only property giving the full descriptive text
description = property(__get_description)

def generate(self, logger=None, max_depth=None):
    """
    Generates a surface form from the PCFG model.

    Not every pcfg model provides an implementation of this. This
    default implementation ignores its arguments and always returns
    None.

    """
    return None

############### Abstract methods #################
@classmethod
def _load_model(cls, name, data):
    """
    Subclasses should implement this method to load up the model
    data given in the argument data. They should return an
    instance of themselves. The data will be in the form returned by
    the class' _get_model_data().

    A default implementation that just uses simple pickling is
    provided: the data is expected to be the model instance itself and
    is returned unchanged.

    @param name: name of the model being loaded (unused by the default
        implementation)
    @param data: the stored model data
    @raise ModelLoadError: if the data is not an instance of this class

    """
    if isinstance(data, cls):
        return data
    raise ModelLoadError("loaded file, but got wrong type of object "
                         "out (%s)" % type(data).__name__)

230

def _get_model_data(self):
    """
    Subclasses should implement this method to return the raw model
    data in a form that can be pickled and written out to a file.

    This default implementation complements the default _load_model:
    the data is simply the model instance itself.

    """
    model_data = self
    return model_data

@staticmethod
def train(name, training_data, options, grammar=None, logger=None):
    """
    Trains a new model using the data in the list of sequences.

    This base implementation is abstract and always raises.

    NOTE(review): the error message names TaggerModel, presumably copied
    from the tagger code. It is left unchanged in case callers depend on
    it; confirm before rewording.

    @raise NotImplementedError: always

    """
    raise NotImplementedError("called train() on abstract TaggerModel")

@abstractmethod
def inside_probability(self, expansion, parent, left, right=None):
    """
    Inside probability of a (non-leaf) subtree, computed from the
    probability of its expansion and the inner probabilities already
    associated with its components.

    The cases are distinguished by C{expansion}:
     - C{'unary'}: a unary expansion; C{right} is C{None}.
     - C{'right'}: a right-head expansion; both the C{left} and C{right}
       daughters are given.
     - C{'left'}: a left-head expansion; both daughters are given.

    This abstract version does nothing and returns None.

    """
    return None

@abstractmethod
def outside_probability(self, parent):
    """
    Outside probability of a subtree. These models approximate it as
    the prior probability of the parent of the tree.

    This abstract version does nothing and returns None.

    """
    return None

Source Code for Module jazzparser.parsers.pcfg.model