jazzparser.taggers.candc.tagger

1 """Interface to external C&C supertagger tools. 2 3 Uses the C&C tagger out of the box. 4 The C&C tagger must have been installed in the candc directory for this 5 to work. It must also have been trained on some data before it can be 6 used. 7 8 """ 9 """ 10 ============================== License ======================================== 11 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 12 13 This file is part of The Jazz Parser. 14 15 The Jazz Parser is free software: you can redistribute it and/or modify 16 it under the terms of the GNU General Public License as published by 17 the Free Software Foundation, either version 3 of the License, or 18 (at your option) any later version. 19 20 The Jazz Parser is distributed in the hope that it will be useful, 21 but WITHOUT ANY WARRANTY; without even the implied warranty of 22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 GNU General Public License for more details. 24 25 You should have received a copy of the GNU General Public License 26 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 
27 28 ============================ End license ====================================== 29 30 """ 31 __author__ = "Mark Wilding <mark.wilding@cantab.net>" 32 33 import os, logging, shutil 34 from subprocess import Popen, PIPE 35 36 from jazzparser import settings 37 from jazzparser.utils.base import group_pairs 38 from jazzparser.utils.options import ModuleOption 39 from jazzparser.utils.chords import interval_observation_from_chord_string_pair 40 from jazzparser.utils.probabilities import batch_sizes 41 from jazzparser.utils.strings import str_to_bool 42 from jazzparser.utils.loggers import create_logger 43 from jazzparser.utils.output import remove_ansi_colors 44 from jazzparser.taggers import Tagger, process_chord_input 45 from jazzparser.taggers.models import ModelTagger, TaggerModel 46 from jazzparser.taggers.chordmap import get_chord_mapping, \ 47 get_chord_mapping_module_option 48 from jazzparser.data import Fraction 49 from .training import train_model_on_sequence_list 50 from .utils import read_tag_list 51 52 # Get the logger from the logging system 53 logger = logging.getLogger("main_logger")

class CandcTaggerModel(TaggerModel):
    """
    Stand-in tagger model for the external C&C supertagger.

    This is really a fake: it doesn't actually save models itself, since we
    hand over to the C&C tagger to do that. It provides the public methods of
    tagger models so that we can use all the usual tagger training and
    evaluation scripts without any special hacks.

    """
    MODEL_TYPE = 'candc'
    # Set up possible options for training
    TRAINING_OPTIONS = [
        get_chord_mapping_module_option(),
        # There should be some training options for C&C made available here,
        # but at the moment it's just a standard set
    ] + TaggerModel.TRAINING_OPTIONS

    @staticmethod
    def _split_model_name(model_name):
        """
        Split a (possibly multi-level) model name into path components.

        Both separators that appear in this module are accepted: the dots
        used by ``train()`` and by the tagger's ``model`` option, and the
        slashes produced by ``list_models()``. Empty components are dropped.

        """
        return [part for part in model_name.replace("/", ".").split(".")
                if part]

    def train(self, input_data, grammar=None, logger=None):
        """
        Train a C&C model on ``input_data.sequences`` and store, next to it,
        the extra options that are not part of the C&C model itself.

        ``grammar`` and ``logger`` are accepted for interface compatibility
        and are not used here.

        """
        # Get the sequence list from the bulk input data
        sequences = input_data.sequences
        chordmap = self.options['chord_mapping']
        train_model_on_sequence_list(self.model_name, sequences,
                                     chordmap=chordmap)
        # Put any extra opts in a dict to put in a file
        extra_opts = {
            'chordmap': chordmap.name,
        }
        # Store extra tagging options that aren't part of the C&C model.
        # One "key:value" pair per line: this is the format the tagger
        # reads back from the "jpopts" file.
        components = self._split_model_name(self.model_name) + ["jpopts"]
        opts_filename = os.path.join(settings.CANDC.MODELS_PATH, *components)
        with open(opts_filename, 'w') as opts_file:
            for (key, val) in extra_opts.items():
                opts_file.write("%s:%s\n" % (key, val))

    @staticmethod
    def load_model(model_name):
        """ Override to provide non-standard behaviour """
        return CandcTaggerModel(model_name)

    def save(self):
        """Do nothing: the model files are written during training."""
        return

    def _get_tags(self):
        # TODO
        raise NotImplementedError("C&C tagger model can't report tags yet")
    tags = property(_get_tags)

    @staticmethod
    def list_models():
        """
        Return the names of the models found in the C&C models directory.

        A top-level directory that contains subdirectories is reported as
        one "dir/subdir" name per subdirectory; otherwise the directory name
        itself is reported. Hidden (dot-prefixed) entries are ignored.

        """
        model_dir = settings.CANDC.MODELS_PATH
        if not os.path.exists(model_dir):
            return []

        def visible_subdirs(path):
            return [name for name in os.listdir(path)
                    if not name.startswith(".")
                    and os.path.isdir(os.path.join(path, name))]

        model_names = []
        # Allow sub-models to be in subdirectories
        for dirname in visible_subdirs(model_dir):
            subdirs = visible_subdirs(os.path.join(model_dir, dirname))
            if subdirs:
                model_names.extend(
                    "%s/%s" % (dirname, subdir) for subdir in subdirs)
            else:
                model_names.append(dirname)
        return model_names

    def delete(self):
        """
        Remove this model's directory tree from the C&C models directory.

        Raises ValueError if the model name has no path components, since
        that would otherwise remove the whole models directory.

        """
        components = self._split_model_name(self.model_name)
        if not components:
            raise ValueError("cannot delete a C&C model with an empty name")
        shutil.rmtree(os.path.join(settings.CANDC.MODELS_PATH, *components))

124

class CandcTagger(ModelTagger):
    """
    Superclass of both kinds of C&C tagger. Don't use this: use one
    of the subclasses below.

    The whole tagging run happens in the constructor: the external C&C
    command is run once over the input and the results are stored in
    ``self.tags``. Subclasses supply the ``command`` and ``extra_args``
    class attributes and a ``_tags_from_output()`` method, which must also
    set ``self.batch_sizes``.
    """
    MODEL_CLASS = CandcTaggerModel
    COMPATIBLE_FORMALISMS = [
        'music_roman',
        'music_keyspan',
        'music_halfspan',
    ]
    INPUT_TYPES = ['db', 'chords']
    # Probability ratio between one tag and the next that allows the
    # second to be returned in the same batch as the first
    TAG_BATCH_RATIO = 0.8
    DEFAULT_UNSEEN_TAG_PROB = 0.001

    TAGGER_OPTIONS = [
        ModuleOption('batch', filter=float,
            help_text="Probability ratio between one tag and the next "
                "that allows the second to be returned in the same batch.",
            usage="batch=X, where X is a floating point value between 0 and 1",
            default=TAG_BATCH_RATIO),
        ModuleOption('model',
            help_text="Name of the C&C trained model to use. Use the C&C "
                "training scripts to produce this.",
            usage="model=X, where X is the model name. Split up multi-level models with dots.",
            required=True),
        ModuleOption('unseen_tag_prob', filter=float,
            help_text="Probability mass reserved on each word so that some "
                "probability is assigned to tags never seen in the training "
                "set. This is a form of plus-n smoothing. "
                "Substracted from the total probability of tags for "
                "each word and distributed evenly across all tags.",
            usage="unseen_tag_prob=X, where X is a floating point value between 0 and 1",
            default=DEFAULT_UNSEEN_TAG_PROB),
        ModuleOption('last_batch', filter=str_to_bool,
            help_text="Use all possible tags, including the last, lowest "
                "probability batch, which typically acts as a bin for "
                "all remaining tags",
            usage="last_batch=X, where X is 'true' or 'false'",
            default=True),
    ] + ModelTagger.TAGGER_OPTIONS

    def __init__(self, grammar, input, options=None, dict_cutoff=5, *args, **kwargs):
        """
        Run the C&C supertagger over the input and store the resulting tags.

        grammar, input, options and any further arguments are handed to the
        superclass constructor. dict_cutoff is passed to the C&C command as
        its ``--dict_cutoff`` argument.

        Raises NotImplementedError if this class is instantiated directly,
        CandcConfigurationError if the C&C installation or the model cannot
        be found, and CandcTaggingError if the tagger run fails.

        """
        # Avoid sharing one dict between calls through a mutable default
        if options is None:
            options = {}
        super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)

        if type(self) is CandcTagger:
            raise NotImplementedError("Tried to instantiate CandcTagger "
                "directly. You should use one of its subclasses.")
        self.tag_batch_ratio = self.options['batch']
        model = self.options['model'].split('.')

        # Check that candc and the model are available for supertagging
        candc_cmd, candc_model = self._check_candc_paths(model)

        # Create a logger to dump the output to
        logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
        candc_logger = create_logger(filename=logfile)
        self.logger.info("Logging C&C output to %s" % logfile)
        # Note in the log what we're trying to tag
        candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))

        # Read in the list of tags to smooth over
        self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

        # Read in extra options
        self.extra_opts = self._read_extra_opts(candc_model)
        # Pull the chord mapping out of the options
        self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

        candc_command = [candc_cmd, "--model", candc_model,
                         "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
        candc_logger.info("C&C command: %s" % " ".join(candc_command))

        self.tokens = self.input
        # Build some observations from the tokens
        observations = [
            interval_observation_from_chord_string_pair(ch1, ch2, type_mapping=self.chordmap)
            for ch1, ch2 in group_pairs(self.tokens + [None])
        ]
        # Add a dummy POS tag to each input item
        self.observations = ["%s|C" % t for t in observations]
        candc_logger.info("Input: %s" % " ".join(self.observations))

        # Run the tagger on this input
        tagger_out, tagger_err = self._run_candc(candc_command, candc_logger)

        # Format the string for slightly easier reading in the logfile
        log_output = tagger_out.replace("\t", ", ")
        output_lines = [line for line in log_output.split("\n") if line.strip()]
        log_output = "\n".join(["%d-%d: %s" % (i, i+1, outline)
                                for (i, outline) in enumerate(output_lines)])
        candc_logger.info("Output: %s" % log_output)
        candc_logger.info("Stderr output: %s" % tagger_err)

        # Get the tags out of the tagger output.
        # The output format for the different taggers varies, so the
        # subclass does the parsing
        self.tags = self._tags_from_output(tagger_out)

        # Check for bogus tags
        # The tagger may return tags that can't actually be
        # instantiated with the word, since it doesn't know about
        # the lexicon: ignore them
        self.tags = [
            [(sign, tag, prob) for (sign, tag, prob) in word_tags
                if sign is not None]
            for word_tags in self.tags]

    def _check_candc_paths(self, model):
        """
        Check that the C&C installation, the tagger command and the model
        exist. ``model`` is the list of model name components.

        Returns the pair (path to the tagger command, path to the model).
        Raises CandcConfigurationError if anything is missing.

        """
        if not os.path.exists(settings.CANDC.BASE_PATH):
            raise CandcConfigurationError("The C&C parser base "
                "directory %s does not exist" % settings.CANDC.BASE_PATH)
        if not os.path.exists(settings.CANDC.MODELS_PATH):
            raise CandcConfigurationError("The C&C parser models "
                "directory %s does not exist" % settings.CANDC.MODELS_PATH)
        candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
        if not os.path.exists(candc_cmd):
            raise CandcConfigurationError("The C&C supertagger command "
                "%s does not exist. Have you built it?" % candc_cmd)
        # Check the model exists
        candc_model = os.path.join(settings.CANDC.MODELS_PATH, *model)
        if not os.path.exists(candc_model):
            raise CandcConfigurationError("The C&C model given (%s) "
                "doesn't exist." % candc_model)
        return candc_cmd, candc_model

    @staticmethod
    def _read_extra_opts(candc_model):
        """
        Read the "key:value" lines of the model's "jpopts" file into a dict.

        Returns an empty dict if the file does not exist. Lines without a
        colon (e.g. blank lines) are skipped.

        """
        opts_filename = os.path.join(candc_model, "jpopts")
        if not os.path.exists(opts_filename):
            return {}
        with open(opts_filename, 'r') as opts_file:
            return dict(
                line.strip("\n").split(":", 1)
                for line in opts_file
                if ":" in line)

    def _run_candc(self, candc_command, candc_logger):
        """
        Spawn the C&C command, feed it ``self.observations`` and return its
        (stdout, stderr) output with ANSI colour codes removed.

        Raises CandcTaggingError if communicating with the process fails or
        if it reports a negative return code.

        """
        # Spawn the process only now that the input is ready, so that a
        # failure while preparing the input can't leave a child waiting
        self.tagger = Popen(candc_command,
                            stdin=PIPE, stdout=PIPE, stderr=PIPE)
        try:
            tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
        except OSError as err:
            logger.error("Could not run the C&C supertagger (%s)" % err)
            candc_logger.error("Error: %s" % err)
            # Output the actual error that the command returned. The pipe
            # may already be closed: don't let that mask the real error
            try:
                error = self.tagger.stderr.read()
            except (ValueError, IOError, OSError):
                error = "(stderr unavailable)"
            logger.error("C&C returned the error: %s" % error)
            candc_logger.error("C&C error: %s" % error)
            raise CandcTaggingError("error running the C&C supertagger: %s" % error)
        # C&C uses ANSI color commands in the output
        # Remove them
        tagger_out = remove_ansi_colors(tagger_out)
        tagger_err = remove_ansi_colors(tagger_err)
        # The tagger process should now be terminated. Check it didn't fall
        # over. Popen reports termination by a signal as a negative code.
        # NOTE(review): positive exit codes are not treated as failures
        # here -- confirm that this is intended.
        return_code = self.tagger.returncode
        if return_code < 0:
            raise CandcTaggingError("The C&C tagger terminated with return code %s. "
                "Error output for the tagging: %s" % (return_code, tagger_err))
        return tagger_out, tagger_err

    def _get_input_length(self):
        """ Returns the number of words (chords) in the input. """
        return len(self.tokens)
    input_length = property(_get_input_length)

    def get_signs_for_word(self, index, offset=0):
        """
        Return one batch of (sign, tag, probability) tuples for the word at
        ``index``. ``offset`` selects the batch: 0 is the first.

        Returns an empty list once the batches are used up. If the
        'last_batch' option is false, the final batch is never returned.

        """
        word_batches = self.batch_sizes[index]

        if self.options['last_batch']:
            # This will return all batches
            end_of_tags = len(word_batches)
        else:
            # This will never return the final batch
            end_of_tags = len(word_batches) - 1

        if offset >= end_of_tags:
            # No more categories to return
            return []

        # Skip over everything that the earlier batches covered
        returned_so_far = sum(word_batches[:offset])
        range_end = returned_so_far + word_batches[offset]
        return self.tags[index][returned_so_far:range_end]

    def get_word(self, index):
        """Return the input token (chord) at ``index``."""
        return self.tokens[index]

310

class CandcBestTagger(CandcTagger):
    """
    Uses the C&C supertagger component to get the best tag for each
    word. Returns only one tag per word.
    """
    command = "super"
    extra_args = []

    def __init__(self, *args, **kwargs):
        super(CandcBestTagger, self).__init__(*args, **kwargs)

    def _tags_from_output(self, output):
        """
        Parse the output of the C&C command: whitespace-separated tokens of
        "|"-separated fields, of which the third is the category.

        Returns, for each word, a one-element list holding
        (sign, tag, 1.0) and sets ``self.batch_sizes`` to a single batch of
        size one per word. The sign comes from the grammar and may not be
        usable for every tag: the caller filters out None signs.

        Raises CandcTaggingError if a token is malformed or if the number
        of tags does not match the number of input words.

        """
        tag_sequence = []
        for out in output.split():
            fields = out.split("|")
            # We ignore the first two items and take the third (category)
            if len(fields) < 3:
                raise CandcTaggingError("C&C output token '%s' is not in "
                    "word|POS|category format: %s" % (out, output))
            tag_sequence.append(fields[2])

        # Same sanity check as the multi-tagger makes on its output
        if len(tag_sequence) != self.input_length:
            raise CandcTaggingError("C&C output did not give a correct "
                "set of tags: %s" % output)

        # Get a sign for this tag if possible
        results = [[(self.grammar.get_sign_for_word_by_tag(
                        self.tokens[i],
                        tag,
                        extra_features={
                            'duration': self.durations[i],
                            'time': self.times[i],
                        }),
                     tag,
                     1.0)]
                   for i, tag in enumerate(tag_sequence)]
        # A separate list per word, so that they are not aliases of each other
        self.batch_sizes = [[1] for __ in range(self.input_length)]
        return results

338

class CandcMultiTagger(CandcTagger):
    """
    Uses the C&C supertagger component to get multiple tags for each
    word.
    """
    command = "msuper"
    # Use a very low beta, so we get loads of tags, even improbable ones
    extra_args = ["--beta", "0.0"]

    TAGGER_OPTIONS = CandcTagger.TAGGER_OPTIONS + [
        ModuleOption('ignore-unknown', filter=str_to_bool,
            help_text="Ignore any tags that the tagger returns but which "
                "are not found in the grammar. By default, an error will "
                "be thrown.",
            usage="ignore-unknown=True (default False)",
            default=False),
    ]

    def __init__(self, *args, **kwargs):
        super(CandcMultiTagger, self).__init__(*args, **kwargs)

    def _tags_from_output(self, output):
        """
        Parse the output of the C&C command: one tab-separated line per
        word, where the third column is the number of results and
        (category, probability) column pairs follow it.

        Returns, for each word, a list of (sign, tag, probability) tuples
        in descending order of probability, and sets ``self.batch_sizes``
        to the batch sizes computed from those probabilities.

        Raises CandcTaggingError if the number of output lines does not
        match the number of input words.

        """
        tags = []
        # Split up the output text to extract tags and probabilities
        for line in output.split("\n"):
            line = line.strip()
            if not line:
                continue
            cols = line.split("\t")
            num_results = int(cols[2])
            # Get the tags and probs from the output
            results = [(cols[3 + result_num*2], float(cols[4 + result_num*2]))
                       for result_num in range(num_results)]
            seen_tags = set(cat for (cat, __) in results)

            # Check all the tags are covered and add them with 0 prob if not
            results.extend((tag, 0.0) for tag in self.tag_list
                           if tag not in seen_tags)

            # Not the same as sorted(..., reverse=True): this ordering of
            # equal-probability tags is kept as it always has been
            tags.append(list(reversed(sorted(results, key=lambda x: x[1]))))

        if len(tags) != self.input_length:
            raise CandcTaggingError("C&C output did not give a correct "
                "set of tags: %s" % output)

        # Redistribute the tag probability to account for unseen tags
        unseen_prob = self.options['unseen_tag_prob']
        if unseen_prob > 0.0:
            # Scale down everything that has a probability
            prob_scale = 1.0 - unseen_prob
            for i, word_tags in enumerate(tags):
                if not word_tags:
                    # Nothing to distribute the reserved mass over
                    continue
                # Add reserved mass equally to every tag
                prob_add = unseen_prob / len(word_tags)
                tags[i] = [(tag, (prob*prob_scale + prob_add))
                           for (tag, prob) in word_tags]

        skip_tags = set()
        # Work out what tags we're going to ignore altogether
        if self.options['ignore-unknown']:
            for tag_sequence in tags:
                for (tag, __) in tag_sequence:
                    if tag in skip_tags:
                        # Already reported: warn only once per tag
                        continue
                    if tag not in self.grammar.families:
                        # This tag's not in the grammar: just ignore it
                        skip_tags.add(tag)
                        logger.warning("Ignoring tag '%s', which is not in "
                            "the grammar." % tag)

        # No separate smoothing of zero-probability tags is done here: when
        # unseen_tag_prob > 0, the redistribution above already gives every
        # tag some probability mass.

        signs = []
        # Get an actual sign for each word/tag combination
        for index, word in enumerate(self.tokens):
            word_signs = []
            for (tag, prob) in tags[index]:
                if tag in skip_tags:
                    continue
                # Consult the grammar to get a suitable sign if we can
                sign = self.grammar.get_sign_for_word_by_tag(
                    word,
                    tag,
                    extra_features={
                        'time': self.times[index],
                        'duration': self.durations[index],
                    })
                if sign is None:
                    # The caller throws away entries without a sign. Drop
                    # them before the batches are computed, so that the
                    # batch sizes line up with the tags actually returned
                    continue
                word_signs.append((sign, tag, prob))
            signs.append(word_signs)

        # Work out the batches that these should be returned in
        self.batch_sizes = [
            batch_sizes([p for (__, __, p) in results], self.tag_batch_ratio)
            if results else []
            for results in signs]
        return signs

454

class CandcTaggingError(Exception):
    """Raised when running the C&C tagger or reading its output fails."""

class CandcConfigurationError(Exception):
    """Raised when the C&C installation or the requested model is missing."""

459

Source Code for Module jazzparser.taggers.candc.tagger