jazzparser.taggers.candc.training

"""Training interface to the C&C supertagger.

This automates the process of training the C&C supertagger on data
from the database. The data should first be generated using the
script in the annotator bin.

Training data should be in the Jazz Parser format, which differs
slightly from the C&C format. Instead of <obs>|<pos>|<tag>, each chord
should be represented as <chord>|<obs>|<pos>|<tag>. Use
generate_model_data to generate this from the database.

"""
import os, shutil
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE

from jazzparser import settings
from jazzparser.utils.data import holdout_partition
from jazzparser.utils.output import remove_ansi_colors
from .utils import training_data_to_candc, sequence_index_to_training_file, \
                    sequence_list_to_training_file, generate_tag_list
from jazzparser.data.db_mirrors import SequenceIndex

def train_model(model, data_filename, holdout_partitions=0, train_params=None,
                chordmap=None):
    """
    Train a C&C model by calling the C&C supertagger training routine.

    model should be a model name to train/retrain. The name is split on
    "." and the parts are used as path components below
    settings.CANDC.MODELS_PATH.
    data_filename should be the path to a training data file in the
    hybrid C&C format.
    holdout_partitions, if non-zero, is passed to holdout_partition to
    split the data; one model is then trained per resulting data set,
    each stored in a "partition-<i>" subdirectory of the model. If zero,
    a single model is trained on all of the data.
    train_params is an optional dict of (string) parameter values to feed
    to the C&C trainer. Only certain parameter values will be allowed.
    These will override the default parameters in
    settings.CANDC.DEFAULT_TRAINING_PARAMS. Set a parameter to None or an
    empty string to use C&C's default.
    chordmap is accepted but not used by this function.

    Raises CandcTrainingError if the train_super command exits with a
    non-zero status.

    """
    command = os.path.join(settings.CANDC.BASE_PATH, "bin", "train_super")

    # Process parameters that we'll use for training
    params = settings.CANDC.DEFAULT_TRAINING_PARAMS.copy()
    if train_params:
        params.update(train_params)
    extra_args = []
    # Prepare the args for the training command with these parameters.
    # Empty values are left out altogether so that C&C falls back to its
    # own default for that option.
    for key, val in params.items():
        if val is not None and val != "":
            extra_args.append('--%s' % key)
            extra_args.append(str(val))

    def _train(dest_model, filename):
        """ Train a model using the train_super command. """
        model_path = os.path.join(settings.CANDC.MODELS_PATH, *dest_model)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # Store a list of possible tags so we can smooth over unseen ones
        generate_tag_list(os.path.join(model_path, "tags"))
        # Run the training
        command_args = [command, "--model", model_path, "--comment",
            "\"Model trained automatically on chord data in file %s\"" % data_filename,
            "--input", filename,
            # Tell C&C to put the tagdict in the model dir
            # Doc says this is the default, but it isn't...
            "--super-tagdict", "//tagdict"] + extra_args
        print("Running: %s" % " ".join(command_args))
        trainer = Popen(command_args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # Use communicate() rather than wait(): with both output streams
        # piped, wait() can block forever once the child fills a pipe
        # buffer that nobody is reading.
        _, error_output = trainer.communicate()
        if trainer.returncode != 0:
            raise CandcTrainingError("There was an error training a "
                "supertagger model from the file %s: \n%s" %
                (data_filename, error_output))
        print("Trained model %s:\n%s" %
              (model_path, remove_ansi_colors(error_output).strip("\n")))

    def _train_on_lines(dest_model, data_lines):
        """ Write the lines to a temporary file and train a model on it. """
        temp_file = NamedTemporaryFile()
        try:
            temp_file.write("\n".join(data_lines))
            # Make sure the data is on disk before the trainer reads it
            temp_file.flush()
            _train(dest_model, temp_file.name)
        finally:
            # Always close the temporary file, even if training failed
            temp_file.close()

    # Read the data in from the given filename
    with open(data_filename, 'r') as in_file:
        lines = in_file.readlines()
    # Convert into the C&C training format
    lines = training_data_to_candc(lines)

    model = model.split(".")

    if holdout_partitions:
        # Split up the data into n partitions and train on every
        # n-1 subset of them.

        # Build the lists with each partition held out
        partitions = holdout_partition(lines, holdout_partitions)
        # Train on each partitioned set
        for i, partition in enumerate(partitions):
            print("Training partition %d" % i)
            # Train the model on this part of the data
            _train_on_lines(model + ["partition-%s" % i], partition)
    else:
        # Just train on the whole data
        _train_on_lines(model, lines)

def train_model_on_sequence_data(model, data_filename, *args, **kwargs):
    """
    Train a C&C model from a db_mirrors sequence data file.

    Works like train_model, except that data_filename names a sequence
    data file (loaded with SequenceIndex.from_file) instead of a C&C
    training data file. Any further positional and keyword arguments are
    handed on to train_model unchanged.

    """
    # Load the sequences from the data file
    sequence_index = SequenceIndex.from_file(data_filename)
    # Write them out as C&C training data in a temporary file. The file
    # object stays referenced by this local for the duration of training.
    training_file = sequence_index_to_training_file(sequence_index)
    train_model(model, training_file.name, *args, **kwargs)

115

def train_model_on_sequence_index(model, sequenceindex, *args, **kwargs):
    """
    Train a C&C model from an already loaded sequence index.

    Works like L{train_model_on_sequence_data}, except that the sequences
    are given directly as a sequence index rather than being read from a
    file. Any further positional and keyword arguments are handed on to
    train_model unchanged.

    """
    # Write the sequences out as C&C training data in a temporary file.
    # The file object stays referenced by this local for the duration of
    # training.
    training_file = sequence_index_to_training_file(sequenceindex)
    train_model(model, training_file.name, *args, **kwargs)

125

def train_model_on_sequence_list(model, sequences, *args, **kwargs):
    """
    Train a C&C model from a list of sequences.

    Works like L{train_model_on_sequence_data}, except that the sequences
    are given directly as a list rather than being read from a file. Any
    further positional and keyword arguments are handed on to train_model
    unchanged.

    """
    # Write the sequences out as C&C training data in a temporary file.
    # The file object stays referenced by this local for the duration of
    # training.
    training_file = sequence_list_to_training_file(sequences)
    train_model(model, training_file.name, *args, **kwargs)

135

class CandcTrainingError(Exception):
    """Raised when training a C&C supertagger model fails."""

138

Source Code for Module jazzparser.taggers.candc.training