1 """Training interface to the C&C supertagger.
2
3 This automates the process of training the C&C supertagger on data
4 from the database. The data should first be generated using the
5 script in the annotator bin.
6
Training data should be in the Jazz Parser format, which differs
slightly from the C&C format. Instead of <obs>|<pos>|<tag>, each chord
should be represented as <chord>|<obs>|<pos>|<tag>. Use
generate_model_data to generate this from the database.
11
12 """
13 import os, shutil
14 from tempfile import NamedTemporaryFile
15 from subprocess import Popen, PIPE
16
17 from jazzparser import settings
18 from jazzparser.utils.data import holdout_partition
19 from jazzparser.utils.output import remove_ansi_colors
20 from .utils import training_data_to_candc, sequence_index_to_training_file, \
21 sequence_list_to_training_file, generate_tag_list
22 from jazzparser.data.db_mirrors import SequenceIndex
23
def train_model(model, data_filename, holdout_partitions=0, train_params=None,
                chordmap=None):
    """
    Train a C&C model by calling the C&C supertagger training routine.

    model should be a dot-separated model name to train/retrain. Each
    component of the name becomes a directory level below
    settings.CANDC.MODELS_PATH.
    data_filename should be the path to a training data file in the
    hybrid C&C format. It is converted with training_data_to_candc
    before being handed to the trainer.
    holdout_partitions, if non-zero, is passed to holdout_partition
    along with the training lines; one model is then trained per
    returned partition and stored in a "partition-<i>" subdirectory of
    the model. If zero, a single model is trained on all of the data.
    train_params is an optional dict of (string) parameter values to
    feed to the C&C trainer. These override the default parameters in
    settings.CANDC.DEFAULT_TRAINING_PARAMS. Set a parameter to None or
    an empty string to leave it off the command line, so that C&C's
    own default is used. Every other parameter is forwarded as
    "--<name> <value>" without further checking here.
    chordmap is accepted for interface compatibility; it is not used
    by this function.

    Raises CandcTrainingError if the train_super command exits with a
    non-zero status.

    """
    command = os.path.join(settings.CANDC.BASE_PATH, "bin", "train_super")

    # Start from the configured defaults and let the caller override them
    params = settings.CANDC.DEFAULT_TRAINING_PARAMS.copy()
    if train_params is not None:
        params.update(train_params)

    # Parameters set to None or "" are dropped so C&C's default applies
    extra_args = []
    for key, val in params.items():
        if val is not None and val != "":
            extra_args.extend(['--%s' % key, str(val)])

    def _train(dest_model, filename):
        """ Train a model using the train_super command. """
        model_path = os.path.join(settings.CANDC.MODELS_PATH, *(dest_model))
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # Written on every run, including when retraining an existing model
        generate_tag_list(os.path.join(model_path, "tags"))

        # NOTE(review): no shell is involved, so the escaped quotes around
        # the comment are passed to train_super literally.
        # NOTE(review): "//tagdict" is presumably resolved by C&C relative
        # to the model directory -- confirm against the C&C documentation.
        command_args = [command, "--model", model_path, "--comment",
            "\"Model trained automatically on chord data in file %s\"" % data_filename,
            "--input", filename,
            "--super-tagdict", "//tagdict"] + extra_args
        print("Running: %s" % " ".join(command_args))
        trainer = Popen(command_args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes while waiting. A bare wait() can
        # deadlock once the trainer has filled a pipe buffer.
        _, trainer_messages = trainer.communicate()
        if trainer.returncode != 0:
            raise CandcTrainingError("There was an error training a "
                "supertagger model from the file %s: \n%s" % (data_filename, trainer_messages))
        print("Trained model %s:\n%s" % (model_path, remove_ansi_colors(trainer_messages).strip("\n")))

    def _train_on_lines(dest_model, training_lines):
        """ Write the lines to a temporary file and train a model on it. """
        temp_file = NamedTemporaryFile()
        try:
            temp_file.write("\n".join(training_lines))
            # Make sure the data is on disk before the trainer opens the file
            temp_file.flush()
            _train(dest_model, temp_file.name)
        finally:
            # Closing removes the temporary file, even if training failed
            temp_file.close()

    in_file = open(data_filename, 'r')
    try:
        lines = in_file.readlines()
    finally:
        in_file.close()

    lines = training_data_to_candc(lines)

    # Dotted model names map onto nested directories
    model_name = model.split(".")

    if holdout_partitions:
        partitions = holdout_partition(lines, holdout_partitions)
        for i, partition in enumerate(partitions):
            print("Training partition %d" % i)
            _train_on_lines(model_name + ["partition-%s" % i], partition)
    else:
        _train_on_lines(model_name, lines)
103
115
125
135
138