1 """Interface to external C&C supertagger tools.
2
3 Uses the C&C tagger out of the box.
4 The C&C tagger must have been installed in the candc directory for this
5 to work. It must also have been trained on some data before it can be
6 used.
7
8 """
9 """
10 ============================== License ========================================
11 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
12
13 This file is part of The Jazz Parser.
14
15 The Jazz Parser is free software: you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation, either version 3 of the License, or
18 (at your option) any later version.
19
20 The Jazz Parser is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
24
25 You should have received a copy of the GNU General Public License
26 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
27
28 ============================ End license ======================================
29
30 """
31 __author__ = "Mark Wilding <mark.wilding@cantab.net>"
32
33 import os, logging, shutil
34 from subprocess import Popen, PIPE
35
36 from jazzparser import settings
37 from jazzparser.utils.base import group_pairs
38 from jazzparser.utils.options import ModuleOption
39 from jazzparser.utils.chords import interval_observation_from_chord_string_pair
40 from jazzparser.utils.probabilities import batch_sizes
41 from jazzparser.utils.strings import str_to_bool
42 from jazzparser.utils.loggers import create_logger
43 from jazzparser.utils.output import remove_ansi_colors
44 from jazzparser.taggers import Tagger, process_chord_input
45 from jazzparser.taggers.models import ModelTagger, TaggerModel
46 from jazzparser.taggers.chordmap import get_chord_mapping, \
47 get_chord_mapping_module_option
48 from jazzparser.data import Fraction
49 from .training import train_model_on_sequence_list
50 from .utils import read_tag_list
51
52
53 logger = logging.getLogger("main_logger")
56 """
57 This is really a fake: it doesn't actually save models itself, since we hand
58 over to the C&C tagger to do that. It provides the public methods of
59 tagger models so that we can use all the usual tagger training and
60 evaluation scripts without any special hacks.
61
62 """
# Identifier for this kind of tagger model.
MODEL_TYPE = 'candc'

# The chord-mapping option comes first, followed by every training option
# inherited from the generic tagger model.
TRAINING_OPTIONS = [get_chord_mapping_module_option()] \
    + TaggerModel.TRAINING_OPTIONS
70
71 - def train(self, input_data, grammar=None, logger=None):
86
87 @staticmethod
89 """ Override to provide non-standard behaviour """
90 return CandcTaggerModel(model_name)
91
94
99 tags = property(_get_tags)
100
101 @staticmethod
103 model_dir = settings.CANDC.MODELS_PATH
104 if not os.path.exists(model_dir):
105 return []
106 names = [name for name in os.listdir(model_dir) \
107 if not name.startswith(".")
108 and os.path.isdir(os.path.join(model_dir, name))]
109 model_names = []
110
111 for dirname in names:
112
113 subdirs = [name for name in os.listdir(os.path.join(model_dir, dirname)) \
114 if not name.startswith(".") \
115 and os.path.isdir(os.path.join(model_dir, dirname, name))]
116 if len(subdirs) > 0:
117 model_names.extend(["%s/%s" % (dirname,subdir) for subdir in subdirs])
118 else:
119 model_names.append(dirname)
120 return model_names
121
124
127 """
128 Superclass of both kinds of C&C tagger. Don't use this: use one
129 of the subclasses below.
130 """
MODEL_CLASS = CandcTaggerModel
COMPATIBLE_FORMALISMS = [
    'music_roman',
    'music_keyspan',
    'music_halfspan',
]
INPUT_TYPES = ['db', 'chords']

# Default value of the 'batch' tagger option declared below.
TAG_BATCH_RATIO = 0.8
# Default value of the 'unseen_tag_prob' tagger option declared below.
DEFAULT_UNSEEN_TAG_PROB = 0.001

# Options specific to the C&C taggers, followed by the options inherited
# from ModelTagger.
TAGGER_OPTIONS = [
    ModuleOption('batch', filter=float,
        help_text="Probability ratio between one tag and the next "
            "that allows the second to be returned in the same batch.",
        usage="batch=X, where X is a floating point value between 0 and 1",
        default=TAG_BATCH_RATIO),
    # The constructor splits this value on '.' to build the model's path
    # below the C&C models directory.
    ModuleOption('model',
        help_text="Name of the C&C trained model to use. Use the C&C "
            "training scripts to produce this.",
        usage="model=X, where X is the model name. Split up multi-level models with dots.",
        required=True),
    ModuleOption('unseen_tag_prob', filter=float,
        help_text="Probability mass reserved on each word so that some "
            "probability is assigned to tags never seen in the training "
            "set. This is a form of plus-n smoothing. "
            "Subtracted from the total probability of tags for "
            "each word and distributed evenly across all tags.",
        usage="unseen_tag_prob=X, where X is a floating point value between 0 and 1",
        default=DEFAULT_UNSEEN_TAG_PROB),
    ModuleOption('last_batch', filter=str_to_bool,
        help_text="Use all possible tags, including the last, lowest "
            "probability batch, which typically acts as a bin for "
            "all remaining tags",
        usage="last_batch=X, where X is 'true' or 'false'",
        default=True),
] + ModelTagger.TAGGER_OPTIONS
169
def __init__(self, grammar, input, options=None, dict_cutoff=5, *args, **kwargs):
    """
    Run the external C&C supertagger over the input and store its tags.

    The whole tagging job is done here: the installation and model are
    checked, the tagger binary named by C{self.command} is started with
    C{self.extra_args}, the input is sent to it in one go and its output
    is turned into C{self.tags} by C{self._tags_from_output}.

    @param grammar: passed straight through to the superclass constructor.
    @param input: passed straight through to the superclass constructor.
    @param options: tagger options dictionary; a fresh empty dictionary
        is used if none is given. The 'batch' and 'model' options are
        read here.
    @param dict_cutoff: value passed to the C&C command as
        C{--dict_cutoff}.
    @raise NotImplementedError: if CandcTagger itself (rather than a
        subclass) is instantiated.
    @raise CandcConfigurationError: if the C&C base directory, models
        directory, tagger executable or requested model does not exist.
    @raise CandcTaggingError: if communicating with the tagger fails or
        it exits with a negative return code.

    """
    # Never share one dictionary between instances via the default value
    if options is None:
        options = {}
    super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    # Exact type check on purpose: subclasses must be allowed through
    if type(self) is CandcTagger:
        raise NotImplementedError("Tried to instantiate CandcTagger "
            "directly. You should use one of its subclasses.")
    self.tag_batch_ratio = self.options['batch']
    # Multi-level model names are given with dots: one path component each
    model = self.options['model'].split('.')

    # Check that the C&C installation and the model are where we expect
    if not os.path.exists(settings.CANDC.BASE_PATH):
        raise CandcConfigurationError("The C&C parser base "
            "directory %s does not exist" % settings.CANDC.BASE_PATH)
    if not os.path.exists(settings.CANDC.MODELS_PATH):
        raise CandcConfigurationError("The C&C parser models "
            "directory %s does not exist" % settings.CANDC.MODELS_PATH)
    candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
    if not os.path.exists(candc_cmd):
        raise CandcConfigurationError("The C&C supertagger command "
            "%s does not exist. Have you built it?" % candc_cmd)

    candc_model = os.path.join(settings.CANDC.MODELS_PATH, *model)
    if not os.path.exists(candc_model):
        raise CandcConfigurationError("The C&C model given (%s) "
            "doesn't exist." % candc_model)

    # C&C's output goes to its own log file, named after the model
    logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
    candc_logger = create_logger(filename=logfile)
    self.logger.info("Logging C&C output to %s" % logfile)

    candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))

    # The model directory holds the list of tags the model knows about
    self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

    # Optional "key:value" lines stored alongside the model
    opts_filename = os.path.join(candc_model, "jpopts")
    if not os.path.exists(opts_filename):
        self.extra_opts = {}
    else:
        with open(opts_filename, 'r') as opts_file:
            # Blank lines (e.g. a trailing newline at the end of the
            # file) carry no option and would otherwise break dict()
            self.extra_opts = dict(
                [line.strip("\n").split(":", 1)
                        for line in opts_file
                        if line.strip()])

    self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

    # Build the tagger's input before starting the process, so that a
    # failure here cannot leave a C&C process behind waiting on stdin
    self.tokens = self.input
    # NOTE(review): the trailing None presumably gives the final chord a
    # partner to form a pair with - confirm against group_pairs
    intervals = [
        interval_observation_from_chord_string_pair(
            first, second, type_mapping=self.chordmap)
        for first, second in group_pairs(self.tokens + [None])
    ]
    # Each observation is sent in C&C's "word|tag" form with a dummy tag
    self.observations = ["%s|C" % obs for obs in intervals]

    candc_command = [candc_cmd, "--model", candc_model,
                     "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
    self.tagger = Popen(candc_command,
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
    candc_logger.info("C&C command: %s" % " ".join(candc_command))
    candc_logger.info("Input: %s" % " ".join(self.observations))

    # Send the whole sequence in one go and wait for the tagger to finish
    try:
        tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
    except OSError as err:
        logger.error("Could not run the C&C supertagger (%s)" % err)
        candc_logger.error("Error: %s" % err)
        # Report whatever the tagger managed to say before it failed
        error = self.tagger.stderr.read()
        logger.error("C&C returned the error: %s" % error)
        candc_logger.error("C&C error: %s" % error)
        raise CandcTaggingError("error running the C&C supertagger: %s" % error)

    # C&C colours its output: strip the escape codes
    tagger_out = remove_ansi_colors(tagger_out)
    tagger_err = remove_ansi_colors(tagger_err)

    # A negative return code from subprocess means the process was
    # killed by a signal
    return_code = self.tagger.returncode
    if return_code < 0:
        raise CandcTaggingError("The C&C tagger terminated with return code %s. "
            "Error output for the tagging: %s" % (return_code, tagger_err))

    # Log a readable version of the output, one numbered span per line
    output_lines = [line for line in tagger_out.replace("\t", ", ").split("\n")
                    if line.strip()]
    log_output = "\n".join(["%d-%d: %s" % (i, i + 1, outline)
                            for (i, outline) in enumerate(output_lines)])
    candc_logger.info("Output: %s" % log_output)
    candc_logger.info("Stderr output: %s" % tagger_err)

    self.tags = self._tags_from_output(tagger_out)

    # Drop entries for which no sign was produced
    self.tags = [
        [(sign, tag, prob) for (sign, tag, prob) in word_tags
                if sign is not None]
        for word_tags in self.tags]
278
282 input_length = property(_get_input_length)
283
285 batch_sizes = self.batch_sizes[index]
286
287 if self.options['last_batch']:
288
289 end_of_tags = len(batch_sizes)
290 else:
291
292 end_of_tags = len(batch_sizes) - 1
293
294 if offset >= end_of_tags:
295
296 return []
297
298 tags = self.tags[index]
299 if offset == 0:
300 returned_so_far = 0
301 else:
302 returned_so_far = sum(batch_sizes[:offset])
303 range_end = returned_so_far + batch_sizes[offset]
304
305 tag_probabilities = tags[returned_so_far:range_end]
306 return tag_probabilities
307
309 return self.tokens[index]
310
313 """
314 Uses the C&C supertagger component to get the best tag for each
315 word. Returns only one tag per word.
316 """
# Name of the C&C executable; the base-class constructor looks it up in
# the "bin" directory under the C&C base path.
command = "super"
# This tagger adds nothing to the basic C&C command line.
extra_args = []
319
322
338
340 """
341 Uses the C&C supertagger component to get multiple tags for each
342 word.
343 """
# Name of the C&C executable; the base-class constructor looks it up in
# the "bin" directory under the C&C base path.
command = "msuper"

# Appended to the C&C command line by the base-class constructor.
# NOTE(review): a beta of 0.0 presumably makes msuper return every tag
# rather than pruning unlikely ones - confirm against the C&C docs.
extra_args = ["--beta", "0.0"]

# Everything the base C&C tagger accepts, plus one option of our own.
TAGGER_OPTIONS = CandcTagger.TAGGER_OPTIONS + [
    ModuleOption(
        'ignore-unknown',
        filter=str_to_bool,
        help_text="Ignore any tags that the tagger returns but which "
                  "are not found in the grammar. By default, an error will "
                  "be thrown.",
        usage="ignore-unknown=True (default False)",
        default=False,
    ),
]
356
359
454
459