jazzparser.data.input

1 from __future__ import absolute_import 2 """Wrappers for different types of input data. 3 4 """ 5 """ 6 ============================== License ======================================== 7 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 8 9 This file is part of The Jazz Parser. 10 11 The Jazz Parser is free software: you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation, either version 3 of the License, or 14 (at your option) any later version. 15 16 The Jazz Parser is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 20 21 You should have received a copy of the GNU General Public License 22 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 23 24 ============================ End license ====================================== 25 26 """ 27 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 28 29 import sys 30 from jazzparser.data import Fraction, Chord 31 from jazzparser.utils.options import ModuleOption 32 from jazzparser.data.db_mirrors import SequenceIndex 33 from jazzparser.utils.strings import str_to_bool, make_unique 34 from . import tools

class InputReader(object):
    """
    Root of the input class hierarchy. Both L{Input} (a single input) and
    L{BulkInput} (a collection of inputs) derive from this. New input
    types should subclass one of those two rather than this class.

    Only the bookkeeping shared by L{Input} and L{BulkInput} lives here:
    the list of options accepted when reading from a file and the list of
    shell tools available for the input type.

    """
    FILE_INPUT_OPTIONS = []
    SHELL_TOOLS = [ tools.InputTool() ]

    @classmethod
    def process_option_dict(cls, optdict):
        """
        Runs C{optdict} through L{ModuleOption.process_option_dict} using
        this class' C{FILE_INPUT_OPTIONS} as the option definitions and
        returns whatever that call returns.

        """
        accepted = cls.FILE_INPUT_OPTIONS
        return ModuleOption.process_option_dict(optdict, accepted)

    @staticmethod
    def from_file(filename, options={}):
        """
        Abstract: concrete input types override this to build an instance
        from the file at C{filename}.

        @raise NotImplementedError: always, on this base class

        """
        raise NotImplementedError("called from_file() on abstract base class")

56

class Input(InputReader):
    """
    Base class for wrappers around a single input.

    Subclasses are expected to override every method defined here.

    """
    def __init__(self, name=None):
        # Optional human-readable name; None means the input is unnamed
        self.name = name

    def __str__(self):
        if self.name is None:
            return "<input data>"
        return "<input '%s'>" % self.name

    def _get_string_name(self):
        # Always yields a printable name, even for unnamed inputs
        if self.name is not None:
            return self.name
        return "Unnamed sequence"
    string_name = property(_get_string_name)

    def __len__(self):
        message = "%s defines no len(). All input types "\
                  "should." % type(self).__name__
        raise NotImplementedError(message)

    def __getitem__(self, item):
        message = "%s does not support indexing. All input "\
                  "types should" % type(self).__name__
        raise NotImplementedError(message)

    def slice(self, start=None, end=None):
        """
        Takes a subsequence of the input. Subclasses should implement this
        so that the result is an input of the same type as the original.

        @raise NotImplementedError: always, on this base class

        """
        message = "%s does not support slicing. All input "\
                  "types should" % type(self).__name__
        raise NotImplementedError(message)

    def get_gold_analysis(self):
        """
        Returns the gold-standard analysis if the input carries one.
        This base implementation has none to offer and returns None.

        """
        return None

103

class DbInput(Input):
    """
    Wrapper for input that comes from the database rather than from the
    command line, so that db input need not be flattened to a string and
    then reinterpreted.

    At least one of C{times} and C{durations} must be supplied. Whichever
    one is missing is derived from the other: C{times} are computed from
    durations on the assumption that the first chord starts at time 0;
    C{durations} are computed from C{times} on the assumption that the
    last chord has length 1.

    The id of the originating chord sequence (C{id}) and the sequence
    representation itself (C{sequence}) are stored too. Either may be
    C{None} in some cases.

    For historical reasons C{inputs} holds the chord labels as strings,
    while C{chords} holds the db_mirrors representation of the chords.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('index', filter=int,
                     help_text="read the sequence with index (not id) X",
                     usage="index=X, where X is an int",
                     required=True),
    ]

    def __init__(self, inputs, durations=None, times=None, id=None,
                 chords=None, sequence=None, *args, **kwargs):
        super(DbInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.id = id
        self.chords = chords
        self.sequence = sequence

        if durations is None and times is None:
            raise ValueError("cannot create a DbInput with neither "
                             "times nor durations given")

        if times is None:
            # Onset of each chord is the running total of the durations
            # that precede it, starting from 0
            onsets = []
            elapsed = 0
            for length in durations:
                onsets.append(elapsed)
                elapsed = elapsed + length
            self.times = onsets
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            # NOTE(review): each pair is subtracted as (first - second);
            # confirm against group_pairs that this gives positive values
            lengths = [first - second
                       for (first, second) in group_pairs(times)]
            # The final chord has no successor: give it a length of 1
            lengths.append(Fraction(1))
            self.durations = lengths

    def get_gold_analysis(self):
        """
        Parses the annotations, if there are any, to produce a gold
        analysis. In contrast to L{AnnotatedDbInput}, annotations cannot
        be taken for granted on this input type, so a C{ParseError} from
        the annotation parser results in None being returned rather than
        an exception.

        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        from jazzparser.parsers import ParseError

        try:
            parses = parse_sequence_with_annotations(
                            self, get_grammar(), allow_subparses=False)
        except ParseError:
            return None
        return parses[0].semantics

    @staticmethod
    def from_sequence(seq):
        """
        Builds a DbInput out of a database representation of a sequence.

        """
        chord_list = list(seq)
        labels = [str(chord) for chord in chord_list]
        lengths = [chord.duration for chord in seq]
        return DbInput(labels,
                       durations=lengths,
                       name=seq.string_name,
                       id=seq.id,
                       chords=chord_list,
                       sequence=seq)

    def __str__(self):
        return " ".join("%s" % label for label in self.inputs)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        # Only slice the chords if we actually have some
        sub_chords = self.chords[start:end] if self.chords else None
        return DbInput(self.inputs[start:end],
                       self.durations[start:end],
                       self.times[start:end],
                       id=self.id,
                       name=self.name,
                       chords=sub_chords,
                       sequence=self.sequence)

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the sequence index file C{filename} and wraps the sequence
        found at position C{options['index']}.

        @raise InputReadError: if no sequence exists at that index

        """
        index = options['index']
        seq = SequenceIndex.from_file(filename).sequence_by_index(index)
        if seq is None:
            raise InputReadError("%d is not a valid sequence index in %s" %
                                 (index, filename))
        return DbInput.from_sequence(seq)

217

class WeightedChordLabelInput(Input):
    """
    Input wrapper for a lattice of chord labels, including a set of chord
    labels for each timestep, each with a probability. The labels themselves
    are similar to the chord in a L{DbInput} and are represented using
    L{jazzparser.misc.chordlabel.data.ChordLabel}s.

    The lattice should be a list of timesteps. Each timestep should be a
    list of (label,prob) tuples, where label is a C{ChordLabel} and prob is
    a probability.

    """
    FILE_INPUT_OPTIONS = []

    def __init__(self, lattice, *args, **kwargs):
        super(WeightedChordLabelInput, self).__init__(*args, **kwargs)
        # Store every timestep ordered by descending probability.
        # reversed(sorted(...)) is kept (rather than reverse=True) so that
        # the relative order of equal-probability entries is unchanged.
        self.lattice = [
            list(reversed(sorted(timestep, key=lambda x: x[1])))
            for timestep in lattice]

    def __str__(self):
        return "<Lattice:%s\n>" % "".join(["\n %d: %s" % (t,
            ", ".join(["%s (%.2e)" % (label, prob) for (label, prob) in timestep]))
                for t, timestep in enumerate(self.lattice)])

    def __repr__(self):
        return "<Lattice (%d)>" % len(self)

    def __len__(self):
        return len(self.lattice)

    def __getitem__(self, item):
        return self.lattice[item]

    def slice(self, start=None, end=None):
        """
        Returns a new lattice input covering timesteps C{start} to C{end}.
        The name of the input is carried over to the slice, as the other
        input types do.

        """
        return WeightedChordLabelInput(self.lattice[start:end],
                                       name=self.name)

    def apply_ratio_beam(self, ratio=1e-4):
        """
        Applies a beam to remove all values from the lattice whose probability
        is less than the given ratio of the highest probability for that
        timestep. The lattice is modified in place. Timesteps with no
        entries are left alone.

        @type ratio: float
        @param ratio: the max ratio of a probability to the highest
            probability in the timestep

        """
        for timestep in self.lattice:
            if not timestep:
                # Nothing to prune (and no highest probability to read)
                continue
            # Timesteps are sorted, so the first entry is the most probable
            min_prob = timestep[0][1] * ratio
            # Replace the contents of the same list object in one pass
            timestep[:] = [entry for entry in timestep
                           if not (entry[1] < min_prob)]

    @staticmethod
    def from_file(filename, options={}):
        """
        Loads a lattice from C{filename} by unpickling it.

        """
        # NOTE(review): unpickling executes arbitrary code from the file;
        # only use this on lattice files from a trusted source
        import cPickle as pickle
        f = open(filename, 'r')
        try:
            lattice = pickle.load(f)
        finally:
            # Previously the file handle was never closed
            f.close()
        return WeightedChordLabelInput(lattice)

281

class ChordInput(Input):
    """
    Input wrapper for textual chord input.

    This is the simplest type of input, usually taken from the command line.

    You must provide a list of chord symbols and either a list of durations
    or a list of times when constructing this. To process pure text (which
    includes computing durations/times and splitting up chords), use
    L{ChordInput.from_string}.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self, inputs, durations=None, times=None, roman=False,
                 *args, **kwargs):
        """
        @type inputs: list of strings
        @param inputs: the chord symbols
        @param durations: length of each chord (optional if C{times} given)
        @param times: onset of each chord (optional if C{durations} given)
        @type roman: bool
        @param roman: passed to L{Chord.from_name} when interpreting the
            chord symbols
        @raise ValueError: if neither C{durations} nor C{times} is given

        """
        super(ChordInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.roman = roman

        # Compute the durations from times or vice versa
        if durations is None and times is None:
            raise ValueError("cannot create a ChordInput with neither "
                             "times nor durations given")
        elif times is None:
            # Running total of the preceding durations, starting at 0
            onsets = []
            elapsed = Fraction(0)
            for length in durations:
                onsets.append(elapsed)
                elapsed = elapsed + length
            self.times = onsets
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            # NOTE(review): each pair is subtracted as (first - second);
            # confirm against group_pairs that this gives positive values
            self.durations = [first - second
                              for (first, second) in group_pairs(times)] \
                             + [Fraction(1)]

        # Convert all strings to internal chord representation
        # Done now so we check the chords can all be understood before doing
        # anything else
        self.chords = [
            Chord.from_name(name, roman=roman).to_db_mirror()
            for name in inputs
        ]
        for chord, dur in zip(self.chords, self.durations):
            chord.duration = dur

    @staticmethod
    def from_string(input, name="<string input>", roman=False):
        """
        Produce a wrapped-up version of the input directly from an input string,
        which may come, for example, from the command line.

        """
        from jazzparser.utils.input import assign_durations, strip_input
        # Get durations from the original string before doing anything else
        durations = assign_durations(input)
        # Remove unwanted characters from the string
        input = strip_input(input)
        # Tokenise the string
        chords = input.split()
        return ChordInput(chords, durations=durations, name=name, roman=roman)

    def __str__(self):
        return " ".join(["%s" % i for i in self.inputs])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        """
        Returns a ChordInput for the chords from C{start} to C{end}.

        """
        # roman must be carried over: the constructor re-parses the chord
        # symbols, and without it roman-numeral input would be read as
        # note names
        return ChordInput(self.inputs[start:end],
                          self.durations[start:end],
                          self.times[start:end],
                          roman=self.roman,
                          name=self.name)

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the whole of C{filename} as a single chord sequence.

        """
        f = open(filename, 'r')
        try:
            data = f.read()
        finally:
            f.close()
        # Fall back to the option's declared default when the caller
        # passes an unprocessed option dict
        return ChordInput.from_string(data, name="File: %s" % filename,
                                      roman=options.get('roman', False))

    def to_db_input(self):
        """
        This data type is useful for reading textual input. For internal
        processing, however, it can be converted to a L{DbInput}, which
        is generally more convenient to handle.

        """
        return DbInput(self.inputs, durations=self.durations, chords=self.chords)

381

class SegmentedMidiInput(Input):
    """
    Input wrapper for MIDI files with extra information about segmentation,
    in the form it's needed for the Raphael and Stoddard model and midi
    supertagging models: that is, offset (start of first bar) and bar length.

    Each segment is a midi L{midi.EventStream}. It also has the additional
    attribute C{segment_start}, giving the tick time at which the segment
    begins in the original midi stream.

    Optionally also stores a gold standard analysis in the form of a
    db annotated chord sequence: see L{AnnotatedDbInput}.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('time_unit', filter=float,
                     help_text="number of beats (by the MIDI file resolution) "\
                        "to take to be one time unit",
                     usage="time_unit=X, where X is an int or float",
                     required=False,
                     default=4),
        ModuleOption('tick_offset', filter=int,
                     help_text="time in MIDI ticks at which the first time "\
                        "unit begins",
                     usage="tick_offset=X, where X is an int",
                     required=False,
                     default=0),
        ModuleOption('truncate', filter=int,
                     help_text="truncate the input to this length.",
                     usage="truncate=L, where L is an integer"),
    ]
    SHELL_TOOLS = Input.SHELL_TOOLS + [
        tools.PlayMidiChunksTool(),
        tools.PrintMidiChunksTool()
    ]

    def __init__(self, inputs, time_unit=4, tick_offset=0, stream=None,
                 gold=None, sequence_index=None, *args, **kwargs):
        """

        @type inputs: list of L{midi.EventStream}s
        @param inputs: the midi data segments
        @type time_unit: int or float
        @param time_unit: number of beats to take as the basic unit
            of time for observations
        @type tick_offset: int
        @param tick_offset: number of ticks after which the first bar begins
        @param stream: the midi stream the segments were cut from
        @param gold: gold standard analysis, if there is one
        @param sequence_index: index of the gold sequence, if there is one

        """
        super(SegmentedMidiInput, self).__init__(*args, **kwargs)

        self.stream = stream
        self.time_unit = time_unit
        self.tick_offset = tick_offset
        self.inputs = inputs
        self.gold = gold
        self.sequence_index = sequence_index

        # stream is optional in the signature, so don't dereference None
        if stream is None:
            self.tick_unit = None
        else:
            self.tick_unit = int(stream.resolution*time_unit)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def __str__(self):
        if self.name is not None:
            return "<MIDI: %s (%d)>" % (self.name, len(self))
        else:
            return "<MIDI: %d chunks>" % len(self)

    def slice(self, start=None, end=None):
        """
        Returns a SegmentedMidiInput holding segments C{start} to C{end},
        with the same segmentation parameters, stream, name and sequence
        index. The gold analysis is not carried over.

        """
        # This used to pass durations/times, which this class neither
        # stores nor accepts, so it could never succeed
        return SegmentedMidiInput(self.inputs[start:end],
                                  time_unit=self.time_unit,
                                  tick_offset=self.tick_offset,
                                  name=self.name,
                                  stream=self.stream,
                                  sequence_index=self.sequence_index)

    def get_gold_analysis(self):
        # This may be None if no analysis was in the input
        return self.gold

    @staticmethod
    def from_file(filename, options={}, gold=None, sequence_index=None):
        """
        Reads the midi file C{filename} and segments it using the
        C{time_unit}, C{tick_offset} and C{truncate} options.

        """
        from midi import read_midifile
        from os.path import basename
        # Read and parse the midi file
        stream = read_midifile(filename)
        # Use the filename as an identifier
        name = basename(filename)

        # Fall back to the declared option defaults for anything missing
        return SegmentedMidiInput.from_stream(
                            stream,
                            time_unit=options.get('time_unit', 4),
                            tick_offset=options.get('tick_offset', 0),
                            name=name,
                            truncate=options.get('truncate'),
                            gold=gold,
                            only_notes=True,
                            sequence_index=sequence_index)

    @staticmethod
    def _note_segments(stream, tick_offset, end_time, tick_unit):
        # Cuts the stream into segments containing only note on/off events
        from midi import EventStream, NoteOnEvent, NoteOffEvent, EndOfTrackEvent
        events = iter([ev for ev in sorted(stream.trackpool)
                       if type(ev) in [NoteOnEvent, NoteOffEvent]])
        try:
            current_event = next(events)
            # Get up to the start point in the stream
            while current_event.tick < tick_offset:
                current_event = next(events)
        except StopIteration:
            # Got to the end of the stream before we even started
            return []

        segments = []
        for chunk_start in range(tick_offset, end_time, tick_unit):
            chunk_end = chunk_start+tick_unit
            slc = EventStream()
            slc.add_track()
            slc.format = stream.format
            slc.resolution = stream.resolution
            slc.segment_start = chunk_start

            # Add all the note events in this time period
            exhausted = False
            try:
                while current_event.tick < chunk_end:
                    slc.add_event(current_event)
                    current_event = next(events)
            except StopIteration:
                # Reached the end of the stream
                exhausted = True

            # Every segment, including the last, gets its end of track
            # event: the empty-segment check relies on it being there
            eot = EndOfTrackEvent()
            eot.tick = chunk_end
            slc.add_event(eot)
            segments.append(slc)
            if exhausted:
                break
        return segments

    @staticmethod
    def _sliced_segments(stream, tick_offset, end_time, tick_unit):
        # Cuts the stream into segments using EventStreamSlice
        from midi.slice import EventStreamSlice
        # First slice starts at the offset value
        start_times = range(tick_offset, end_time, tick_unit)
        slices = [EventStreamSlice(stream,
                                   chunk_start,
                                   chunk_start+tick_unit)
                  for chunk_start in start_times]
        segments = [slc.to_event_stream(repeat_playing=False,
                                        cancel_playing=False)
                    for slc in slices]
        # Associate the start time with each segment
        for slc, start_time in zip(segments, start_times):
            slc.segment_start = start_time
        return segments

    @staticmethod
    def _trim_empty_segments(segments):
        # Drops empty segments from both ends of the list
        # There's always one event - the end of track
        first = 0
        while first < len(segments) and len(segments[first].trackpool) < 2:
            first += 1
        last = len(segments)
        while last > first and len(segments[last-1].trackpool) < 2:
            last -= 1
        return segments[first:last]

    @staticmethod
    def from_stream(stream, time_unit=4, tick_offset=0, name=None,
                    only_notes=True, truncate=None, gold=None,
                    sequence_index=None):
        """
        Creates a L{SegmentedMidiInput} from a midi event stream. Empty
        segments at the start and end are removed. If nothing but empty
        segments is found, the result has no segments.

        @type only_notes: bool
        @param only_notes: if True, only includes note-on/note-off events in
            the segments. If False, the stream will be sliced so that each
            segment repeats things like program change events at the beginning.
            Including only notes, however, makes the preprocessing very much
            faster
        @type truncate: int
        @param truncate: if given, keep at most this many segments

        """
        # Divide the stream up into slices of the right size
        # Number of ticks in each slice
        tick_unit = int(stream.resolution*time_unit)
        if len(stream.trackpool) == 0:
            end_time = 0
        else:
            end_time = max(stream.trackpool).tick

        if only_notes:
            # Only include notes in the stream
            # This is much simpler and faster than the alternative
            inputs = SegmentedMidiInput._note_segments(
                                stream, tick_offset, end_time, tick_unit)
        else:
            inputs = SegmentedMidiInput._sliced_segments(
                                stream, tick_offset, end_time, tick_unit)

        # Remove empty segments from the start and end
        inputs = SegmentedMidiInput._trim_empty_segments(inputs)

        if truncate is not None:
            inputs = inputs[:truncate]

        return SegmentedMidiInput(inputs,
                                  time_unit=time_unit,
                                  tick_offset=tick_offset,
                                  name=name,
                                  stream=stream,
                                  gold=gold,
                                  sequence_index=sequence_index)

587

class AnnotatedDbInput(DbInput):
    """
    A L{DbInput} that additionally keeps the category annotations that go
    with the chords.

    """
    FILE_INPUT_OPTIONS = DbInput.FILE_INPUT_OPTIONS

    def __init__(self, *args, **kwargs):
        # categories is our own keyword: take it out before the remaining
        # arguments go up to DbInput
        self.categories = kwargs.pop('categories', [])
        super(AnnotatedDbInput, self).__init__(*args, **kwargs)

        num_chords = len(self)
        if len(self.categories) != num_chords:
            raise InputTypeError("there must be the same number of category "
                                 "annotations as chords")

    def get_gold_analysis(self):
        """
        Produces the gold analysis by parsing the annotations. Errors from
        the annotation parser are not caught here.

        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        grammar = get_grammar()
        parses = parse_sequence_with_annotations(
                            self, grammar, allow_subparses=False)
        return parses[0].semantics

    @staticmethod
    def from_sequence(seq):
        """
        Builds an AnnotatedDbInput out of a database representation of a
        sequence.

        """
        labels = [str(chord) for chord in seq]
        chord_list = list(seq.iterator())
        lengths = [chord.duration for chord in seq]
        annotations = [chord.category for chord in seq]
        return AnnotatedDbInput(labels,
                                durations=lengths,
                                name=seq.string_name,
                                id=seq.id,
                                categories=annotations,
                                sequence=seq,
                                chords=chord_list)

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the sequence index file C{filename} and wraps the sequence
        found at position C{options['index']}.

        @raise InputReadError: if no sequence exists at that index

        """
        index = options['index']
        seq = SequenceIndex.from_file(filename).sequence_by_index(index)
        if seq is None:
            raise InputReadError("%d is not a valid sequence index in %s" %
                                 (index, filename))
        return AnnotatedDbInput.from_sequence(seq)

# Registry of the single-input types: (type name, wrapper class) pairs
INPUT_TYPES = [
    ('db', DbInput),
    ('db-annotated', AnnotatedDbInput),
    ('chords', ChordInput),
    ('segmidi', SegmentedMidiInput),
    ('labels', WeightedChordLabelInput),
]

class BulkInput(InputReader):
    """
    Ways of accepting multiple inputs at once. These types can be used by the
    parser script, which will iterate over the component inputs.

    The classes should be iterable and iterate over the inputs. The
    methods here read the inputs from C{self.inputs}.

    """
    INPUT_TYPE = None

    def __iter__(self):
        return iter(self.inputs)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, i):
        return self.inputs[i]

    def subset(self, *ranges):
        """
        Returns an object of the same type containing the data points in the
        given ranges, given as [start,end) pairs. Give multiple ranges as
        successive arguments.

        A default implementation is provided, but subclasses may want to
        provide their own if this is not appropriate.

        """
        selected = []
        for (start, end) in ranges:
            selected.extend(self.inputs[start:end])
        return type(self)(selected)

    def get_partitions(self, num_partitions):
        """
        Generate an n-way partition and the corresponding heldout sets for
        the data set. The objects returned are of the same bulk input
        type.

        @type num_partitions: int
        @param num_partitions: number of partitions to split the data into
        @return: ([part0, part1, ...], [rest0, rest1, ...])

        """
        # Floor division: the result is used as a slice index, so it must
        # stay an int whatever the division semantics in force
        partition_size = len(self) // num_partitions
        partitions = []
        heldout_sets = []

        # Get each equally-sized partition (all but the last)
        for parti in range(num_partitions-1):
            part_start = partition_size*parti
            part_end = partition_size*(parti+1)
            partitions.append(self.subset((part_start, part_end)))
            # Get the set of inputs not in partition parti
            heldout_sets.append(self.subset(
                (0, part_start),
                (part_end, None)
            ))

        # Last partition: throw in everything that's left
        last_start = partition_size*(num_partitions-1)
        partitions.append(self.subset((last_start, None)))
        heldout_sets.append(self.subset((0, last_start)))
        return (partitions, heldout_sets)

    def get_identifiers(self):
        """
        Returns a list containing a string identifier for each input. What
        this is depends on the input type. At its simplest it may be just an
        integer. In cases where something more informative is available (e.g.
        a filename), this will be returned instead.

        Whatever happens, each input will have a unique identifier. This is
        useful, for example, for creating an output file for each input.

        """
        # Try getting a name for each
        ids = [inp.name for inp in self]
        # Replace any None or blank names with an id
        ids = [name or str(i) for i, name in enumerate(ids)]
        # Check that the names are all unique and append ints if not
        return make_unique(ids)

723

class DbBulkInput(BulkInput):
    """
    A collection of chord sequences, as found in a sequence index file,
    each wrapped as a L{DbInput}.

    """
    INPUT_TYPE = DbInput

    def __init__(self, inputs):
        self.inputs = inputs

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the sequence index file C{filename} and wraps every sequence
        in it.

        """
        index_file = SequenceIndex.from_file(filename)
        wrapped = []
        for seq in index_file:
            wrapped.append(DbInput.from_sequence(seq))
        return DbBulkInput(wrapped)

    @property
    def sequences(self):
        # The sequence attribute of each wrapped input, in order
        return [wrapped.sequence for wrapped in self.inputs]

745

class ChordBulkInput(BulkInput):
    """
    A file containing a list of textual chord sequences. This used to be
    provided fully in the top-level parser script as input processing.

    """
    INPUT_TYPE = ChordInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('start', filter=int,
                     help_text="line number to start reading from",
                     usage="start=X, where X is an int"),
        ModuleOption('end', filter=int,
                     help_text="line number at which to stop reading",
                     usage="end=X, where X is an int"),
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self, inputs, output_lines=None):
        self.inputs = inputs
        # Maps the index of an input to a comment line read just before it
        self.output_lines = output_lines

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads chord sequences from C{filename}, one per line.

        Lines starting C{>>} are output comments and are stored in
        C{output_lines} against the index of the sequence that follows.
        A line starting C{>>=} additionally names the next sequence.
        Lines starting C{//} are comments and blank lines are skipped.
        Every other line is read as a chord sequence.

        """
        f = open(filename, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()
        lines = [l.rstrip("\n") for l in lines]

        # Use the start and end line numbers if they were given. Both are
        # applied to the original line numbering in a single slice:
        # applying end after start had made end relative to start.
        lines = lines[options.get('start'):options.get('end')]
        roman = options.get('roman', False)

        # Do all the preprocessing
        output_lines = {}
        inputs = []
        sequence_name = None
        for line in lines:
            # If this is an output comment, output it and move to the next item
            if line.startswith(">>"):
                # If this is also a name definition, use it for the next sequence
                if line[2:].startswith("="):
                    # The newline has already been stripped, so take
                    # everything after the marker: [3:-1] lost the last
                    # character of the name
                    sequence_name = line[3:]
                    output_lines[len(inputs)] = line[3:]
                else:
                    output_lines[len(inputs)] = line[2:]
                continue
            elif line.startswith("//"):
                # Non-printing comment
                # This could also be a name definition
                # NOTE(review): this stores the text as an output line and
                # does not set the sequence name, which looks at odds with
                # the two comments above -- confirm the intended behaviour
                if line[2:].startswith("="):
                    output_lines[len(inputs)] = line[3:]
                continue
            elif len(line.strip()) == 0:
                # Ignore blank lines
                continue
            else:
                # Otherwise it's an actual chord sequence
                inputs.append(ChordInput.from_string(line,
                                                     name=sequence_name,
                                                     roman=roman))
                # Reset the sequence name
                sequence_name = None
        return ChordBulkInput(inputs, output_lines=output_lines)

    def to_db_inputs(self):
        """
        @see: L{ChordInput.to_db_input}
        """
        return DbBulkInput([chords.to_db_input() for chords in self.inputs])

823

class SegmentedMidiBulkInput(BulkInput):
    """
    A CSV file containing midi file paths and the parameters for segmenting
    each one.

    May store an index of a gold analysis with each input. This should appear
    in column 4. If these are given, the first line of the file should specify
    the path to the sequence input file as follows::

     GOLD: <relative path>

    Columns: filename, time unit, tick offset, ignore (bool, optional), gold id (int, optional)

    """
    INPUT_TYPE = SegmentedMidiInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('truncate', filter=int,
                     help_text="truncate each input to this length.",
                     usage="truncate=L, where L is an integer")]
    SHELL_TOOLS = BulkInput.SHELL_TOOLS + [ tools.PlayBulkMidiChunksTool() ]

    def __init__(self, inputs):
        self.inputs = inputs

    def __str__(self):
        return "<bulk midi: %s>" % (" ".join([str(mid) for mid in self.inputs]))

    @staticmethod
    def writeln(csv, filename, time_unit=None, tick_offset=0, ignore=False,
                seq_index=None):
        """
        Writes a line to a segmidi bulk input file, opened as a CSV writer.

        @param csv: the CSV writer to write the row to
        @param time_unit: written as "2" if not given

        """
        row = [
            "%s" % filename,
            "%f" % time_unit if time_unit else "2",
            "%d" % tick_offset,
            "TRUE" if ignore else "",
            "%d" % seq_index if seq_index is not None else ""
        ]
        csv.writerow(row)

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the CSV file C{filename} and loads every midi file listed in
        it that is not marked to be ignored. Midi and gold paths are
        resolved relative to the directory containing the CSV file. Blank
        rows are skipped; a missing time unit or tick offset column is
        treated like an empty one (defaults 2.0 and 0).

        """
        import csv, os
        # Read in the CSV file
        infile = open(filename, 'r')
        try:
            data = list(csv.reader(infile))
        finally:
            infile.close()

        base_path = os.path.abspath(os.path.dirname(filename))

        # Check the first line of the file for GOLD input
        gold_marker = "GOLD:"
        gold_data = None
        if data and data[0] and data[0][0].startswith(gold_marker):
            # Cut off the marker itself. lstrip() would remove any run of
            # the characters G, O, L, D and : and so could eat into a path
            # that follows the marker directly.
            gold_path = data[0][0][len(gold_marker):].strip()
            gold_path = os.path.join(base_path, gold_path)
            # Load the annotated data
            gold_data = AnnotatedDbBulkInput.from_file(gold_path)
            # Ignore this first line now
            data = data[1:]

        # Read once, so the caller's options are never shadowed below
        truncate = options.get('truncate')

        def column(row, index):
            # Missing trailing columns are treated as empty
            if index < len(row):
                return row[index]
            return ""

        # Read the file's data and process it
        inputs = []
        for row in data:
            if not row:
                # The csv reader yields an empty list for a blank line
                continue

            # Optional col 4 allows us to ignore rows for training while
            # keeping their parameters in the file
            if len(row) > 3 and str_to_bool(row[3]):
                continue

            # Locate the midi file
            midi = os.path.join(base_path, row[0])

            # Prepare the parameters
            time_unit_text = column(row, 1)
            time_unit = float(time_unit_text) if time_unit_text else 2.0
            tick_offset_text = column(row, 2)
            tick_offset = int(tick_offset_text) if tick_offset_text else 0

            if len(row) > 4 and gold_data is not None and row[4].strip():
                # A gold sequence analysis was given: load it up
                seq_index = int(row[4])
                gold = gold_data[seq_index].get_gold_analysis()
            else:
                seq_index = None
                gold = None

            midi_options = SegmentedMidiInput.process_option_dict({
                'time_unit' : time_unit,
                'tick_offset' : tick_offset,
                'truncate' : truncate,
            })
            inputs.append(
                SegmentedMidiInput.from_file(midi, options=midi_options,
                                gold=gold, sequence_index=seq_index))
        return SegmentedMidiBulkInput(inputs)

934

class AnnotatedDbBulkInput(DbBulkInput):
    """
    The L{AnnotatedDbInput} counterpart of L{DbBulkInput}.

    """
    INPUT_TYPE = AnnotatedDbInput

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the sequence index file C{filename} and wraps every sequence
        in it as an L{AnnotatedDbInput}.

        """
        index_file = SequenceIndex.from_file(filename)
        wrapped = []
        for seq in index_file:
            wrapped.append(AnnotatedDbInput.from_sequence(seq))
        return AnnotatedDbBulkInput(wrapped)

948

class MidiTaggerTrainingBulkInput(SegmentedMidiBulkInput):
    """
    Subclass of L{SegmentedMidiBulkInput} for taking training input for midi
    supertaggers. This is identical to L{SegmentedMidiBulkInput}, but has an
    additional option C{chords} to specify a path from which to read a
    L{AnnotatedDbBulkInput}. This may be used by the training procedure to initialize
    or train parameters, in addition to the main midi training input.

    Accepts additionally all options accepted by L{AnnotatedDbBulkInput}. These will
    be passed on to L{AnnotatedDbBulkInput} when it's read in.

    """
    FILE_INPUT_OPTIONS = \
        SegmentedMidiBulkInput.FILE_INPUT_OPTIONS + \
        [ModuleOption('chords',
                      help_text="path from which to read a bulk-db input, "\
                        "which may be used in addition to the midi training "\
                        "data by the training procedure",
                      usage="chords=F, where F is an filename")] + \
        AnnotatedDbBulkInput.FILE_INPUT_OPTIONS

    def __init__(self, inputs, chords=None):
        self.inputs = inputs
        # The additional chord data (or None if there isn't any)
        self.chords = chords

    @staticmethod
    def from_file(filename, options={}):
        """
        Reads the midi data from C{filename} as L{SegmentedMidiBulkInput}
        does and, if the C{chords} option is set, also reads an
        L{AnnotatedDbBulkInput} from that path. The C{options} dictionary
        passed in is not modified.

        """
        # Work on a copy: options get popped below, which must not touch
        # the caller's dict or the shared default argument
        options = dict(options)

        chords_path = options.get('chords')
        if chords_path is not None:
            # Read in the AnnotatedDbBulkInput from this file
            # Take AnnotatedDbBulkInput's options out of the option dict
            dboptions = {}
            for dbopt in AnnotatedDbBulkInput.FILE_INPUT_OPTIONS:
                if dbopt.name in options:
                    dboptions[dbopt.name] = options.pop(dbopt.name)
            chords = AnnotatedDbBulkInput.from_file(chords_path,
                                                    options=dboptions)
        else:
            chords = None
        # Read the main midi data just as SegmentedMidiBulkInput does
        main_data = SegmentedMidiBulkInput.from_file(filename, options)
        return MidiTaggerTrainingBulkInput(main_data.inputs, chords=chords)

    def subset(self, *ranges):
        # Custom implementation so subsets get the chord input
        return MidiTaggerTrainingBulkInput(
            sum([self.inputs[start:end] for (start, end) in ranges], []),
            chords=self.chords)

# Registry of the bulk input types: (type name, wrapper class) pairs.
# The lookup functions in this module search it together with INPUT_TYPES.
BULK_INPUT_TYPES = [
    ('bulk-db', DbBulkInput),
    ('bulk-db-annotated', AnnotatedDbBulkInput),
    ('bulk-chords', ChordBulkInput),
    ('bulk-segmidi', SegmentedMidiBulkInput),
    ('bulk-midi-train', MidiTaggerTrainingBulkInput),
]

class InputTypeError(Exception):
    """
    Raised when input is of an unrecognised type, or of a type that is not
    among those allowed (see L{detect_input_type} and
    L{command_line_input}).

    """

1008

class InputReadError(Exception):
    """
    Error type for problems reading input data.

    NOTE(review): nothing in this part of the module raises it; presumably
    the individual input readers do - confirm against them.

    """

1011

def input_type_name(cls):
    """
    Reverse lookup in the input type registries.

    @param cls: input wrapper class to look up
    @rtype: str or None
    @return: the name under which C{cls} is registered in C{INPUT_TYPES} or
        C{BULK_INPUT_TYPES} (first match wins), or C{None} if it is not
        registered

    """
    registry = INPUT_TYPES + BULK_INPUT_TYPES
    names = [typename for (typename, registered) in registry
             if registered == cls]
    return names[0] if names else None

1017

def get_input_type(name):
    """
    Looks up an input wrapper class by its type name.

    @type name: str
    @param name: type name, as registered in C{INPUT_TYPES} or
        C{BULK_INPUT_TYPES}
    @return: the class registered under C{name} (first match wins), or
        C{None} if there is no such type

    """
    registry = INPUT_TYPES + BULK_INPUT_TYPES
    classes = [registered for (typename, registered) in registry
               if typename == name]
    if not classes:
        return None
    return classes[0]

1023

def get_input_type_names(single=True, bulk=True):
    """
    Lists the names of the registered input types.

    @type single: bool
    @param single: include the single-input types (C{INPUT_TYPES})
    @type bulk: bool
    @param bulk: include the bulk input types (C{BULK_INPUT_TYPES})
    @rtype: tuple of strs
    @return: the type names, single types first. Empty if both C{single}
        and C{bulk} are false.

    """
    types = []
    if single:
        types.extend(INPUT_TYPES)
    if bulk:
        types.extend(BULK_INPUT_TYPES)
    # Build the tuple directly rather than via zip(*types)[0]: that fails
    # on an empty selection and zip objects are not subscriptable on
    # Python 3
    return tuple(typename for (typename, _cls) in types)

1027

def is_bulk_type(cls):
    """
    Tells whether an input type is a bulk input type.

    @param cls: the class to test
    @rtype: bool
    @return: C{True} if C{cls} is L{BulkInput} or a subclass of it

    """
    derives_from_bulk = issubclass(cls, BulkInput)
    return derives_from_bulk

1030

def detect_input_type(data, allowed=None, allow_bulk=False, errmess=""):
    """
    Preprocesses input.

    The input may already be wrapped using one of the wrappers in this
    module, or it may be a string. A string is wrapped using
    L{ChordInput.from_string}; wrapped input is returned unchanged.

    @param data: a string or an instance of one of the registered input
        types
    @type allowed: list of input type names
    @param allowed: (optional) list of data types that are allowed. If the
        data is in a recognised format, but not one of these, an error will
        be raised. The check is made against the detected type name (for
        bulk input, the name of the bulk type).
    @type allow_bulk: bool
    @param allow_bulk: if True, the bulk input types are accepted as well
    @type errmess: str
    @param errmess: additional error message to include in the output when
        a disallowed type is encountered. The message reads something like
        "input of type <type> is not allowed<errmess>..."
    @rtype: (type name, input) pair
    @return: the identified input type and the wrapped-up input, ready to be
        used by a tagger
    @raise InputTypeError: if the type of C{data} is not recognised, or is
        not in C{allowed}

    """
    if type(data) == str:
        # Handle strings by wrapping them up in a ChordInput
        datatype = 'chords'
        data = ChordInput.from_string(data)
    else:
        # Other types should already be wrapped. Single types are searched
        # first, then (only if permitted) the bulk types.
        candidates = list(INPUT_TYPES)
        if allow_bulk:
            candidates.extend(BULK_INPUT_TYPES)

        for typename, cls in candidates:
            # Exact type match on purpose: some registered types subclass
            # others (e.g. AnnotatedDbBulkInput derives from DbBulkInput),
            # so isinstance() could report the wrong name
            if type(data) == cls:
                datatype = typename
                break
        else:
            # No valid wrapped type was found
            raise InputTypeError("invalid input type: %s%s" %
                                 (type(data).__name__, errmess))

    if allowed is not None and datatype not in allowed:
        raise InputTypeError(
            "input of type '%s' is not allowed%s. Allowed "
            "types are: %s" % (datatype, errmess, ", ".join(allowed)))
    return (datatype, data)

1085

def command_line_input(filename=None, filetype=None, options="",
                       allowed_types=None, default_type=None):
    """
    Utility function for processing file input options from the command line.
    Pass in as args the values straight from the command line options to
    select a filename, filetype and list of options.

    Typical command-line options for this purpose (for an optparse option parser C{op})::
      op.add_option("--file", "-f", dest="file", action="store", help="use a file to get input from")
      op.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Use '--filetype help' for a list of available types")
      op.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Use '--fopt help', with '--ft <type>', for a list of available options")
    Then you can call this function as::
      command_line_input(filename=options.file, filetype=options.filetype, options=options.file_options)

    If C{filetype} (or C{options}) is the string "help", the corresponding
    help text is printed and the process exits with status 0.

    @type filename: str
    @param filename: path of the input file
    @type filetype: str
    @param filetype: name of the input type to read the file as
    @type options: str
    @param options: option string for the input type
    @type allowed_types: list of strs
    @param allowed_types: types of input you want the user to be able to give.
        If not given, all types are allowed
    @type default_type: str
    @param default_type: filetype to assume if no other filetype is given
    @rtype: L{InputReader} subclass
    @return: the input wrapper of appropriate type, or None if no input file
        was given
    @raise InputTypeError: if the filetype is unknown or not allowed. Note
        that the filetype is validated before C{filename} is looked at, so
        this is raised even when no file was given.

    """
    if allowed_types is None:
        allowed_types = get_input_type_names()

    if filetype is None and default_type is not None:
        filetype = default_type

    # Catch a request for filetype help
    if filetype is not None and filetype.lower() == "help":
        # Output possible file types. A single parenthesised argument
        # prints identically as a print statement and as a function call
        print("Allowed input types: %s" % ", ".join(allowed_types))
        sys.exit(0)

    # Check that the filetype is valid and get the input type class if it is
    input_type = get_input_type(filetype)
    if input_type is None:
        raise InputTypeError(
            "Unknown filetype '%s'. Allowed types are: %s" %
            (filetype, ", ".join(allowed_types)))
    if input_type_name(input_type) not in allowed_types:
        raise InputTypeError(
            "Cannot accept input of type '%s'. Allowed "
            "types are: %s" % (filetype, ", ".join(allowed_types)))

    if options is not None and options.lower() == "help":
        # Output help text. Name the type by its filetype string: formatting
        # the class itself would show its repr ("<class ...>")
        from jazzparser.utils.options import options_help_text
        print(options_help_text(
            input_type.FILE_INPUT_OPTIONS,
            intro="Available options for input type %s" % filetype))
        sys.exit(0)

    if filename is None:
        return None

    # First get a dict of the options
    file_options = ModuleOption.process_option_string(options)
    # Process the options as appropriate for this type
    file_options = input_type.process_option_dict(file_options)

    # Instantiate the input from the file as appropriate for the input type
    return input_type.from_file(filename, file_options)

1148

Source Code for Package jazzparser.data.input