jazzparser.taggers.baseline1.tagger

1 """First, very simple baseline tagger model. 2 3 Tagging model 'baseline1' is a very simple tagging model that tags 4 using just the unigram probabilities on the basis of observed chord 5 types (no intervals). 6 7 It is the model presented as 'model 3' in the Stupid Baselines talk 8 (the first two are just thought experiments and not worth implementing). 9 10 """ 11 """ 12 ============================== License ======================================== 13 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 14 15 This file is part of The Jazz Parser. 16 17 The Jazz Parser is free software: you can redistribute it and/or modify 18 it under the terms of the GNU General Public License as published by 19 the Free Software Foundation, either version 3 of the License, or 20 (at your option) any later version. 21 22 The Jazz Parser is distributed in the hope that it will be useful, 23 but WITHOUT ANY WARRANTY; without even the implied warranty of 24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 GNU General Public License for more details. 26 27 You should have received a copy of the GNU General Public License 28 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 29 30 ============================ End license ====================================== 31 32 """ 33 __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>" 34 35 import pickle 36 from jazzparser.taggers.models import ModelTagger, ModelLoadError, TaggerModel 37 from jazzparser.taggers import process_chord_input 38 from jazzparser.utils.probabilities import batch_sizes 39 from jazzparser.data import Chord 40

def observation_from_chord(crd):
    """
    Turn a chord name into the observation used by the baseline1 model.

    The name is parsed with Chord.from_name and only the ``type``
    attribute of the resulting chord is returned.

    """
    return Chord.from_name(crd).type

44

class Baseline1Model(TaggerModel):
    """
    Container for the counts that make up the baseline1 tagging model.

    Three tables are maintained: how often each category was seen, how
    often each chord observation was seen, and how often each category
    was seen together with each chord observation.

    """
    MODEL_TYPE = "baseline1"

    def __init__(self, model_name, *args, **kwargs):
        """
        Set up an untrained model with empty count tables. All arguments
        are passed straight through to the superclass constructor.

        """
        super(Baseline1Model, self).__init__(model_name, *args, **kwargs)
        # category -> {chord observation -> joint count}
        self.category_chord_count = {}
        # category -> number of occurrences
        self.category_count = {}
        # chord observation -> number of occurrences
        self.chord_count = {}

    def _add_category_chord_count(self, category, chord):
        """
        Record one joint observation of a category with a chord,
        incrementing the joint count as well as the individual counts
        of the category and of the chord.

        """
        # Joint count, creating the per-category table on first sight
        joint_counts = self.category_chord_count.setdefault(category, {})
        joint_counts[chord] = joint_counts.get(chord, 0) + 1
        # Marginal count of the category
        self.category_count[category] = self.category_count.get(category, 0) + 1
        # Marginal count of the chord
        self.chord_count[chord] = self.chord_count.get(chord, 0) + 1

    def train(self, sequences, grammar=None, logger=None):
        """
        Collect counts from every chord of every sequence in the given
        corpus, then store a short summary of the training data in
        ``model_description``.

        ``grammar`` and ``logger`` are accepted but not used here.

        """
        num_sequences = 0
        num_samples = 0
        for sequence in sequences:
            num_sequences += 1
            for crd in sequence.iterator():
                num_samples += 1
                category = crd.category
                observation = observation_from_chord(str(crd))
                self._add_category_chord_count(category, observation)
        # Add a bit of training info to the descriptive text
        self.model_description = """\
Unigram probability model, observing only chord types

Training sequences: %(seqs)d
Training samples: %(samples)d""" % {
            'seqs' : num_sequences,
            'samples' : num_samples
        }

    def get_prob_cat_given_chord(self, cat, chord):
        """
        Probability of category ``cat`` given the chord name ``chord``,
        estimated as the joint count divided by the count of the chord's
        observation.

        If the observation was never counted, every category that has
        been counted gets an equal share of the probability and any
        other category gets 0.0.

        """
        observation = observation_from_chord(chord)
        observation_total = self.chord_count.get(observation, 0)
        if observation_total != 0:
            joint = self.category_chord_count.get(cat, {}).get(observation, 0)
            return float(joint) / observation_total
        # Unseen observation from here on
        if cat not in self.category_count:
            # Haven't seen the category before either: don't smooth
            return 0.0
        # Spread the mass uniformly over the seen categories
        return 1.0 / len(self.category_count)

111

class Baseline1Tagger(ModelTagger):
    """
    The first of the simple baseline tagger models. This models unigram
    probabilities of tags, given only the chord types.

    All tagging work is done in the constructor: for each input position
    the candidate signs are ranked by probability and split into batches,
    which get_signs_for_word then hands out by offset.

    """
    MODEL_CLASS = Baseline1Model
    INPUT_TYPES = ['db', 'chords']

    def __init__(self, grammar, input, options=None, *args, **kwargs):
        """
        Tag the whole input sequence up front.

        ``options`` defaults to a fresh empty dictionary for each
        instance. Remaining arguments are passed through to the
        superclass constructor.

        """
        # A literal {} default would be one dict shared by every
        # instantiation, so build a new one per call instead.
        if options is None:
            options = {}
        super(Baseline1Tagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)
        # NOTE(review): input_length, durations, times, model, grammar and
        # batch_ratio are presumably provided by the superclass and
        # process_chord_input -- they are not set in this class.

        #### Tag the input sequence ####
        # Per position: list of (sign, tag, probability), most probable first
        self._tagged_data = []
        # Per position: list of (start, end) slices into the list above
        self._batch_ranges = []
        for index in range(self.input_length):
            word = self.input[index]
            features = {
                'duration' : self.durations[index],
                'time' : self.times[index],
            }
            word_signs = []
            # Assign a probability to each tag the model has seen, keeping
            # only those tags for which the grammar yields a sign
            for tag in self.model.category_count:
                sign = self.grammar.get_sign_for_word_by_tag(word, tag, extra_features=features)
                if sign is not None:
                    probability = self.model.get_prob_cat_given_chord(tag, word)
                    word_signs.append((sign, tag, probability))
            # Stable ascending sort followed by a reversal. This is
            # deliberately not sort(reverse=True), which would order
            # equal-probability entries differently.
            word_signs.sort(key=lambda entry: entry[2])
            word_signs.reverse()
            self._tagged_data.append(word_signs)

            # Work out the sizes of the batches to return these in
            batches = batch_sizes([prob for __, __, prob in word_signs], self.batch_ratio)
            # Convert the sizes into cumulative (start, end) ranges, which
            # are easier to use for slicing out the signs
            so_far = 0
            batch_ranges = []
            for batch in batches:
                batch_ranges.append((so_far, so_far + batch))
                so_far += batch
            self._batch_ranges.append(batch_ranges)

    def get_signs_for_word(self, index, offset=0):
        """
        Return the batch of (sign, tag, probability) tuples numbered
        ``offset`` for the word at position ``index``, or None when there
        is no such batch.

        When ``best_only`` is set, only offset 0 yields anything: a
        single-element list holding the top-ranked entry, if there is one.

        """
        if self.best_only:
            # Only ever return one sign
            if offset == 0 and len(self._tagged_data[index]) > 0:
                return [self._tagged_data[index][0]]
            else:
                return None
        ranges = self._batch_ranges[index]
        if offset >= len(ranges):
            # No more batches left
            return None
        start, end = ranges[offset]
        return self._tagged_data[index][start:end]

    def get_word(self, index):
        """
        Return the input item at position ``index``.

        """
        return self.input[index]

170

Source Code for Module jazzparser.taggers.baseline1.tagger