1 """First, very simple baseline tagger model.
2
3 Tagging model 'baseline1' is a very simple tagging model that tags
4 using just the unigram probabilities on the basis of observed chord
5 types (no intervals).
6
7 It is the model presented as 'model 3' in the Stupid Baselines talk
8 (the first two are just thought experiments and not worth implementing).
9
10 """
11 """
12 ============================== License ========================================
13 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
14
15 This file is part of The Jazz Parser.
16
17 The Jazz Parser is free software: you can redistribute it and/or modify
18 it under the terms of the GNU General Public License as published by
19 the Free Software Foundation, either version 3 of the License, or
20 (at your option) any later version.
21
22 The Jazz Parser is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 GNU General Public License for more details.
26
27 You should have received a copy of the GNU General Public License
28 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
29
30 ============================ End license ======================================
31
32 """
33 __author__ = "Mark Granroth-Wilding <mark@granroth-wilding.co.uk>"
34
35 import pickle
36 from jazzparser.taggers.models import ModelTagger, ModelLoadError, TaggerModel
37 from jazzparser.taggers import process_chord_input
38 from jazzparser.utils.probabilities import batch_sizes
39 from jazzparser.data import Chord
40
44
46 """
47 A class to encapsulate the model data for the tagger.
48 """
49 MODEL_TYPE = "baseline1"
50
def __init__(self, model_name, *args, **kwargs):
    """
    Set up an empty model.

    The model name and any extra positional/keyword arguments are passed
    straight through to the superclass constructor. All of the count
    tables start out empty.

    """
    super(Baseline1Model, self).__init__(model_name, *args, **kwargs)
    # Joint counts: maps a category to a dictionary which in turn maps
    # a chord to the number of times the two were seen together
    self.category_chord_count = dict()
    # Number of observations recorded for each category
    self.category_count = dict()
    # Number of observations recorded for each chord
    self.chord_count = dict()
56
58 """
59 Adds a count of the joint observation of the category and the
60 chord and of the category and the chord themselves.
61 """
62
63 cat_chords = self.category_chord_count.setdefault(category, {})
64 if chord in cat_chords:
65 cat_chords[chord] += 1
66 else:
67 cat_chords[chord] = 1
68
69 if category in self.category_count:
70 self.category_count[category] += 1
71 else:
72 self.category_count[category] = 1
73
74 if chord in self.chord_count:
75 self.chord_count[chord] += 1
76 else:
77 self.chord_count[chord] = 1
78
79 - def train(self, sequences, grammar=None, logger=None):
98
100 obs = observation_from_chord(chord)
101 chord_count = self.chord_count.get(obs, 0)
102 if chord_count == 0:
103
104 if cat in self.category_count:
105 return 1.0 / len(self.category_count)
106 else:
107
108 return 0.0
109 count = self.category_chord_count.get(cat, {}).get(obs, 0)
110 return float(count) / chord_count
111
113 """
114 The first of the simple baseline tagger models. This models unigram
115 probabilities of tags, given only the chord types.
116
117 """
118 MODEL_CLASS = Baseline1Model
119 INPUT_TYPES = ['db', 'chords']
120
def __init__(self, grammar, input, options=None, *args, **kwargs):
    """
    Tags the whole of the input as soon as the tagger is created.

    For every position in the input, each category the model has counts
    for is offered to the grammar. Categories for which the grammar
    returns no sign are dropped. The remaining (sign, tag, probability)
    triples are stored in order of decreasing probability, together
    with the (start, end) index ranges of the batches they are split
    into.

    grammar, input and any extra arguments are passed through to the
    superclass constructor unchanged. options may be omitted (or None),
    in which case a fresh empty dictionary is used.

    """
    # A literal {} default would be a single dictionary shared between
    # every call that omits the options, so create a new one each time
    if options is None:
        options = {}
    super(Baseline1Tagger, self).__init__(grammar, input, options, *args, **kwargs)
    # NOTE(review): presumably this prepares self.input, self.times,
    # self.durations and self.input_length, which are read below --
    # confirm against process_chord_input
    process_chord_input(self)

    # One entry per input position in each of these lists
    self._tagged_data = []
    self._batch_ranges = []

    for index in range(self.input_length):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word = self.input[index]
        word_signs = []
        # Try every category that the model has counts for
        for tag in self.model.category_count:
            sign = self.grammar.get_sign_for_word_by_tag(word, tag, extra_features=features)
            if sign is not None:
                probability = self.model.get_prob_cat_given_chord(tag, word)
                word_signs.append((sign, tag, probability))
        # Highest probability first. This is a stable ascending sort
        # followed by a reversal (not reverse=True) so that signs with
        # equal probability keep the order they have always had
        word_signs.sort(key=lambda entry: entry[2])
        word_signs.reverse()
        self._tagged_data.append(word_signs)

        # Split the sorted probabilities into batches and turn the batch
        # sizes into (start, end) slices of word_signs
        batches = batch_sizes([prob for __, __, prob in word_signs], self.batch_ratio)
        so_far = 0
        batch_ranges = []
        for batch in batches:
            batch_ranges.append((so_far, so_far + batch))
            so_far += batch
        self._batch_ranges.append(batch_ranges)
153
155 if self.best_only:
156
157 if offset == 0 and len(self._tagged_data[index]) > 0:
158 return [self._tagged_data[index][0]]
159 else:
160 return None
161 ranges = self._batch_ranges[index]
162 if offset >= len(ranges):
163
164 return None
165 start,end = ranges[offset]
166 return self._tagged_data[index][start:end]
167
169 return self.input[index]
170