jazzparser.taggers.candc.utils

1 """Some basic utilities used by the C&C tagger interface. 2 3 """ 4 """ 5 ============================== License ======================================== 6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 7 8 This file is part of The Jazz Parser. 9 10 The Jazz Parser is free software: you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation, either version 3 of the License, or 13 (at your option) any later version. 14 15 The Jazz Parser is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 22 23 ============================ End license ====================================== 24 25 """ 26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 27

def untag_sequence_data(sequence):
    """
    Strips the tags from a line of C&C-style tagged data.

    The line is split on whitespace and every token is cut at its first
    "|": only the text before it (the chord) is kept. A token containing
    no "|" is kept whole. The chords are rejoined with single spaces.

    """
    chords = []
    for token in sequence.split():
        chord, _sep, _tags = token.partition("|")
        chords.append(chord)
    return " ".join(chords)

35

def tags_from_sequence_data(sequence):
    """
    Extracts the final tag of every token in a line of tagged data.

    The line is split on whitespace; for each token the text following
    its last "|" is returned. A token with no "|" is returned unchanged.

    @return: list of tag strings, one per token, in order.

    """
    tags = []
    for token in sequence.split():
        # rpartition puts everything after the last separator at the end
        tags.append(token.rpartition("|")[-1])
    return tags

38

def training_sequence_to_candc(sequence):
    """
    Converts one line of our training data into the form taken directly
    by the C&C training tool.

    Our format prefixes each observation with "X|", where X is the chord
    itself. That prefix is dropped here: for every whitespace-separated
    token, everything up to and including the first "|" is removed. A
    token with no "|" therefore becomes an empty string.

    """
    observations = []
    for token in sequence.split():
        _chord, _sep, remainder = token.partition("|")
        observations.append(remainder)
    return " ".join(observations)

51

def training_data_to_candc(lines):
    """
    Applies training_sequence_to_candc to every sequence of a data set.

    @param lines: iterable of sequence strings in our training format.
    @return: list of the converted strings, in the same order.

    """
    converted = []
    for seq in lines:
        converted.append(training_sequence_to_candc(seq))
    return converted

58 59

def _sequence_to_candc_format(formatter, sequence):
    """
    Renders a chord sequence (internal model) as one line of C&C
    training observations.

    Shared inner routine for the different C&C data formats: each format
    supplies its own C{formatter}, which is called with the two members
    of every chord pair and must return that observation as a string.

    @param formatter: callable taking (chord1, chord2), returning a string.
    @param sequence: object whose iterator() yields the chords.
    @return: the observations joined by spaces, ending in a newline.

    """
    from jazzparser.utils.base import group_pairs

    # A trailing None is appended before pairing; presumably group_pairs
    # yields consecutive pairs, so the last chord is paired with None
    # (the formatters in this module all handle chord2 being None).
    chords = list(sequence.iterator())
    chords.append(None)

    observations = []
    for pair in group_pairs(chords):
        observations.append(formatter(*pair))
    return "%s\n" % " ".join(observations)

74

75 -def _type_format(type, mapping):

76 if mapping is None: 77 return type 78 else: 79 return mapping[type]

80

def sequence_to_candc_pos(sequence, type_map=None):
    """
    Generates POS training data for a chord sequence.

    Each observation has the form "<interval>-<type>|C", where the
    interval is the difference between the next chord's root and this
    one's, modulo 12, and the type is passed through C{type_map} when
    one is given.

    """
    def _observation(current, following):
        # No following chord: the interval part is left empty
        if following is not None:
            step = "%d" % ((following.root - current.root) % 12)
        else:
            step = ""
        return "%s-%s|C" % (step, _type_format(current.type, type_map))

    return _sequence_to_candc_format(_observation, sequence)

def sequence_to_candc_chord_super(sequence, type_map=None):
    """
    Generates data in our own combined format, which carries the chord
    itself, the observation and the super-tagger training tag.

    Each observation has the form "<chord>|<interval>-<type>|C|<category>".
    The chord is the chord's C{jazz_parser_input}; the interval is the
    difference between the next chord's root and this one's, modulo 12;
    the type is passed through C{type_map} when one is given; a missing
    (false) category is written as "?".

    """
    def _observation(current, following):
        # No following chord: the interval part is left empty
        if following is not None:
            step = "%d" % ((following.root - current.root) % 12)
        else:
            step = ""
        chord_type = _type_format(current.type, type_map)
        tag = current.category or "?"
        return "%s|%s-%s|C|%s" % (current.jazz_parser_input, step,
                                  chord_type, tag)

    return _sequence_to_candc_format(_observation, sequence)

def sequence_to_candc_super(sequence, type_map=None):
    """
    Generates super-tagger training data for a chord sequence.

    Each observation has the form "<interval>-<type>|C|<category>". The
    interval is the difference between the next chord's root and this
    one's, modulo 12; the type is passed through C{type_map} when one is
    given; a missing (false) category is written as "?".

    """
    def _observation(current, following):
        # No following chord: the interval part is left empty
        if following is not None:
            step = "%d" % ((following.root - current.root) % 12)
        else:
            step = ""
        chord_type = _type_format(current.type, type_map)
        tag = current.category or "?"
        return "%s-%s|C|%s" % (step, chord_type, tag)

    return _sequence_to_candc_format(_observation, sequence)

def sequence_index_to_candc_chord_super(si, *args, **kwargs):
    """
    Given a SequenceIndex object containing sequence data, produces
    C&C training data as a single string.

    @param si: object whose C{sequences} attribute holds the sequences.
    @param args: extra positional arguments, forwarded unchanged to
        sequence_to_candc_chord_super for every sequence.
    @param kwargs: extra keyword arguments (e.g. C{type_map}), forwarded
        in the same way.
    @return: the per-sequence outputs concatenated in order.

    """
    # The keyword arguments must be expanded under the name they are
    # declared with in the signature (kwargs); expanding an undeclared
    # name raises NameError as soon as the index holds a sequence.
    return "".join(
        [sequence_to_candc_chord_super(s, *args, **kwargs)
         for s in si.sequences])

121

def sequence_list_to_candc_chord_super(sequences, *args, **kwarg):
    """
    Produces C&C training data, as a single string, from a list of
    sequences.

    Every sequence is converted by sequence_to_candc_chord_super, which
    also receives any extra positional and keyword arguments given here.
    The results are concatenated in order.

    """
    chunks = []
    for seq in sequences:
        chunks.append(sequence_to_candc_chord_super(seq, *args, **kwarg))
    return "".join(chunks)

129

def sequence_index_to_training_file(si, type_map=None):
    """
    Writes the data of a SequenceIndex object to a temporary file in our
    hybrid C&C training data format and returns the file, still open.

    According to the original documentation, the train_model function
    converts this (rather trivially) into the format C&C requires.

    @param si: the SequenceIndex to convert.
    @param type_map: optional chord type mapping, passed on to the
        converter.
    @return: the open NamedTemporaryFile, flushed after writing.

    """
    from tempfile import NamedTemporaryFile

    training_data = sequence_index_to_candc_chord_super(si, type_map=type_map)
    # NOTE(review): NamedTemporaryFile opens in binary mode by default, so
    # writing a str like this relies on Python 2 - confirm target version.
    temp_file = NamedTemporaryFile()
    temp_file.write(training_data)
    # Flush so the data is on disk for anything opening the file by name
    temp_file.flush()
    return temp_file

143

def sequence_list_to_training_file(seqs, type_map=None):
    """
    Writes a list of sequences to a temporary file in our hybrid C&C
    training data format and returns the file, still open.

    According to the original documentation, the train_model function
    converts this (rather trivially) into the format C&C requires.

    @param seqs: the list of sequences to convert.
    @param type_map: optional chord type mapping, passed on to the
        converter.
    @return: the open NamedTemporaryFile, flushed after writing.

    """
    from tempfile import NamedTemporaryFile

    training_data = sequence_list_to_candc_chord_super(seqs, type_map=type_map)
    # NOTE(review): NamedTemporaryFile opens in binary mode by default, so
    # writing a str like this relies on Python 2 - confirm target version.
    temp_file = NamedTemporaryFile()
    temp_file.write(training_data)
    # Flush so the data is on disk for anything opening the file by name
    temp_file.flush()
    return temp_file

157

def generate_tag_list(filename, grammar=None):
    """
    Generates a list of possible tags to be stored along with a C&C model.
    It contains all tags that are in the grammar.

    The tags are the keys of C{grammar.families}, written one per line
    with no trailing newline, which is the layout L{read_tag_list}
    splits back apart.

    @param filename: path of the file to write (overwritten if present).
    @param grammar: grammar to take the tags from; the default grammar
        is loaded when this is None.

    """
    if grammar is None:
        # Only pull in the grammar package when the default is needed
        from jazzparser.grammar import get_grammar
        grammar = get_grammar()
    data = "\n".join(grammar.families.keys())
    # The context manager closes the file even if the write fails
    with open(filename, 'w') as tag_file:
        tag_file.write(data)

173

def read_tag_list(filename):
    """
    Loads a tag list written by L{generate_tag_list}.

    The whole file is read and split on newline characters, so an empty
    file gives a list holding one empty string and a trailing newline
    gives a final empty entry.

    @param filename: path of the tag list file.
    @return: list of the tags, one per line of the file.

    """
    with open(filename, 'r') as tag_file:
        return tag_file.read().split("\n")

182

Source Code for Module jazzparser.taggers.candc.utils