1 """Some basic utilities used by the C&C tagger interface.
2
3 """
4 """
5 ============================== License ========================================
6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
7
8 This file is part of The Jazz Parser.
9
10 The Jazz Parser is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 The Jazz Parser is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
22
23 ============================ End license ======================================
24
25 """
26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
27
29 """
30 Given a line read in from a file containing C&C-style tagged
31 data, returns a version of the line with the tags stripped off,
32 leaving just the chord sequence.
33 """
34 return " ".join([c.partition("|")[0] for c in sequence.split()])
35
38
40 """
41 Given a line of data in the format we use for training data,
42 converts it to a format suitable for direct input to the C&C
43 training tool.
44
45 Precisely, it removes the first X| from the start of each
46 observation, which in the data format we use is the chord itself.
47
48 """
49
50 return " ".join([c.partition("|")[2] for c in sequence.split()])
51
53 """
54 Performs the same as training_sequence_to_candc, but on a whole
55 data set (list of sequences).
56 """
57 return [training_sequence_to_candc(seq) for seq in lines]
58
59
74
80
89 return _sequence_to_candc_format(_formatter, sequence)
90
101 return _sequence_to_candc_format(_formatter, sequence)
102
112 return _sequence_to_candc_format(_formatter, sequence)
113
115 """
116 Given a SequenceIndex object containing sequence data, produces
117 C&C training data as a single string.
118
119 """
120 return "".join([sequence_to_candc_chord_super(s, *args, **kwarg) for s in si.sequences])
121
123 """
124 Given a list of sequences, produces
125 C&C training data as a single string.
126
127 """
128 return "".join([sequence_to_candc_chord_super(s, *args, **kwarg) for s in sequences])
129
131 """
132 Given a SequenceIndex object, returns an open temporary file
133 containing all the data in our hybrid C&C training data format.
134 This is converted (rather trivially) by the train_model function
135 into C&C's required format.
136
137 """
138 from tempfile import NamedTemporaryFile
139 file = NamedTemporaryFile()
140 file.write(sequence_index_to_candc_chord_super(si, type_map=type_map))
141 file.flush()
142 return file
143
145 """
146 Given a list of sequences, returns an open temporary file
147 containing all the data in our hybrid C&C training data format.
148 This is converted (rather trivially) by the train_model function
149 into C&C's required format.
150
151 """
152 from tempfile import NamedTemporaryFile
153 file = NamedTemporaryFile()
154 file.write(sequence_list_to_candc_chord_super(seqs, type_map=type_map))
155 file.flush()
156 return file
157
173
175 """
176 Reads in a tag list generated by L{generate_tag_list}.
177
178 """
179 with open(filename, 'r') as f:
180 data = f.read()
181 return data.split("\n")
182