1 """Utilities for bin scripts.
2
3 These are used by scripts outside the Jazz Parser packages, in
4 <PROJECT_ROOT>/bin.
5
6 """
7 """
8 ============================== License ========================================
9 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
10
11 This file is part of The Jazz Parser.
12
13 The Jazz Parser is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 The Jazz Parser is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
25
26 ============================ End license ======================================
27
28 """
29 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
30
33 """
34 Various tasks common to the initial part of the evaluation routine
35 scripts (C{models/eval.py}).
36
37 @todo: This is not used any more. Remove it, after checking it's definitely
38 not used.
39
40 @param usage: the optparse usage string
41 @param description: the optparse description string
42 @type optparse_options: list of tuples
43 @param optparse_options: (args,kwargs) pairs to add additional
44 options to the optparse parser.
45 @type check_args: function
46 @param check_args: function to take the command-line arguments and
47 check them. This will be called early in the script. Must
48 return a tuple of (1) the model name (or model basename) that
49 will be used in the partition model names and (2) the input
50 filename to get sequences from.
51 @type optparse_groups: list of pairs
52 @param optparse_groups: specifications for option groups to add to the
53 optparse option parser. The first of each pair is a tuple of
54 args to C{OptionGroup}'s init (excluding the first).
55 The second is a list of options
56 each formatted as C{optparse_options}.
57
58 @rtype: tuple
59 @return: (1) list of (sequences,model_name,partition_index) tuples
60 for each partition; (2) list of lists containing the sequence
61 ids for each partition; (3) optparse options; (4) optparse
62 arguments.
63
64 """
# Function-local imports. logging is required for the DEBUG level constant
# that is passed to init_logging when --debug is given.
import sys
import logging
from optparse import OptionParser, OptionGroup
from jazzparser.utils.config import parse_args_with_config
from jazzparser.utils.loggers import init_logging
from jazzparser.data.db_mirrors import SequenceIndex
from jazzparser.utils.data import partition

# Options shared by every script: selection/partitioning of the input data
parser = OptionParser(usage=usage, description=description)
group = OptionGroup(parser, "Input", "Input data and partitioning for evaluation")
group.add_option("-s", "--sequence", dest="sequence", action="store", help="limit the evaluation to just one sequence, with the given index in the input file")
group.add_option("--partition", dest="partition", action="store", help="restrict to only one partition of the data. Specify as i/n, where i is the partition number and n the total number of partitions.")
group.add_option("-p", "--partitions", dest="partitions", type="int", action="store", help="test on all n partitions of the data, using a different model for each. Will look for a model <NAME>i, where <NAME> is the given model name and i the partition number.")
parser.add_option_group(group)

parser.add_option("--debug", dest="debug", action="store_true", help="show debugging output")

# Extra top-level options requested by the caller, as (args, kwargs) pairs
for args, kwargs in optparse_options:
    parser.add_option(*args, **kwargs)

# Extra option groups requested by the caller
for group_args, group_options in optparse_groups:
    # If a group with this title already exists, add to it instead of
    # creating a second group with the same title
    same_titles = [g for g in parser.option_groups if g.title == group_args[0]]
    if same_titles:
        group = same_titles[0]
    else:
        group = OptionGroup(parser, *group_args)
        parser.add_option_group(group)

    for args, kwargs in group_options:
        group.add_option(*args, **kwargs)
options, arguments = parse_args_with_config(parser)

# check_args is the only source of the model name and input filename
if check_args is None:
    raise ValueError("could not check arguments and get model "
                     "name. check_args must not be None")
model_name, input_filename = check_args(arguments)

if options.debug:
    # Verbose logging requested on the command line
    init_logging(logging.DEBUG)
else:
    init_logging()

# Load the sequence data from the input file named by check_args
seqs = SequenceIndex.from_file(input_filename)
112
def _get_seq_by_index(index):
    """
    Fetch the sequence at position C{index} from C{seqs}.

    If the lookup yields C{None}, the number of available sequences is
    reported on stderr and the process exits with status 1.

    """
    found = seqs.sequence_by_index(index)
    if found is not None:
        return found
    print >>sys.stderr, "There are only %d sequences" % len(seqs)
    sys.exit(1)
119
120
121 if options.partitions is not None:
122
123 total_parts = options.partitions
124 print >>sys.stderr, "Cross validation: dividing test data into %d partitions" % total_parts
125 partitions = [(part,"%s%d" % (model_name,i), i) for i,part in enumerate(partition(seqs.sequences, total_parts))]
126 part_ids = partition(seqs.ids, total_parts)
127 elif options.partition is not None:
128
129
130 parti,total_parts = options.partition.split("/")
131 parti,total_parts = int(parti), int(total_parts)
132 print >>sys.stderr, "Restricting sequences to %d-way partition %d" % (total_parts,parti)
133
134 part_ids = partition(seqs.ids, total_parts)[parti]
135 partitions = [ [(part,"%s%d" % (model_name,i), i) for i,part in enumerate(partition(seqs.sequences, total_parts))][parti] ]
136 elif options.sequence is not None:
137
138 seq = _get_seq_by_index(int(options.sequence))
139 partitions = [( [seq], model_name, 0 )]
140 part_ids = [seq.id]
141 else:
142
143 partitions = [(seqs.sequences, model_name,0)]
144 part_ids = [None]
145
146 return partitions,part_ids,options,arguments
147