1 """File I/O and internal representations for other people's corpora.
2
3 This package contains modules for reading in (and potentially writing
4 out) data from other people's corpora and classes for representing
5 and manipulating the data.
6
7 My own data is stored using the classes in the Django database
8 definition or those in L{jazzparser.data.db_mirrors}.
9
10 """
11 """
12 ============================== License ========================================
13 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
14
15 This file is part of The Jazz Parser.
16
17 The Jazz Parser is free software: you can redistribute it and/or modify
18 it under the terms of the GNU General Public License as published by
19 the Free Software Foundation, either version 3 of the License, or
20 (at your option) any later version.
21
22 The Jazz Parser is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 GNU General Public License for more details.
26
27 You should have received a copy of the GNU General Public License
28 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
29
30 ============================ End license ======================================
31
32 """
33 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
34
# Settings for the Kostka-Payne corpus: its location (path components joined
# onto the corpora directory) and the file extensions tried by default when
# a requested file is not found under the exact name given.
__kp_info = dict(
    location=('kostka-payne',),
    extensions=['q.k'],
)

# The full name and the short alias both refer to the same settings object.
CORPORA = dict.fromkeys(('kostka-payne', 'kp'), __kp_info)
44 """Available corpus datasets, indexed by names by which they can be loaded."""
45
46
48 """
49 Load a file from a named corpus that is stored within the project.
50 If the file is not found, we'll attempt to append the default
51 file extensions defined for this corpus until we find one that
52 exists.
53
54 @type corpus_name: str
55 @param corpus_name: name of the corpus
56 @type file_path: list of strs or str
57 @param file_path: path to file to be loaded, split into a list, or
58 as a string in the style of the local system's paths.
59
60 """
61 from jazzparser.settings import CORPORA_DIR
62 import os
63
64 if type(file_path) == str:
65 file_path = file_path.split(os.sep)
66
67 if corpus_name not in CORPORA:
68 raise CorpusError, "unknown corpus '%s'" % corpus_name
69 corpus_info = CORPORA[corpus_name]
70
71 base_path = os.path.join(CORPORA_DIR, *(corpus_info['location'] + tuple(file_path)))
72
73 extensions = [''] + [".%s" % ext for ext in corpus_info['extensions']]
74 for ext in extensions:
75 filename = "%s%s" % (base_path,ext)
76 if os.path.exists(filename):
77 return filename
78
79 raise IOError, "file '%s' not found in %s corpus" % \
80 (os.path.join(*file_path), corpus_name)
81
83 """
84 Produces a list of the files in the corpus with the given name.
85 Each file is represented by its path, split into a list.
86
87 """
88 from jazzparser.settings import CORPORA_DIR
89 import os
90
91 if corpus_name not in CORPORA:
92 raise CorpusError, "unknown corpus '%s'" % corpus_name
93 corpus_info = CORPORA[corpus_name]
94
95 path = os.path.join(CORPORA_DIR, *(corpus_info['location']))
96 corpus_files = []
97
98 for root, dirs, files in os.walk(path):
99
100 for name in dirs:
101 if name.startswith("."):
102 dirs.remove(name)
103
104 root = [bit for bit in root.lstrip(path).split(os.sep) if bit != '']
105
106 corpus_files.extend([root+[name] for name in files if not name.startswith(".")])
107 return corpus_files
108
109
112