Package jazzparser :: Package data :: Package corpora
[hide private]
[frames] | no frames]

Source Code for Package jazzparser.data.corpora

  1  """File I/O and internal representations for other people's corpora. 
  2   
  3  This package contains modules for reading in (and potentially writing  
  4  out) data from other people's corpora and classes for representing  
  5  and manipulating the data. 
  6   
  7  My own data is stored using the classes in the Django database  
  8  definition or those in L{jazzparser.data.db_mirrors}. 
  9   
 10  """ 
 11  """ 
 12  ============================== License ======================================== 
 13   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 14    
 15   This file is part of The Jazz Parser. 
 16    
 17   The Jazz Parser is free software: you can redistribute it and/or modify 
 18   it under the terms of the GNU General Public License as published by 
 19   the Free Software Foundation, either version 3 of the License, or 
 20   (at your option) any later version. 
 21    
 22   The Jazz Parser is distributed in the hope that it will be useful, 
 23   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 24   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 25   GNU General Public License for more details. 
 26    
 27   You should have received a copy of the GNU General Public License 
 28   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 29   
 30  ============================ End license ====================================== 
 31   
 32  """ 
 33  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"  
 34   
 35  __kp_info = { 
 36      'location' : ('kostka-payne',), 
 37      'extensions' : ['q.k'], 
 38  } 
 39   
 40  CORPORA = { 
 41      'kostka-payne' : __kp_info, 
 42      'kp' : __kp_info,               # Alias for easier access 
 43  } 
 44  """Available corpus datasets, indexed by names by which they can be loaded.""" 
 45   
 46   
def get_corpus_file(corpus_name, file_path):
    """
    Find the full path of a file from a named corpus that is stored
    within the project.

    If no file exists at the given path, we'll attempt to append the
    default file extensions defined for this corpus until we find one
    that exists. The file itself is not opened: only its path is
    returned.

    @type corpus_name: str
    @param corpus_name: name of the corpus (one of the keys of L{CORPORA})
    @type file_path: list of strs or str
    @param file_path: path to the file, relative to the corpus' directory,
        split into a list, or as a string in the style of the local
        system's paths.
    @rtype: str
    @return: path of the first candidate (the bare path, then the path
        with each of the corpus' extensions appended) that exists
    @raise CorpusError: if C{corpus_name} is not a known corpus
    @raise IOError: if none of the candidate paths exists

    """
    from jazzparser.settings import CORPORA_DIR
    import os

    # isinstance (rather than an exact type comparison) so that subclasses
    # of str are split into path components as well
    if isinstance(file_path, str):
        file_path = file_path.split(os.sep)

    if corpus_name not in CORPORA:
        # Call form of raise: valid on Python 2 and Python 3 alike
        raise CorpusError("unknown corpus '%s'" % corpus_name)
    corpus_info = CORPORA[corpus_name]

    base_path = os.path.join(
        CORPORA_DIR, *(corpus_info['location'] + tuple(file_path)))
    # Try the path exactly as given first (empty extension), then with
    # each of the corpus' default extensions appended
    extensions = [''] + [".%s" % ext for ext in corpus_info['extensions']]
    for ext in extensions:
        filename = "%s%s" % (base_path, ext)
        if os.path.exists(filename):
            return filename
    # No file found at all
    raise IOError("file '%s' not found in %s corpus" %
                  (os.path.join(*file_path), corpus_name))
81
def list_corpus_files(corpus_name):
    """
    Produces a list of the files in the corpus with the given name.
    Each file is represented by its path relative to the corpus'
    directory, split into a list of path components.

    Hidden files and hidden directories (names starting with ".") are
    left out, and hidden directories are not descended into.

    @type corpus_name: str
    @param corpus_name: name of the corpus (one of the keys of L{CORPORA})
    @rtype: list of lists of strs
    @return: one list of path components per file found
    @raise CorpusError: if C{corpus_name} is not a known corpus

    """
    from jazzparser.settings import CORPORA_DIR
    import os

    if corpus_name not in CORPORA:
        # Call form of raise: valid on Python 2 and Python 3 alike
        raise CorpusError("unknown corpus '%s'" % corpus_name)
    corpus_info = CORPORA[corpus_name]

    path = os.path.join(CORPORA_DIR, *(corpus_info['location']))
    corpus_files = []
    # Check all the files in the directory
    for root, dirs, files in os.walk(path):
        # Don't recurse to hidden dirs. The list must be pruned in place
        # (slice assignment) for os.walk to honour it; removing items
        # while looping over the same list would skip entries.
        dirs[:] = [name for name in dirs if not name.startswith(".")]
        # Get rid of the base path from the root. relpath removes the
        # actual prefix, whereas str.lstrip would treat the base path as
        # a set of characters and could eat into subdirectory names.
        rel_root = os.path.relpath(root, path)
        root_bits = [bit for bit in rel_root.split(os.sep)
                     if bit not in ('', os.curdir)]
        # Exclude hidden files
        corpus_files.extend(
            root_bits + [name] for name in files if not name.startswith("."))
    return corpus_files
108 109
class CorpusError(Exception):
    """
    Raised when a corpus cannot be accessed, e.g. because the requested
    corpus name is unknown.

    """
112