jazzparser.utils.web

"""Internet access utilities.

Utilities for retrieving information or files from the internet.

"""
# The docstring above must be the first statement in the file to be picked
# up as the module's __doc__; a __future__ import is still allowed after it.
from __future__ import absolute_import
"""
============================== License ========================================
 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding

 This file is part of The Jazz Parser.

 The Jazz Parser is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 The Jazz Parser is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>.

============================ End license ======================================

"""
__author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"

# Names of the MIDI sources that find_midi_files() accepts in its
# ``sources`` argument.
SOURCES = ['jazzpage','vanbasco','intersearch','melodycatcher']

def find_midi_files(name, sources=None, verbose_out=None):
    """
    Given the name of a piece of music, tries to retrieve MIDI files
    for a piece of that name.

    Each requested source is queried in turn through its own search
    function, the results are pooled and files whose data duplicates an
    earlier result are dropped.

    C{name} may be a unicode string.

    @type sources: list of strings
    @param sources: list of source names to get files from.
        Possibilities are in L{SOURCES}. If None or empty, uses all
        sources.
    @type verbose_out: writable file-like object
    @param verbose_out: stream to send verbose output to (default None -
        no verbose output)
    @raise ValueError: if C{sources} contains a name that is not in
        L{SOURCES}
    @return: list of (midi-file-data,song-name) pairs, as produced by the
        individual source functions, with duplicate data removed.

    """
    if sources is not None:
        for source in sources:
            if source not in SOURCES:
                # Both placeholders need a value: the offending name and
                # the list of valid names
                raise ValueError("invalid source name '%s'. Possibilities "
                                 "are: %s" % (source, ", ".join(SOURCES)))
    if not sources:
        # Default to using all sources
        sources = SOURCES

    # The search functions, in the order in which they are queried
    fetchers = [
        ('vanbasco', van_basco_midi_files),
        ('jazzpage', the_jazz_page_midi_files),
        ('intersearch', intersearch_midi_files),
        ('melodycatcher', melody_catcher_midi_files),
    ]

    # Fetch files from each source in turn
    files = []
    for source_name, fetch in fetchers:
        if source_name in sources:
            files.extend(fetch(name, verbose_out=verbose_out))
    # Get rid of any duplicates of the same data
    return remove_duplicate_files(files, key=lambda d: d[0])

76

def remove_duplicate_files(files, key=lambda x:x):
    """
    Drops repeated file data from a list, keeping only the first
    occurrence of each distinct value and preserving the original order.

    The items need not be the data itself: C{key} is applied to every
    item and the results are what get compared for equality.

    The number of items dropped is reported on standard output.

    @return: a new list containing the items that were kept

    """
    all_keys = [key(entry) for entry in files]
    kept = []
    dropped = 0
    for position, entry in enumerate(files):
        # An item is a duplicate if an equal key occurs anywhere before it
        if all_keys[position] in all_keys[:position]:
            dropped += 1
        else:
            kept.append(entry)
    print("Removing %d duplicates" % dropped)
    return kept

99

def the_jazz_page_midi_files(name, refresh_cache=False, verbose_out=None):
    """
    The Jazz Page has quite a few midi files on a single page.
    There's only one of most songs and it's not exactly a huge database,
    but they claim to be high quality.

    By default, the list of midi files and song names will be cached
    the first time it's fetched. Subsequently, songs will just be
    looked up in the local cache file. Use C{refresh_cache=True} to
    force the list to be re-read from the site.

    To refresh the cache without doing a search, see
    L{refresh_the_jazz_page_cache}.

    A file that cannot be downloaded is reported on C{verbose_out} and
    skipped, so that one dead link does not abort the whole search.

    @type refresh_cache: bool
    @param refresh_cache: rebuild the local cache before searching
    @type verbose_out: writable file-like object
    @param verbose_out: stream to send verbose output to (default None -
        no verbose output)
    @return: list of (midi-file-data,song-name) pairs, one for each
        downloaded file that C{read_midifile} could parse. Unlikely to
        be more than one.

    """
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvReader
    from urllib2 import urlopen, HTTPError, URLError
    from midi import read_midifile, MidiReadError
    import os, difflib
    from cStringIO import StringIO

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< The Jazz Page midi search >>>>>>>>>>>>>>")

    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")
    # Recreate the cache if it doesn't exist or it's being forced
    if refresh_cache or not os.path.exists(cache_filename):
        refresh_the_jazz_page_cache()

    # Index the cached links by song name (allowing for multiple songs
    # with the same name)
    song_names = {}
    with open(cache_filename, 'r') as cache_file:
        for link, song_name in UnicodeCsvReader(cache_file):
            song_names.setdefault(song_name, []).append(link)

    # Fetch each file for songs whose names are close to the search term
    _verbose("Searching for song name: %s" % name)
    matches = difflib.get_close_matches(name.lower(), song_names.keys(), cutoff=0.8)
    if len(matches) == 0:
        _verbose("No matches found")

    files = []
    for match in matches:
        for link in song_names[match]:
            _verbose("Fetching midi file %s" % link)
            try:
                response = urlopen(link)
                try:
                    file_data = response.read()
                finally:
                    response.close()
            except (HTTPError, URLError) as err:
                # Dead or unreachable link: report it and try the next
                _verbose(" %s" % err)
                continue
            files.append((file_data, link, match))

    # Check that each one is actually a MIDI file (these should be good
    # quality in general)
    ok_files = []
    for data, link, match in files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            # Skip this file
            _verbose("Invalid midi file: %s (%s)" % (link, err))
        else:
            ok_files.append((data, match))
    return ok_files

def refresh_the_jazz_page_cache():
    """
    Reads entries from The Jazz Page into a local cache for searching.

    The index page is downloaded and parsed completely before the cache
    file is touched, so a failed download or parse leaves any existing
    cache in place instead of replacing it with an empty file.

    Each cache row is a (link, lower-cased song name) pair.

    @see: L{the_jazz_page_midi_files}

    """
    import os
    from urllib2 import urlopen
    from BeautifulSoup import BeautifulSoup
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvWriter

    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")
    domain = "http://www.thejazzpage.de"
    index_url = "%s/midiinfo.html" % domain

    # Read in the index page to get the list of entries from
    response = urlopen(index_url)
    try:
        soup = BeautifulSoup(response.read())
    finally:
        response.close()

    # After the first table, each one is a letter, apart from the last one
    tables = soup.findAll("table")[1:-1]
    entries = []
    # Number of following rows still covered by a first-column cell that
    # spans several rows (those rows have no first cell of their own)
    rowspan = 0
    for table in tables:
        for row in table.findAll("tr"):
            cells = list(row)
            if rowspan == 0:
                if cells[0].has_key("rowspan"):
                    rowspan = int(cells[0]["rowspan"]) - 1
                # This is a row with a first column in it
                # Ignore the first column - we only want the 2nd
                middle_cell = cells[1]
            else:
                middle_cell = cells[0]
                rowspan -= 1
            # Get the file and song name from the middle cell
            if middle_cell.a is not None:
                link = "%s/%s" % (domain, middle_cell.a["href"])
                name = middle_cell.a.text.replace("\n", "").lower()
                entries.append([link, name])

    # Only now replace the cache: opening for writing truncates any
    # previous file, so no separate removal is needed
    cache_file = open(cache_filename, 'w')
    try:
        writer = UnicodeCsvWriter(cache_file)
        for entry in entries:
            writer.writerow(entry)
    finally:
        cache_file.close()

228 229 230

def van_basco_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on vanBasco's midi search and tries to parse the
    results to pull the file urls out. Most of these will fail, either
    because the midi file isn't accessible at the obvious place, or
    because the link's out of date (many are).

    Beware that this can take a long time, since it has to look up many
    servers and most of them are slow.

    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-file-data,song-name) pairs, one for each
        fetched file that C{read_midifile} could parse. The query asks
        for 50 results per page, but probably far fewer are returned.

    """
    from urllib2 import urlopen
    from xml.sax.saxutils import unescape
    from urllib import quote
    from BeautifulSoup import BeautifulSoup, NavigableString
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< vanBasco midi search >>>>>>>>>>>>>>")

    # Construct the url for the search query
    # NOTE(review): quote() percent-encodes the "+" separators as %2B;
    # presumably the site copes with that - confirm before changing it
    query = u"+".join(name.split())
    domain = "http://www.vanbasco.com"
    url = "%s/search.html?resultsperpage=50&q=%s" % (domain, quote(query.encode('utf-8')))
    _verbose("Querying %s" % url)

    def _links_from_page(page_url, mirror_page=False):
        """
        This parses a page of vanBasco results and pulls out possible
        midi file links from it, then tries to fetch each file.

        It's defined as a function so that it can be called recursively
        if the midi file isn't found, but a page of mirrors is offered
        by vanBasco. The mirrors page is almost the same as the results
        page.

        @return: list of (file-data, filename, song-name) triples

        """
        # Read the page into a beautiful soup to parse it
        soup = BeautifulSoup(urlopen(page_url).read())
        # Pull out the central part of the page
        central = soup.table.tr.findAll("td", recursive=False)[1]

        # The mirrors pages are structured very slightly differently
        if mirror_page:
            central = central.p.contents
        else:
            central = central.contents

        # Pick out the lines we want from this
        midi_files = []
        current_link = None
        current_name = None
        current_filename = None
        current_mirrors = None

        for tag in central:
            if isinstance(tag, NavigableString):
                # Just a raw string: see if it's a filename
                if current_link is not None:
                    # unescape() only decodes &amp;, &lt; and &gt;, so
                    # non-breaking-space entities are replaced by hand
                    line = unescape(unicode(tag)).lower().replace("&nbsp;", " ")
                    # Whitespace-only strings have no first word at all
                    words = line.split()
                    if words and words[0].endswith((".mid", ".midi")):
                        # Found one: use this with the link we have already
                        current_filename = words[0]
            elif tag.name == "a":
                if not tag.has_key("href"):
                    # A named anchor, not a link: nothing to follow
                    continue
                link = tag['href']
                if link.startswith("http://"):
                    # External link: treat as a new result
                    if current_filename is not None:
                        # First store the previous result
                        midi_files.append((current_link,
                                           current_filename,
                                           current_name,
                                           current_mirrors))
                    current_link = unicode(link)
                    current_name = unicode(tag.contents[0])
                    current_filename = None
                    current_mirrors = None
                elif link.startswith("/search.html"):
                    # This is a link to a mirrors page: keep it in case
                    # we can't get the main file
                    current_mirrors = u"%s%s" % (domain, link)
        # Add the final link
        if current_link is not None and current_filename is not None:
            midi_files.append((current_link,
                               current_filename,
                               current_name,
                               current_mirrors))

        # Try to get a MIDI file from each of these links
        files = []
        for link, filename, song_name, mirror_link in midi_files:
            # Many of these will fail, since most of the hosting sites
            # have lost their files. We should still get some.
            _verbose("Trying to get file %s from %s" % (filename.encode('ascii','replace'), link.encode('ascii','replace')))
            try:
                got_file = get_linked_file(link, filename)
            except Exception as err:
                _verbose(" %s" % err)
                # File wasn't found, couldn't be read or timed out
                if not mirror_page and mirror_link is not None:
                    # Try going to the mirrors page to see if we can
                    # fetch any mirrors of this midi file
                    _verbose("Recursively fetching mirror files from %s" % mirror_link.encode('ascii','replace'))
                    mirror_files = _links_from_page(mirror_link, mirror_page=True)
                    if len(mirror_files) > 0:
                        # Found this file at at least one mirror
                        # Just use one copy of it
                        files.append(mirror_files[0])
                    _verbose("Returning to top level. Found %d mirror files" % len(mirror_files))
                # If no mirrors are available, we just accept that we
                # can't get the file and hope it wasn't any good anyway
            else:
                _verbose(" Success")
                files.append((got_file, filename, song_name))
        return files

    midi_files = _links_from_page(url)

    # Check that each one is actually a MIDI file - some may be adverts
    ok_files = []
    for data, filename, song_name in midi_files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            # Skip this file
            _verbose("Invalid midi file: %s (%s)" % (filename.encode('ascii','replace'), err))
        else:
            ok_files.append((data, song_name))
    return ok_files

def intersearch_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on Intersearch midi search and tries to parse the
    results to pull the file urls out.

    This is very similar to the vanBasco search, since the results
    are in roughly the same format. These pages are slightly cleaner.

    Paragraphs that contain no usable link are skipped.

    @see: U{http://www.inter-search.co.uk}

    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-file-data,song-name) pairs, one for each
        fetched file that C{read_midifile} could parse. The query asks
        for 50 results, but probably far fewer are returned.

    """
    from urllib2 import urlopen
    from urllib import quote
    from BeautifulSoup import BeautifulSoup, NavigableString
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< Intersearch midi search >>>>>>>>>>>>>>")

    # Construct the url for the search query
    query = u"+".join(name.split())
    domain = "http://www.inter-search.co.uk"
    url = "%s/midi/search.pl?t=%s&m=1&x=50" % (domain, quote(query.encode('utf-8')))
    _verbose("Querying %s" % url)

    # Read the page into a beautiful soup to parse it
    soup = BeautifulSoup(urlopen(url).read())
    # Get all the paragraphs from the middle of the page
    pars = soup.findAll("p")

    # Ignore pars with certain CSS classes
    ignore_classes = ["b1", "b1t"]

    files = []
    # Pick out the lines we want from this
    for paragraph in pars:
        if paragraph.has_key("class") and paragraph["class"] in ignore_classes:
            continue
        # The link to the host page should be the only link in this par
        anchors = paragraph.findAll("a")
        if not anchors or not anchors[0].has_key("href"):
            # Not a result paragraph: there is no host page to visit
            continue
        link = anchors[0]
        # Take the link out so that only the remaining text is scanned
        link.extract()
        page_url = link["href"]
        song_name = link.text
        # Process the remaining tags one by one to find the filename
        filename = None
        for tag in paragraph:
            if isinstance(tag, NavigableString):
                # The first one is the filename, the rest is useless
                filename = unicode(tag)
                break
        if filename is None:
            # No filename found: can't do anything with this
            continue
        # Now try fetching the file
        _verbose("Trying to get file %s from %s" % (filename.encode('ascii','replace'), page_url.encode('ascii','replace')))
        try:
            got_file = get_linked_file(page_url, filename)
        except Exception as err:
            _verbose(" %s" % err)
        else:
            _verbose(" Success")
            files.append((got_file, filename, song_name))

    # Check that each one is actually a MIDI file - some may be adverts
    ok_files = []
    for data, filename, song_name in files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            # Skip this file
            _verbose("Invalid midi file: %s (%s)" % (filename.encode('ascii','replace'), err))
        else:
            ok_files.append((data, song_name))
    return ok_files

def melody_catcher_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on Melody Catcher midi search and tries to parse
    the results to pull the file urls out.
    Melody Catcher is meant primarily as a query-by-humming search, but
    also offers a title search, which we use.

    Result rows that do not have the expected cells or link are
    skipped, and a file that fails while downloading is reported on
    C{verbose_out} and skipped.

    @see: U{http://www.melodycatcher.com}

    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-file-data,song-name) pairs, one for each
        fetched file that C{read_midifile} could parse.

    """
    from urllib2 import urlopen, Request
    from urllib import urlencode
    from BeautifulSoup import BeautifulSoup
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO
    from jazzparser.utils.strings import strip_accents

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< Melody Catcher midi search >>>>>>>>>>>>>>")

    # Remove accents from characters
    search_name = strip_accents(name)
    # Construct the url for the search query
    domain = "http://www.melodycatcher.com"
    url = "%s/search.php" % domain
    # Form uses POST to search
    post_data = {
        'ts' : search_name, # Search query
        'send' : "Submit",
    }
    data = urlencode(post_data)
    request = Request(url, data)
    _verbose("Querying %s (POST: %s)" % (url,data))

    # Read the page into a beautiful soup to parse it
    soup = BeautifulSoup(urlopen(request).read())
    # Simplest way to detect no results: this string somewhere on the page
    if "No results found for your search" in str(soup):
        _verbose("No results")
        return []
    # First table is the whole page, next two are menus, third
    # is search form, fourth contains the results
    tables = soup.findAll("table")
    if len(tables) < 5:
        # The page isn't laid out as expected: nothing we can parse
        _verbose("No results")
        return []
    table = tables[4]
    # Each row is a result
    files = []
    for row in table.findAll("tr"):
        # We only want the second column
        cells = row.findAll("td")
        if len(cells) < 2:
            continue
        # The first link is to the midi file
        anchors = cells[1].findAll("a")
        if not anchors or not anchors[0].has_key("href"):
            continue
        link = anchors[0]
        file_url = link["href"]
        # Get rid of the file extension from the name (which is really
        # a filename, but not a nice one)
        song_name = link.text.encode('ascii','ignore').replace(".midi","").replace(".mid","")
        filename = file_url.rpartition("/")[2]
        # Now try fetching the file itself
        _verbose("Trying to get %s" % file_url)
        try:
            got_file = urlopen(file_url, timeout=3)
            try:
                # Reading can fail or time out too, so it is done inside
                # the try and the connection is always closed
                file_data = got_file.read()
            finally:
                got_file.close()
        except Exception as err:
            _verbose(" %s" % err)
        else:
            _verbose(" Success")
            files.append((file_data, filename, song_name))

    # Check that each one is actually a MIDI file - some may be adverts
    ok_files = []
    for file_data, filename, song_name in files:
        try:
            read_midifile(StringIO(file_data))
        except MidiReadError as err:
            # Skip this file
            _verbose("Invalid midi file: %s (%s)" % (filename, err))
        else:
            ok_files.append((file_data, song_name))
    return ok_files

def get_linked_file(page_url, filename, timeout=3):
    """
    Often the links on search results pages for MIDI files will be to
    a page that references the file, rather than to the file itself.
    Given the url to this page and the filename (also supplied in the
    search results), this function returns the file itself, if it
    can find it on the page.

    This may raise an exception in the process of fetching the file.
    Nothing raised while fetching the referring page itself is caught.

    If a link to the file can't be found on the page, we next try
    getting the file from the same directory as the page. If neither
    of these works, the last error encountered while trying to get a
    file is raised (although errors reading files linked to on the page
    will be raised in preference to those from the last-ditch directory
    attempt).

    @type page_url: string
    @param page_url: url of the page that refers to the file
    @type filename: string
    @param filename: name of the file; a link is followed if its
        resolved target ends with this
    @type timeout: int
    @param timeout: a timeout used on every url access (default 3 secs)
    @return: the contents of the file, as read from the url

    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urlparse import urljoin

    def _read_url(url):
        # Fetch a url's contents, always closing the connection
        response = urlopen(url, timeout=timeout)
        try:
            return response.read()
        finally:
            response.close()

    # Fetch the referring page and parse it
    soup = BeautifulSoup(_read_url(page_url))
    last_error = None
    # Look for possible links to this file
    for link in soup.findAll("a"):
        if not link.has_key("href"):
            continue
        target = urljoin(page_url, link["href"])
        if target.endswith(filename):
            # Looks like this is a link to the right file
            try:
                return _read_url(target)
            except Exception as err:
                # Don't raise this unless this is the last link available
                last_error = err
    # No more links left to try
    # Last ditch attempt: try the directory containing the page
    try:
        return _read_url(urljoin(page_url, filename))
    except Exception as err:
        # Errors from links on the page take precedence over this one
        if last_error is None:
            last_error = err
    # Nothing could be fetched: raise the error we decided to report
    raise last_error

633 634

def get_vanilla_book():
    """
    Downloads the whole of the Vanilla Book:
    U{http://www.ralphpatt.com/Song.html}.

    Every song page linked from the index is fetched and its chord
    chart is split into bars, with repeated sections expanded. This is
    unfinished: the parsed bars are not stored or returned, and the
    text above each bar (alternative-ending markers and the like) is
    only printed. Songs with no key line or with an unmatched repeat
    are reported and skipped.

    @return: None

    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    import re
    from jazzparser.utils.base import group_pairs

    #~ raise NotImplementedError, "not finished writing this"

    INDEX_PAGE = 'http://www.ralphpatt.com/Song.html'
    SONG_BASE = 'http://www.ralphpatt.com/'
    # The overbar alternative ending marker
    alt_end_re = re.compile(r'(\d+).(_+)')

    # Fetch the referring page and parse it
    soup = BeautifulSoup(urlopen(INDEX_PAGE).read())
    # Pull out all the links
    links = soup.findAll("a")
    # Get just the links to songs: all in VB/
    song_links = [l['href'] for l in links if l.has_key("href") and \
                                    l['href'].startswith("VB/")]

    for song_link in song_links:
        url = "%s%s" % (SONG_BASE, song_link)
        song_soup = BeautifulSoup(urlopen(url).read())
        # The song's name is in the title tag
        song_name = song_soup.title.string.strip()
        print(song_name)
        # The chords are in a pre tag
        chord_text = ''.join(song_soup.body.pre.findAll(text=True))
        # Remove the key line
        lines = chord_text.split("\n")
        start_line = 0
        for i,line in enumerate(lines):
            if line.lower().startswith("key"):
                # Found the key line: ignore everything up to here
                start_line = i+1
                break
        else:
            # No key line found!
            print("No key line for %s" % song_name)
            continue
        lines = lines[start_line:]

        # Find the chord lines: they start with | or [
        # Each is paired with the line above it (the "overline")
        song_lines = []
        for i,line in enumerate(lines):
            if line.startswith("[") or line.startswith("|"):
                # A chord line at the very top has nothing above it:
                # lines[-1] would wrongly pick up the last line
                overline = lines[i-1] if i > 0 else ""
                song_lines.append((overline, line))

        try:
            bars = []
            bar_ranges = []
            open_repeats = []
            for overline,line in song_lines:
                barlines = list(re.finditer(r"(\|\|)|(\|)|(\[:)|(:\])|(\[)", line))
                barline_ptns = []
                for i,(start_match,end_match) in enumerate(group_pairs(barlines)):
                    # If the bar has zero length, it's just two barlines
                    #  next to each other: ignore
                    if start_match.end() == end_match.start():
                        continue
                    barline_ptns.append(start_match.start())
                    # Get the upper and lower parts of this bar
                    # The overbar text is taken from two columns before
                    # the barline. Clamp at 0 so that a barline in the
                    # first two columns doesn't give a negative index,
                    # which would slice from the end of the line.
                    over_start = max(start_match.start() - 2, 0)
                    if i == len(barlines) - 2:
                        # If this is the last bar on the line, go to the end
                        overbar = overline[over_start:]
                    else:
                        overbar = overline[over_start:end_match.start()]
                    overbar_cnt = overbar.strip()
                    if len(overbar_cnt) < 2:
                        overbar_cnt = ""
                    bar = line[start_match.end():end_match.start()]

                    # We might lose some timing information at this point,
                    #  but it's not really worth trying to get
                    chords = [str(c) for c in bar.split() if c != "/"]
                    bars.append(chords)

                    # Check the starting barline for a repeat
                    barline = line[start_match.start():start_match.end()]
                    end_barline = line[end_match.start():end_match.end()]
                    # If we're starting a repeat, note that it starts here
                    if barline == "[:":
                        open_repeats.append(len(bars)-1)
                    # If we're ending a repeat, copy in the repeated bars
                    if end_barline == ":]":
                        if len(open_repeats) == 0:
                            print("Unmatched open repeat in %s" % song_name)
                            raise ChordSequenceParseError
                        repeat_start = open_repeats.pop()
                        bars.extend(bars[repeat_start:])

                    # Strip the leading underscores of the overbar
                    if overbar_cnt.startswith("__"):
                        overbar_cnt = overbar_cnt[2:].lstrip()
                    elif overbar_cnt.startswith("_"):
                        overbar_cnt = overbar_cnt[1:].lstrip()
                    if len(overbar_cnt):
                        alt_end = alt_end_re.match(overbar_cnt)
                        if alt_end:
                            print("alt end %s" % alt_end.groups()[0])
                        else:
                            print(overbar_cnt)
                    ## TODO: deal with alternative endings (in the overbar)

        except ChordSequenceParseError:
            # This song couldn't be parsed: move on to the next one
            continue
748

def get_irealb(url, skip=0):
    """
    Get the iReal b format chord sequences from a webpage. The sequences are
    encoded in a URL (!) which is huuuuge. The argument URL is that of the
    page containing the URL with the sequences in. If C{skip} is given,
    C{skip} irealb:// urls are skipped (in case you don't want to download
    the first on the page).

    This is unfinished: the chord data is not decoded, only split up
    and printed, and nothing is returned.

    @type url: string
    @param url: address of the page that contains the irealb:// links
    @type skip: int
    @param skip: number of irealb:// links on the page to pass over
    @raise ValueError: if the page has no more than C{skip} irealb links
    @return: None

    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urllib import unquote

    # Fetch the page and parse it
    soup = BeautifulSoup(urlopen(url).read())
    # Pull out all the links
    links = soup.findAll("a")
    # Get just the links that are sequence databases
    irealb_links = [l['href'] for l in links if l.has_key("href") and \
                                    l['href'].startswith("irealb://")]
    if skip >= len(irealb_links):
        raise ValueError("there are only %d irealb links on the page" % \
            len(irealb_links))

    link = irealb_links[skip]
    text = unquote(link)

    for sequence in text.split("==0=0==="):
        data = sequence.split("=")
        # The last one is just the name of the corpus. Anything else
        # without all seven fields can't be read either: skip it rather
        # than fail on the missing chord field.
        if len(data) < 7:
            continue
        # Split up the meta data:
        #  0 is the name, 1 the composer, 3 the style, 4 the key and
        #  6 the chord data
        #  2 doesn't seem to be used (lyricist?)
        #  5 is never used
        name = data[0]
        chord_data = data[6]
        print("=== %s ===" % name)
        # Decode the chord sequences
        for bar in chord_data.split("|"):
            print("BAR")
            for chord in bar.split(","):
                ### TODO: can't work out how to decode this
                print(chord)

796

class ChordSequenceParseError(Exception):
    """
    Raised when a downloaded chord sequence can't be parsed, e.g. by
    L{get_vanilla_book} when a repeat is closed that was never opened.

    """

799

Source Code for Module jazzparser.utils.web