1 from __future__ import absolute_import
2 """Internet access utilities.
3
4 Utilities for retrieving information or files from the internet.
5
6 """
7 """
8 ============================== License ========================================
9 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
10
11 This file is part of The Jazz Parser.
12
13 The Jazz Parser is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 The Jazz Parser is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
25
26 ============================ End license ======================================
27
28 """
29 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
30
SOURCES = ['jazzpage','vanbasco','intersearch','melodycatcher']

def find_midi_files(name, sources=None, verbose_out=None):
    """
    Given the name of a piece of music, tries to retrieve MIDI files
    for a piece of that name.

    Each requested source is queried in turn (vanBasco, The Jazz Page,
    Intersearch, Melody Catcher) and the results are concatenated. Files
    whose data is identical to that of an earlier result are then dropped
    using L{remove_duplicate_files}, so only the first occurrence of any
    file is kept.

    C{name} may be a unicode string.

    @type name: string
    @param name: name of the piece to search for
    @type sources: list of strings
    @param sources: list of source names to get files from.
        Possibilities are in L{SOURCES}. If None or empty, uses all
        sources.
    @type verbose_out: writable file-like object
    @param verbose_out: stream to send verbose output to (default None -
        no verbose output)
    @return: list of (midi-data,song-name) pairs, as produced by the
        individual source functions.
    @raise ValueError: if any name in C{sources} is not in L{SOURCES}

    """
    if sources is not None:
        for source in sources:
            if source not in SOURCES:
                # Both placeholders must be filled: the offending name and
                # the list of valid names
                raise ValueError("invalid source name '%s'. Possibilities "
                    "are: %s" % (source, ", ".join(SOURCES)))
    if sources is None or len(sources) == 0:
        # Default to using every known source
        sources = SOURCES

    # The order here matters: when two sources return the same file, the
    # copy from the earlier source is the one that survives deduplication
    files = []
    if 'vanbasco' in sources:
        files.extend(van_basco_midi_files(name, verbose_out=verbose_out))
    if 'jazzpage' in sources:
        files.extend(the_jazz_page_midi_files(name, verbose_out=verbose_out))
    if 'intersearch' in sources:
        files.extend(intersearch_midi_files(name, verbose_out=verbose_out))
    if 'melodycatcher' in sources:
        files.extend(melody_catcher_midi_files(name, verbose_out=verbose_out))

    # Compare on the file data only, not on the song name
    return remove_duplicate_files(files, key=lambda d:d[0])
76
def remove_duplicate_files(files, key=lambda x: x):
    """
    Given a list of file data strings, removes any duplicates of
    the same data. Only the first occurrence of the file data will
    be kept and the relative order of the kept items is preserved.

    The list doesn't have to be just of the data, as long as the data
    is returned by the key function applied to each value in the list.

    Prints the number of removed items to standard output.

    @type files: list
    @param files: the items to filter
    @type key: function
    @param key: maps an item of C{files} to the value that is compared
        to detect duplicates
    @return: a new list containing the first item for each distinct key

    """
    # Keys are normally (hashable) file data strings, so membership can be
    # tested against a set in a single pass instead of rescanning the list
    # for every item. Unhashable keys fall back to a linear search.
    seen = set()
    seen_unhashable = []
    unique = []
    for item in files:
        data = key(item)
        try:
            duplicate = data in seen
            if not duplicate:
                seen.add(data)
        except TypeError:
            duplicate = data in seen_unhashable
            if not duplicate:
                seen_unhashable.append(data)
        if not duplicate:
            unique.append(item)
    print("Removing %d duplicates" % (len(files) - len(unique)))
    return unique
99
def the_jazz_page_midi_files(name, refresh_cache=False, verbose_out=None):
    """
    The Jazz Page has quite a few midi files on a single page.
    There's only one of most songs and it's not exactly a huge database,
    but they claim to be high quality.

    By default, the list of midi files and song names will be cached
    the first time it's fetched. Subsequently, songs will just be
    looked up in the local database. Use C{refresh_cache=True} to
    force the list to be re-read from the site.

    To refresh the cache without doing a search, see
    L{refresh_the_jazz_page_cache}.

    Song names in the cache are matched approximately against the
    lower-cased C{name}. A file that cannot be downloaded, or that does
    not parse as MIDI, is reported on C{verbose_out} and skipped.

    @type name: string
    @param name: name of the song to look up
    @type refresh_cache: bool
    @param refresh_cache: rebuild the local cache before searching
    @type verbose_out: writable file-like object
    @param verbose_out: stream to send verbose output to (default None -
        no verbose output)
    @return: list of (midi-data,song-name) pairs, where the song name is
        the matched name from the cache. Unlikely to be more than one.

    """
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvReader
    from urllib2 import urlopen, HTTPError, URLError
    from midi import read_midifile, MidiReadError
    import os, difflib
    from cStringIO import StringIO

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< The Jazz Page midi search >>>>>>>>>>>>>>")

    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")

    # Build the cache if it's missing or a refresh was requested
    if refresh_cache or not os.path.exists(cache_filename):
        refresh_the_jazz_page_cache()

    # Index the cached links by song name: a name may have several links
    song_names = {}
    cache_file = open(cache_filename, 'r')
    try:
        for link,song_name in UnicodeCsvReader(cache_file):
            song_names.setdefault(song_name,[]).append(link)
    finally:
        cache_file.close()

    _verbose("Searching for song name: %s" % name)
    matches = difflib.get_close_matches(name.lower(), song_names.keys(), cutoff=0.8)
    if len(matches) == 0:
        _verbose("No matches found")

    files = []
    for matched_name in matches:
        for link in song_names[matched_name]:
            _verbose("Fetching midi file %s" % link)
            # A dead link shouldn't abort the whole search: skip it, as
            # the other sources do with files they can't fetch
            try:
                response = urlopen(link)
                try:
                    file_data = response.read()
                finally:
                    response.close()
            except (HTTPError, URLError) as err:
                _verbose("Could not fetch %s (%s)" % (link, err))
                continue
            files.append((file_data, link, matched_name))

    # Keep only the downloads that can actually be read as MIDI
    ok_files = []
    for data,link,matched_name in files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            _verbose("Invalid midi file: %s (%s)" % (link, err))
        else:
            ok_files.append((data,matched_name))
    return ok_files
177
def refresh_the_jazz_page_cache():
    """
    Reads entries from The Jazz Page into a local cache for searching.

    The index page is downloaded and parsed completely before the cache
    file is touched, so a failed download or parse leaves any existing
    cache in place instead of replacing it with an empty file.

    Each cache row is a (link, song name) pair; song names are stored
    lower-cased with newlines removed.

    @see: L{the_jazz_page_midi_files}

    """
    import os
    from urllib2 import urlopen
    from BeautifulSoup import BeautifulSoup
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvWriter

    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")
    domain = "http://www.thejazzpage.de"
    index_url = "%s/midiinfo.html" % domain

    # Fetch and parse first: nothing is written until this has succeeded
    soup = BeautifulSoup(urlopen(index_url).read())

    entries = []
    # The first and last tables on the page are skipped
    tables = soup.findAll("table")[1:-1]
    # Number of following rows still covered by a leading cell that
    # spans several rows
    rowspan = 0
    for table in tables:
        for row in table.findAll("tr"):
            cells = list(row)
            if rowspan == 0:
                if cells[0].has_key("rowspan"):
                    rowspan = int(cells[0]["rowspan"]) -1
                # The row has its own leading cell, so the cell holding
                # the link is the second one
                middle_cell = cells[1]
            else:
                # The leading cell is carried over from a previous row,
                # so the link cell comes first
                middle_cell = cells[0]
                rowspan -= 1

            if middle_cell.a is not None:
                link = middle_cell.a["href"]
                link = "%s/%s" % (domain,link)
                name = middle_cell.a.text
                name = name.replace("\n","").lower()
                entries.append([link, name])

    # Opening for writing truncates any old cache, so there's no need to
    # remove the file first
    cache_file = open(cache_filename, 'w')
    try:
        writer = UnicodeCsvWriter(cache_file)
        for entry in entries:
            writer.writerow(entry)
    finally:
        cache_file.close()
228
229
230
def van_basco_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on vanBasco's midi search and tries to parse the
    results to pull the file urls out. Most of these will fail, either
    because the midi file isn't accessible at the obvious place, or
    because the link's out of date (many are).

    Beware that this can take a long time, since it has to look up many
    servers and most of them are slow.

    @type name: string
    @param name: name of the song to search for (may be unicode)
    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-data,song-name) pairs for the downloaded
        files that could be read as MIDI. The search requests up to 50
        results, but far fewer are likely to be returned.

    """
    from urllib2 import urlopen, HTTPError, URLError
    from xml.sax.saxutils import unescape
    from urllib import quote
    from BeautifulSoup import BeautifulSoup, NavigableString
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< vanBasco midi search >>>>>>>>>>>>>>")

    query = u"+".join(name.split())
    domain = "http://www.vanbasco.com"
    url = "%s/search.html?resultsperpage=50&q=%s" % (domain, quote(query.encode('utf-8')))
    _verbose("Querying %s" % url)

    def _links_from_page(page_url, mirror_page=False):
        """
        This parses a page of vanBasco results and pulls out possible
        midi file links from it, then tries to download each one.

        It's defined as a function so that it can be called recursively
        if the midi file isn't found, but a page of mirrors is offered
        by vanBasco. The mirrors page is almost the same as the results
        page.

        @return: list of (data,filename,song-name) tuples for the files
            that could be downloaded

        """
        soup = BeautifulSoup(urlopen(page_url).read())
        # The results are in the second cell of the first table row
        central = soup.table.tr.findAll("td", recursive=False)[1]
        # On a mirrors page the results are wrapped in a paragraph
        if mirror_page:
            central = central.p.contents
        else:
            central = central.contents

        # State for the result entry currently being read
        midi_files = []
        current_link = None
        current_name = None
        current_filename = None
        current_mirrors = None

        for tag in central:
            if isinstance(tag, NavigableString):
                # Text is only of interest once a result link has been seen
                if current_link is None:
                    continue
                # NOTE(review): the replace in the recovered source reads as
                #  space-for-space, i.e. a no-op; presumably it originally
                #  normalized non-breaking spaces, so both the entity and
                #  the character are handled here - confirm
                line = unescape(unicode(tag)).lower()
                line = line.replace(u"&nbsp;", u" ").replace(u"\xa0", u" ")
                words = line.split()
                if len(words) == 0:
                    # Whitespace-only text between tags: nothing to read
                    continue
                first_word = words[0]
                if first_word.endswith(".mid") or first_word.endswith(".midi"):
                    # This text gives the filename for the current link
                    current_filename = first_word
            elif tag.name == "a":
                link = tag['href']
                if link.startswith("http://"):
                    # A new result starts here: store the previous one if
                    #  a filename was found for it
                    if current_filename is not None:
                        midi_files.append((current_link,
                                           current_filename,
                                           current_name,
                                           current_mirrors))
                    current_link = unicode(link)
                    current_name = unicode(tag.contents[0])
                    current_filename = None
                    current_mirrors = None
                elif link.startswith("/search.html"):
                    # A link back into the search: the page of mirrors for
                    #  the current result
                    current_mirrors = u"%s%s" % (domain, link)

        # Don't forget the last result on the page
        if current_link is not None and current_filename is not None:
            midi_files.append((current_link,
                               current_filename,
                               current_name,
                               current_mirrors))

        files = []
        for link,filename,song_name,mirror_link in midi_files:
            _verbose("Trying to get file %s from %s" % (filename.encode('ascii','replace'), link.encode('ascii','replace')))
            try:
                got_file = get_linked_file(link, filename)
            except Exception as err:
                _verbose(" %s" % err)
                # Only follow mirrors from the top-level results page
                if not mirror_page and mirror_link is not None:
                    _verbose("Recursively fetching mirror files from %s" % mirror_link.encode('ascii','replace'))
                    try:
                        mirror_files = _links_from_page(mirror_link, mirror_page=True)
                    except Exception as mirror_err:
                        # A broken mirrors page shouldn't abort the search
                        _verbose(" %s" % mirror_err)
                        mirror_files = []
                    if len(mirror_files) > 0:
                        # Only the first mirror that could be fetched is used
                        files.append(mirror_files[0])
                    _verbose("Returning to top level. Found %d mirror files" % len(mirror_files))
            else:
                _verbose(" Success")
                files.append((got_file,filename,song_name))
        return files

    midi_files = _links_from_page(url)

    # Keep only the downloads that can actually be read as MIDI
    ok_files = []
    for data,filename,song_name in midi_files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            _verbose("Invalid midi file: %s (%s)" % (filename.encode('ascii','replace'), err))
        else:
            ok_files.append((data,song_name))
    return ok_files
379
380
381
def intersearch_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on Intersearch midi search and tries to parse the
    results to pull the file urls out.

    This is very similar to the vanBasco search, since the results
    are in roughly the same format. These pages are slightly cleaner.

    @see: U{http://www.inter-search.co.uk}

    @type name: string
    @param name: name of the song to search for (may be unicode)
    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-data,song-name) pairs for the downloaded
        files that could be read as MIDI. The search requests up to 50
        results, but far fewer are likely to be returned.

    """
    from urllib2 import urlopen, HTTPError, URLError
    from xml.sax.saxutils import unescape
    from urllib import quote
    from BeautifulSoup import BeautifulSoup, NavigableString
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO
    import re

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< Intersearch midi search >>>>>>>>>>>>>>")

    query = u"+".join(name.split())
    domain = "http://www.inter-search.co.uk"
    url = "%s/midi/search.pl?t=%s&m=1&x=50" % (domain, quote(query.encode('utf-8')))
    _verbose("Querying %s" % url)

    soup = BeautifulSoup(urlopen(url).read())
    # Each result is read from a paragraph of the page
    pars = soup.findAll("p")
    # Paragraphs with these classes are not treated as results
    ignore_classes = ["b1", "b1t"]

    files = []
    for paragraph in pars:
        if paragraph.has_key("class") and paragraph["class"] in ignore_classes:
            continue
        # A paragraph without a usable link can't be a result: skip it
        #  rather than failing the whole search
        anchors = paragraph.findAll("a")
        if len(anchors) == 0 or not anchors[0].has_key("href"):
            continue
        link = anchors[0]
        # Take the link out of the paragraph so that it isn't considered
        #  when looking for the filename text below
        link.extract()
        file_url = link["href"]
        song_name = link.text

        # The first remaining piece of plain text is used as the filename
        filename = None
        for tag in paragraph:
            if isinstance(tag, NavigableString):
                filename = unicode(tag)
                break
        if filename is None:
            # No filename found for this result
            continue

        _verbose("Trying to get file %s from %s" % (filename.encode('ascii','replace'), file_url.encode('ascii','replace')))
        try:
            got_file = get_linked_file(file_url, filename)
        except Exception as err:
            _verbose(" %s" % err)
        else:
            _verbose(" Success")
            files.append((got_file,filename,song_name))

    # Keep only the downloads that can actually be read as MIDI
    ok_files = []
    for data,filename,song_name in files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            _verbose("Invalid midi file: %s (%s)" % (filename.encode('ascii','replace'), err))
        else:
            ok_files.append((data,song_name))
    return ok_files
474
475
def melody_catcher_midi_files(name, verbose_out=None):
    """
    One method of getting midi files used by L{find_midi_files}.

    Looks up the name on Melody Catcher midi search and tries to parse
    the results to pull the file urls out.
    Melody Catcher is meant primarily as a query-by-humming search, but
    also offers a title search, which we use.

    @see: U{http://www.melodycatcher.com}

    @type name: string
    @param name: name of the song to search for; accents are stripped
        before it is sent to the search
    @type verbose_out: file-like object
    @param verbose_out: stream to print verbose output to. This is a
        good way of knowing whether anything's happening, since this
        process can be slow.
    @return: list of (midi-data,song-name) pairs for the downloaded
        files that could be read as MIDI. The song name is the link text
        with any ".mid"/".midi" removed.

    """
    from urllib2 import urlopen, Request
    from urllib import urlencode
    from BeautifulSoup import BeautifulSoup, NavigableString
    from midi import read_midifile, MidiReadError
    from cStringIO import StringIO
    from jazzparser.utils.strings import strip_accents

    def _verbose(message):
        if verbose_out is not None:
            print >>verbose_out, message

    _verbose("<<<<<<<<<< Melody Catcher midi search >>>>>>>>>>>>>>")

    search_name = strip_accents(name)

    domain = "http://www.melodycatcher.com"
    url = "%s/search.php" % domain

    # The title search is submitted as a POST form
    post_data = {
        'ts' : search_name,
        'send' : "Submit",
    }
    encoded_post = urlencode(post_data)
    request = Request(url, encoded_post)
    _verbose("Querying %s (POST: %s)" % (url,encoded_post))

    soup = BeautifulSoup(urlopen(request).read())

    if "No results found for your search" in str(soup):
        _verbose("No results")
        return []

    # The results are read from the fifth table on the page
    tables = soup.findAll("table")
    if len(tables) < 5:
        # The page doesn't have the expected layout
        _verbose("Could not find the results table")
        return []
    table = tables[4]

    files = []
    for row in table.findAll("tr"):
        # The file link is in the second cell of the row. Rows that
        #  don't have one are skipped instead of failing the search
        cells = row.findAll("td")
        if len(cells) < 2:
            continue
        anchors = cells[1].findAll("a")
        if len(anchors) == 0 or not anchors[0].has_key("href"):
            continue
        link = anchors[0]
        file_url = link["href"]

        song_name = link.text.encode('ascii','ignore').replace(".midi","").replace(".mid","")
        filename = file_url.rpartition("/")[2]

        _verbose("Trying to get %s" % file_url)
        try:
            got_file = urlopen(file_url, timeout=3)
            # Reading can fail as well as connecting, so it's done inside
            #  the try, and the connection is always closed
            try:
                file_data = got_file.read()
            finally:
                got_file.close()
        except Exception as err:
            _verbose(" %s" % err)
        else:
            _verbose(" Success")
            files.append((file_data,filename,song_name))

    # Keep only the downloads that can actually be read as MIDI
    ok_files = []
    for data,filename,song_name in files:
        try:
            read_midifile(StringIO(data))
        except MidiReadError as err:
            _verbose("Invalid midi file: %s (%s)" % (filename, err))
        else:
            ok_files.append((data,song_name))
    return ok_files
568
569
def get_linked_file(page_url, filename, timeout=3):
    """
    Search results for MIDI files frequently point at a page that
    refers to the file instead of at the file itself. Given that page's
    url and the filename reported by the search results, fetch and
    return the contents of the file.

    Every link on the page whose resolved target ends with C{filename}
    is tried in turn and the first one that can be read is returned.
    If none of them works, a last attempt is made to read C{filename}
    from the same directory as the page.

    Nothing raised while fetching the page itself is caught here. If
    every attempt to get the file fails, the error from the last link
    tried on the page is raised; the error from the same-directory
    attempt is only raised if no link on the page produced one.

    @type page_url: string
    @param page_url: url of the page that should reference the file
    @type filename: string
    @param filename: name of the file to look for
    @type timeout: int
    @param timeout: a timeout used on every url access (default 3 secs)
    @return: the contents of the file

    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urllib import quote
    from urlparse import urljoin

    def _download(address):
        # Read everything from the address, always closing the connection
        handle = urlopen(address, timeout=timeout)
        try:
            return handle.read()
        finally:
            handle.close()

    page = BeautifulSoup(urlopen(page_url, timeout=timeout).read())
    failure = None

    # First choice: a link on the page that points to the file
    for anchor in page.findAll("a"):
        if not anchor.has_key("href"):
            continue
        candidate = urljoin(page_url, anchor["href"])
        if not candidate.endswith(filename):
            continue
        try:
            return _download(candidate)
        except Exception as problem:
            # Remember what went wrong and move on to the next link
            failure = problem

    # Last resort: look for the file alongside the page
    fallback = urljoin(page_url, filename)
    try:
        return _download(fallback)
    except Exception as problem:
        # An error from a link on the page takes precedence
        if failure is None:
            failure = problem

    raise failure
633
634
636 """
637 Downloads the whole of the Vanilla Book:
638 U{http://www.ralphpatt.com/Song.html}.
639
640 """
641 from BeautifulSoup import BeautifulSoup
642 from urllib2 import urlopen
643 from urllib import quote
644 from urlparse import urljoin
645 import re
646 from jazzparser.utils.base import group_pairs
647
648
649
650 INDEX_PAGE = 'http://www.ralphpatt.com/Song.html'
651 SONG_BASE = 'http://www.ralphpatt.com/'
652
653 alt_end_re = re.compile(r'(\d+).(_+)')
654
655
656 soup = BeautifulSoup(urlopen(INDEX_PAGE).read())
657
658 links = soup.findAll("a")
659
660 song_links = [l['href'] for l in links if l.has_key("href") and \
661 l['href'].startswith("VB/")]
662
663 for song_link in song_links:
664 url = "%s%s" % (SONG_BASE, song_link)
665 song_soup = BeautifulSoup(urlopen(url).read())
666
667 song_name = song_soup.title.string.strip()
668 print song_name
669
670 chord_text = ''.join(song_soup.body.pre.findAll(text=True))
671
672 lines = chord_text.split("\n")
673 start_line = 0
674 for i,line in enumerate(lines):
675 if line.lower().startswith("key"):
676
677 start_line = i+1
678 break
679 else:
680
681 print "No key line for %s" % song_name
682 continue
683 lines = lines[start_line:]
684
685
686 song_lines = []
687 for i,line in enumerate(lines):
688 if line.startswith("[") or line.startswith("|"):
689 song_lines.append((lines[i-1], lines[i]))
690
691 try:
692 bars = []
693 bar_ranges = []
694 open_repeats = []
695 for overline,line in song_lines:
696 barlines = list(re.finditer(r"(\|\|)|(\|)|(\[:)|(:\])|(\[)", line))
697 barline_ptns = []
698 for i,(start_match,end_match) in enumerate(group_pairs(barlines)):
699
700
701 if start_match.end() == end_match.start():
702 continue
703 barline_ptns.append(start_match.start())
704
705 if i == len(barlines) - 2:
706
707 overbar = overline[start_match.start()-2:]
708 else:
709 overbar = overline[start_match.start()-2:end_match.start()]
710 overbar_cnt = overbar.strip()
711 if len(overbar_cnt) < 2:
712 overbar_cnt = ""
713 bar = line[start_match.end():end_match.start()]
714
715
716
717 chords = [str(c) for c in bar.split() if c != "/"]
718 bars.append(chords)
719
720
721 barline = line[start_match.start():start_match.end()]
722 end_barline = line[end_match.start():end_match.end()]
723
724 if barline == "[:":
725 open_repeats.append(len(bars)-1)
726
727 if end_barline == ":]":
728 if len(open_repeats) == 0:
729 print "Unmatched open repeat in %s" % song_name
730 raise ChordSequenceParseError
731 repeat_start = open_repeats.pop()
732 bars.extend(bars[repeat_start:])
733
734 if overbar_cnt.startswith("__"):
735 overbar_cnt = overbar_cnt[2:].lstrip()
736 elif overbar_cnt.startswith("_"):
737 overbar_cnt = overbar_cnt[1:].lstrip()
738 if len(overbar_cnt):
739 alt_end = alt_end_re.match(overbar_cnt)
740 if alt_end:
741 print "alt end", alt_end.groups()[0]
742 else:
743 print overbar_cnt
744
745
746 except ChordSequenceParseError:
747 continue
748
750 """
751 Get the iReal b format chord sequences from a webpage. The sequences are
752 encoded in a URL (!) which is huuuuge. The argument URL is that of the
753 page containing the URL with the sequences in. If C{skip} is given,
754 C{skip} irealb:// urls are skipped (in case you don't want to download
755 the first on the page).
756
757 """
758 from BeautifulSoup import BeautifulSoup
759 from urllib2 import urlopen
760 from urllib import unquote
761
762
763 soup = BeautifulSoup(urlopen(url).read())
764
765 links = soup.findAll("a")
766
767 irealb_links = [l['href'] for l in links if l.has_key("href") and \
768 l['href'].startswith("irealb://")]
769 if skip >= len(irealb_links):
770 raise ValueError, "there are only %d irealb links on the page" % \
771 len(irealb_links)
772
773 link = irealb_links[skip]
774 text = unquote(link)
775
776 for sequence in text.split("==0=0==="):
777 data = sequence.split("=")
778
779 if len(data) < 2:
780 continue
781
782
783
784 name = data[0]
785 composer = data[1]
786 style = data[3]
787 key = data[4]
788 chord_data = data[6]
789 print "=== %s ===" % name
790
791 for bar in chord_data.split("|"):
792 print "BAR"
793 for chord in bar.split(","):
794
795 print chord
796
799