""" ocrscrape.py --- Download all non-album OCRs from ocremix.org Copyright (C) 2011 Brian S. Stephan This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Example usage: % python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs """ import argparse import BeautifulSoup import hashlib import os import random import re import sys import threading import time import urllib import urllib2 class OCRListScraper: """ Pull down the URLs for all non-album OCRemixes. This doesn't actually download them, it just provides a list of URLs to go to and scrape. """ def __init__(self): """Set up some constants and variables.""" self.BASE_DOMAIN = 'http://ocremix.org' self.REMIX_LIST = '/remixes/?%s' self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)' self.ocrs = [] def build_ocr_list(self, offset=0): """Populate the list of OCRs to download.""" if offset == 0: sys.stdout.write('building list of OCRs, please wait...') sys.stdout.flush() try: params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'}) ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params) except urllib2.URLError as e: print('ERROR: couldn\'t get complete list of OCRs, reason: ' + e.reason) return count = 0 for line in ocr_index: match = re.search(self.REMIX_ENTRY_REGEX, line) if match and match.group(0): self.ocrs.append(self.BASE_DOMAIN + match.group(1)) count = count + 1 # recurse if count > 0: self.build_ocr_list(offset+count) if offset == 0: # print success at the end len_all = len(self.ocrs) ocrs_set = set(self.ocrs) self.ocrs = list(ocrs_set) print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs))) class OCRScraperWorker(threading.Thread): """ Visit OCR URLs, analyze the links, check the local filesystem, and optionally download the MP3. Will check/download all given URLs when the thread is started. 
""" class OCR(): """Track the details of an OCR to pull down.""" pass def __init__(self, thread_id, ocrs, directory): """Configure the worker thread's list of OCR URLs and target directory.""" self.thread_id = thread_id self.ocrs = ocrs self.directory = directory threading.Thread.__init__(self) def run(self): """Given a list of OCRs, identify their data and potentially fetch them.""" for ocr_str in self.ocrs: ocr = self._identify_ocr_data_from_url(ocr_str) # check if the file exists already do_download = False filename = os.path.abspath(os.path.abspath(self.directory) + os.sep + ocr.filename) if os.path.isfile(filename): if self._file_checksum_matches(filename, ocr): os.utime(filename, None) sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n' .format(self.thread_id, ocr.title.encode('utf-8'))) else: sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n' .format(self.thread_id, ocr.title.encode('utf-8'))) do_download = True else: # download sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8'))) do_download = True if do_download: urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename) if not self._file_checksum_matches(filename, ocr): sys.stdout.write( '[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n' .format(self.thread_id, ocr.title.encode('utf-8'))) def _identify_ocr_data_from_url(self, url): """Given the input URL, turn the relevant data into an OCR object for later use. """ ocr = self.OCR() ocr.num = '0' match = re.search('/(OCR([0-9]+))/$', url) if not match: print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly') else: ocr.num = match.group(1) try: ocr_data = urllib2.urlopen(url) except urllib2.URLError as e: print('ERROR: couldn\'t get data for ' + url + ', reason: ' + e.reason) return soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data)) title_tag = soup.find('title').contents[0] match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag) if not match: print('ERROR: couldn\'t find the OCR name from the title') return ocr.title = match.group(1).replace('<','<').replace('>','>').replace('&','&') # this is about to get gross... 

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object
        for later use.
        """
        ocr = self.OCR()

        # pull the OCR number out of the URL
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')
        else:
            ocr.num = match.group(1)

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(e.reason))
            return None

        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))

        # the remix title lives in the page's <title> tag
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross...
        download = soup.find(id='panel-download')

        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]

        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))

        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return if the MD5 checksum on a file matches that in the OCR listing."""
        if os.path.isfile(filename):
            # hash the file in 8 KiB chunks so large MP3s aren't read
            # into memory all at once
            md5 = hashlib.md5()
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), ''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum

        return False


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()
    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads, splitting the OCR list into
    # interleaved fourths so each thread gets roughly a quarter of the work
    thread1 = OCRScraperWorker('T1', ocrs.ocrs[0::4], args.directory)
    thread2 = OCRScraperWorker('T2', ocrs.ocrs[1::4], args.directory)
    thread3 = OCRScraperWorker('T3', ocrs.ocrs[2::4], args.directory)
    thread4 = OCRScraperWorker('T4', ocrs.ocrs[3::4], args.directory)
    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()

# vi:tabstop=4:expandtab:autoindent
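# A quick illustration of the extended-slice split used above (not executed
# here): the four interleaved sublists cover every element exactly once.
#   >>> [list('abcdefg')[i::4] for i in range(4)]
#   [['a', 'e'], ['b', 'f'], ['c', 'g'], ['d']]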