"""
ocrscrape.py --- Download all non-album OCRs from ocremix.org
Copyright (C) 2011 Brian S. Stephan

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Example usage:
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
"""

import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2


class OCRListScraper:

    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them, it just provides a list of URLs to go to
    and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""

        # site root and the query-string template for the paginated remix index
        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        # matches remix page paths such as /remix/OCR01234/
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        self.ocrs = []

    def build_ocr_list(self, offset=0):
        """Populate self.ocrs with the URLs of all remix pages.

        Fetches one page of the remix index at the given offset, collects the
        remix links on it, and recurses with a larger offset until a page
        yields no entries. The outermost call (offset 0) then de-duplicates
        the accumulated list and reports the totals.
        """

        if offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()

        try:
            params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
            ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
        except urllib2.URLError as e:
            # str() because e.reason may be a socket error tuple, not a string
            print('ERROR: couldn\'t get complete list of OCRs, reason: ' + str(e.reason))
            return

        count = 0
        for line in ocr_index:
            match = re.search(self.REMIX_ENTRY_REGEX, line)
            if match and match.group(0):
                self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                count += 1

        # recurse until a page produces no entries
        if count > 0:
            self.build_ocr_list(offset + count)

        if offset == 0:
            # print success at the end (only in the outermost call)
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all - len(self.ocrs)))


class OCRScraperWorker(threading.Thread):

    """
    Visit OCR URLs, analyze the links, check the local filesystem,
    and optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Details of one OCR: num, title, checksum, urls, filename."""
        pass

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""

        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""

        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # scrape failed (network error or unrecognized page layout);
                # skip it rather than crashing the whole worker thread
                continue

            # check if the file exists already and whether its checksum matches
            do_download = False
            filename = os.path.join(os.path.abspath(self.directory), ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    # touch the file so repeated runs show it was re-verified
                    os.utime(filename, None)
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # not on disk at all — download
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                do_download = True

            if do_download:
                # pick a random mirror to spread the load across hosts
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Scrape one remix page into an OCR object for checking/downloading.

        Returns the populated OCR object, or None if the page cannot be
        fetched or does not look like a remix page; callers must handle None.
        """

        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')
        else:
            ocr.num = match.group(1)

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            # str() because e.reason may be a socket error tuple, not a string
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(e.reason))
            return None

        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None

        # unescape the HTML entities that show up in remix titles;
        # '&amp;' must be replaced last so e.g. '&amp;lt;' becomes '&lt;'
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross... walk the fixed nesting of the download
        # panel (NOTE(review): brittle — depends on ocremix.org's exact markup)
        download = soup.find(id='panel-download')

        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]

        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))

        # store the desired local filename (basename of the first mirror URL)
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return True if the file's MD5 checksum matches the OCR listing's.

        Returns False when the file does not exist or the digests differ.
        """

        if os.path.isfile(filename):
            # read and hash in 8 KiB chunks to avoid loading the whole MP3
            md5 = hashlib.md5()
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), ''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()

    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs,
    # splitting the list into fourths via stride slicing
    workers = [OCRScraperWorker('T{0:d}'.format(i + 1), ocrs.ocrs[i::4], args.directory)
               for i in range(4)]
    for worker in workers:
        worker.start()

# vi:tabstop=4:expandtab:autoindent