ocrscrape.py - pull down all non-album OCRs on ocremix.org
commit 0559a51b52
@@ -0,0 +1,209 @@
"""
|
||||
ocrscrape.py --- Download all non-album OCRs from ocremix.org
|
||||
Copyright (C) 2011 Brian S. Stephan
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Example usage:
|
||||
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
|
||||
"""

import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2
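
# NOTE: 'BeautifulSoup' here is the BeautifulSoup 3 top-level module and
# urllib/urllib2 are the python2 stdlib HTTP clients, hence the python2
# interpreter in the usage example above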


class OCRListScraper:

    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them; it just provides a list of URLs to
    visit and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""

        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        self.ocrs = []
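
        # A sketch of what REMIX_ENTRY_REGEX matches, assuming the remix index
        # still links entries as '/remix/OCR<number>/' (OCR01234 is made up):
        #   re.search(self.REMIX_ENTRY_REGEX, '<a href="/remix/OCR01234/">')
        #   -> group(1) == '/remix/OCR01234/', group(2) == '01234'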

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download."""

        if offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()
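
        # fetch one page of the full remix index, newest first; 'offset' is the
        # site's pagination parameter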
        try:
            params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
            ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get complete list of OCRs, reason: ' + str(e.reason))
            return

        count = 0
        for line in ocr_index:
            match = re.search(self.REMIX_ENTRY_REGEX, line)
            if match:
                self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                count += 1

        # recurse, page by page, until a page turns up no new entries
        if count > 0:
            self.build_ocr_list(offset + count)

        if offset == 0:
            # dedupe and print success at the end
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all - len(self.ocrs)))
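
# Usage sketch for OCRListScraper on its own (the example entry is made up;
# the real list depends on the live site):
#   scraper = OCRListScraper()
#   scraper.build_ocr_list()
#   scraper.ocrs  # e.g. ['http://ocremix.org/remix/OCR01234/', ...]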


class OCRScraperWorker(threading.Thread):

    """
    Visit OCR URLs, analyze the links, check the local filesystem, and
    optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down."""
        pass
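
    # OCR instances are plain attribute bags; _identify_ocr_data_from_url()
    # fills in num, title, checksum, urls, and filename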

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""

        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""

        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # scraping failed; skip rather than crash on ocr.filename below
                continue

            # check if the file exists already
            do_download = False
            filename = os.path.join(os.path.abspath(self.directory), ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    os.utime(filename, None)
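                    # the \033[9Xm ... \033[0m wrappers below are ANSI color
                    # escapes: green '*' = already current, yellow '!' =
                    # checksum mismatch, red 'x' = not yet downloaded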
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # not downloaded yet
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                do_download = True

            if do_download:
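                # pick one of the listed mirror URLs at random to spread load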
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later checking and downloading."""

        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: couldn\'t recognize the URL, but will try plowing ahead blindly')
        else:
            ocr.num = match.group(1)

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(e.reason))
            return None

        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None

        # undo the HTML entity escaping in the page title
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross...
        download = soup.find(id='panel-download')
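        # NOTE: the positional .contents[n] walks below are tied to the 2011
        # page markup; if the download panel layout changes, they will break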

        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]

        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))

        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return whether the file's MD5 checksum matches the one in the OCR listing."""

        if os.path.isfile(filename):
            # read and check md5sum
            md5 = hashlib.md5()
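            # hash in 8 KiB chunks; the '' sentinel to iter() stops at EOF,
            # which works because python2 file.read() returns a byte string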
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), ''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()

    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs

    # split the list into fourths
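    # the extended slices stripe the list: [0::4] takes items 0, 4, 8, ...,
    # [1::4] takes items 1, 5, 9, ..., so each worker gets roughly a quarter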
    thread1 = OCRScraperWorker('T1', ocrs.ocrs[0::4], args.directory)
    thread2 = OCRScraperWorker('T2', ocrs.ocrs[1::4], args.directory)
    thread3 = OCRScraperWorker('T3', ocrs.ocrs[2::4], args.directory)
    thread4 = OCRScraperWorker('T4', ocrs.ocrs[3::4], args.directory)

    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()
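
    # the workers are not join()ed; they are non-daemon threads, so the
    # interpreter waits for all four to finish before exiting anyway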

# vi:tabstop=4:expandtab:autoindent