# scripts/ocrscrape.py

"""
ocrscrape.py --- Download all non-album OCRs from ocremix.org
Copyright (C) 2011 Brian S. Stephan
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Example usage:
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
"""
import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2
class OCRListScraper:
    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them, it just provides a list of URLs to go
    to and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""
        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        # accumulated absolute URLs of remix pages, de-duplicated at the end
        # of build_ocr_list()
        self.ocrs = []

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download.

        Walks the paginated remix index starting at `offset`, fetching
        successive pages until one contributes no new entries.  When called
        with offset 0 (the normal entry point), duplicates are removed and a
        summary is printed on completion.
        """
        start_offset = offset
        if start_offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()
        # compile once instead of per scanned line
        remix_entry = re.compile(self.REMIX_ENTRY_REGEX)
        # iterate rather than recurse so a large index can't exhaust the
        # interpreter's recursion limit
        while True:
            try:
                params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
                ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
            except urllib2.URLError as e:
                # e.reason may be a socket.error rather than a string, so
                # format it instead of concatenating
                print('ERROR: couldn\'t get complete list of OCRs, reason: {0}'.format(e.reason))
                return
            count = 0
            for line in ocr_index:
                match = remix_entry.search(line)
                if match:
                    self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                    count += 1
            if count == 0:
                # empty page means we've walked off the end of the index
                break
            offset += count
        if start_offset == 0:
            # de-duplicate and print success at the end
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(
                len(self.ocrs), len_all - len(self.ocrs)))
class OCRScraperWorker(threading.Thread):
    """
    Visit OCR URLs, analyze the links, check the local filesystem,
    and optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down.

        Attributes assigned by _identify_ocr_data_from_url():
        num, title, checksum, urls, filename.
        """
        pass

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory
        threading.Thread.__init__(self)

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""
        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # page fetch or parse failed (already reported by the
                # identify step); skip this URL rather than crash the
                # whole worker thread on ocr.filename below
                continue
            # check if the file exists already
            do_download = False
            filename = os.path.abspath(os.path.abspath(self.directory) + os.sep + ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    # touch the file so it's easy to see the check happened
                    os.utime(filename, None)
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # download
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
                                 ocr.title.encode('utf-8')))
                do_download = True
            if do_download:
                # pick one mirror at random to spread the load
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write(
                        '[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
                        .format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later use.

        Returns None if the page can't be fetched or its title can't be
        parsed; callers must check for that.
        """
        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')
        else:
            ocr.num = match.group(1)
        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + e.reason)
            return
        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return
        # BeautifulSoup 3 leaves entities escaped; undo the common ones
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
        # this is about to get gross... walk the fixed structure of the
        # download panel (positions are tied to ocremix.org's markup)
        download = soup.find(id='panel-download')
        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]
        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))
        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]
        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return if the MD5 checksum on a file matches that in the OCR listing."""
        if os.path.isfile(filename):
            # read and check md5sum in chunks so large MP3s don't need to
            # fit in memory; b'' sentinel works for binary reads on both
            # python 2 and 3
            md5 = hashlib.md5()
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), b''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()
    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs,
    # striping the list so each thread gets every NUM_THREADS-th entry
    NUM_THREADS = 4
    workers = [OCRScraperWorker('T{0:d}'.format(i + 1),
                                ocrs.ocrs[i::NUM_THREADS],
                                args.directory)
               for i in range(NUM_THREADS)]
    for worker in workers:
        worker.start()
    # wait for every worker so the script's exit is deterministic
    for worker in workers:
        worker.join()
# vi:tabstop=4:expandtab:autoindent