# scripts/ocrscrape.py

"""
ocrscrape.py --- Download all non-album OCRs from ocremix.org
Copyright (C) 2011 Brian S. Stephan
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Example usage:
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
"""
import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2
class OCRListScraper:
    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them, it just provides a list of URLs to go
    to and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""
        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        # accumulated absolute URLs of remix pages, de-duplicated at the end
        # of build_ocr_list()
        self.ocrs = []

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download.

        Walks the paginated remix index starting at `offset`, fetching
        successive pages until one contributes no new entries.  When called
        with offset 0 (the normal entry point), duplicates are removed and a
        summary is printed on completion.
        """
        start_offset = offset
        if start_offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()
        # compile once instead of per scanned line
        remix_entry = re.compile(self.REMIX_ENTRY_REGEX)
        # iterate rather than recurse so a large index can't exhaust the
        # interpreter's recursion limit
        while True:
            try:
                params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
                ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
            except urllib2.URLError as e:
                # e.reason may be a socket.error rather than a string, so
                # format it instead of concatenating
                print('ERROR: couldn\'t get complete list of OCRs, reason: {0}'.format(e.reason))
                return
            count = 0
            for line in ocr_index:
                match = remix_entry.search(line)
                if match:
                    self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                    count += 1
            if count == 0:
                # empty page means we've walked off the end of the index
                break
            offset += count
        if start_offset == 0:
            # de-duplicate and print success at the end
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(
                len(self.ocrs), len_all - len(self.ocrs)))
class OCRScraperWorker(threading.Thread):
    """
    Visit OCR URLs, analyze the links, check the local filesystem,
    and optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down.

        Attributes assigned by _identify_ocr_data_from_url():
        num, title, checksum, urls, filename.
        """
        pass

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory
        threading.Thread.__init__(self)

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""
        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # page fetch or parse failed (already reported by the
                # identify step); skip this URL rather than crash the
                # whole worker thread on ocr.filename below
                continue
            # check if the file exists already
            do_download = False
            filename = os.path.abspath(os.path.abspath(self.directory) + os.sep + ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    # touch the file so it's easy to see the check happened
                    os.utime(filename, None)
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # download
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
                                 ocr.title.encode('utf-8')))
                do_download = True
            if do_download:
                # pick one mirror at random to spread the load
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write(
                        '[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
                        .format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later use.

        Returns None if the page can't be fetched or its title can't be
        parsed; callers must check for that.
        """
        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')
        else:
            ocr.num = match.group(1)
        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + e.reason)
            return
        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return
        # BeautifulSoup 3 leaves entities escaped; undo the common ones
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
        # this is about to get gross... walk the fixed structure of the
        # download panel (positions are tied to ocremix.org's markup)
        download = soup.find(id='panel-download')
        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]
        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))
        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]
        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return if the MD5 checksum on a file matches that in the OCR listing."""
        if os.path.isfile(filename):
            # read and check md5sum in chunks so large MP3s don't need to
            # fit in memory; b'' sentinel works for binary reads on both
            # python 2 and 3
            md5 = hashlib.md5()
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), b''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()
    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs,
    # striping the list so each thread gets every NUM_THREADS-th entry
    NUM_THREADS = 4
    workers = [OCRScraperWorker('T{0:d}'.format(i + 1),
                                ocrs.ocrs[i::NUM_THREADS],
                                args.directory)
               for i in range(NUM_THREADS)]
    for worker in workers:
        worker.start()
    # wait for every worker so the script's exit is deterministic
    for worker in workers:
        worker.join()
# vi:tabstop=4:expandtab:autoindent