"""
|
|
ocrscrape.py --- Download all non-album OCRs from ocremix.org
|
|
Copyright (C) 2011 Brian S. Stephan
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
Example usage:
|
|
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
|
|
"""
|
|
|
|
import argparse
|
|
import BeautifulSoup
|
|
import hashlib
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import threading
|
|
import time
|
|
import urllib
|
|
import urllib2
class OCRListScraper:

    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them, it just provides a list of URLs to go to
    and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""

        # site root and the paged remix index that is walked for detail links
        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        # matches e.g. /remix/OCR01234/, capturing the path and the number
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        # accumulated absolute URLs of remix detail pages
        self.ocrs = []

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download.

        Walks the paginated remix index starting at the given offset and
        follows pages until one yields no entries, then de-duplicates the
        collected URLs (only when started from offset 0, as before).
        """

        if offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()

        # hoist the compile out of the per-line loop
        entry_re = re.compile(self.REMIX_ENTRY_REGEX)
        initial_offset = offset

        # iterate instead of recursing per page; a large index would
        # otherwise needlessly risk the recursion limit
        while True:
            try:
                params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
                ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
            except urllib2.URLError as e:
                # e.reason may be a socket.error object rather than a string,
                # so don't concatenate it directly
                print('ERROR: couldn\'t get complete list of OCRs, reason: {0!s}'.format(e.reason))
                if offset == initial_offset == 0:
                    # failed on the very first page: nothing to finalize
                    return
                break

            count = 0
            for line in ocr_index:
                match = entry_re.search(line)
                if match and match.group(0):
                    self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                    count += 1

            # an empty page means we've walked off the end of the index
            if count == 0:
                break
            offset += count

        if initial_offset == 0:
            # de-duplicate and print success at the end
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs),
                                                                     len_all - len(self.ocrs)))
class OCRScraperWorker(threading.Thread):

    """
    Visit OCR URLs, analyze the links, check the local filesystem,
    and optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down."""
        # attributes set by _identify_ocr_data_from_url:
        #   num      -- 'OCRnnnnn' identifier (or '0' if unrecognized)
        #   title    -- human-readable remix title
        #   checksum -- MD5 hex digest from the listing
        #   urls     -- mirror download URLs
        #   filename -- local filename derived from the first URL
        pass

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory.

        thread_id -- short label used to tag this worker's console output
        ocrs      -- list of remix detail-page URLs to process
        directory -- local directory to store downloaded files in
        """

        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory
        threading.Thread.__init__(self)

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""

        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # scrape failed (network or parse error, already reported);
                # skip rather than crash on the attribute accesses below
                continue

            # check if the file exists already
            do_download = False
            filename = os.path.abspath(os.path.abspath(self.directory) + os.sep + ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    # freshen the mtime so repeated runs mark files as seen
                    os.utime(filename, None)
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
                                     .format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # download
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
                                 ocr.title.encode('utf-8')))
                do_download = True

            if do_download:
                # pick one of the mirrors at random to spread the load
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write(
                        '[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
                        .format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later use.

        Returns the populated OCR object, or None when the page can't be
        fetched or its title can't be parsed.
        """

        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')
        else:
            ocr.num = match.group(1)

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            # e.reason may be a socket.error object rather than a string,
            # so don't concatenate it directly
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(e.reason))
            return None

        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None

        # BeautifulSoup leaves HTML entities in the title text; decode the
        # handful that can appear (the original no-op replaces dropped the
        # entity names)
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross...
        download = soup.find(id='panel-download')

        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]

        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))

        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return if the MD5 checksum on a file matches that in the OCR listing."""

        if os.path.isfile(filename):
            # read and check md5sum in bounded chunks
            md5 = hashlib.md5()
            with open(filename, 'rb') as local_file:
                # sentinel must be b'': the file is opened in binary mode,
                # and '' never matches a binary read under Python 3
                # (b'' == '' under Python 2, so behavior there is unchanged)
                for chunk in iter(lambda: local_file.read(8192), b''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()

    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs;
    # worker i gets every NUM_THREADS-th entry starting at i (same
    # striping as the original hand-unrolled fourths)
    NUM_THREADS = 4
    workers = [OCRScraperWorker('T{0:d}'.format(i + 1), ocrs.ocrs[i::NUM_THREADS], args.directory)
               for i in range(NUM_THREADS)]

    for worker in workers:
        worker.start()

    # wait for all workers so the script doesn't appear to finish while
    # downloads are still in flight
    for worker in workers:
        worker.join()
# vi:tabstop=4:expandtab:autoindent