ocrscrape.py - pull down all non-album OCRs on ocremix.org
commit 0559a51b52
@@ -0,0 +1,209 @@
"""
|
||||
ocrscrape.py --- Download all non-album OCRs from ocremix.org
|
||||
Copyright (C) 2011 Brian S. Stephan
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Example usage:
|
||||
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
|
||||
"""

import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2
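
# NOTE: 'BeautifulSoup' here is the BeautifulSoup 3 top-level module and
# urllib/urllib2 are the python2 stdlib HTTP clients, hence the python2
# interpreter in the usage example above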


class OCRListScraper:

    """
    Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them; it just provides a list of URLs to
    visit and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""

        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        self.ocrs = []
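
        # A sketch of what REMIX_ENTRY_REGEX matches, assuming the remix index
        # still links entries as '/remix/OCR<number>/' (OCR01234 is made up):
        #   re.search(self.REMIX_ENTRY_REGEX, '<a href="/remix/OCR01234/">')
        #   -> group(1) == '/remix/OCR01234/', group(2) == '01234'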

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download."""

        if offset == 0:
            sys.stdout.write('building list of OCRs, please wait...')
            sys.stdout.flush()
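
        # fetch one page of the full remix index, newest first; 'offset' is the
        # site's pagination parameter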
        try:
            params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
            ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get complete list of OCRs, reason: ' + str(e.reason))
            return

        count = 0
        for line in ocr_index:
            match = re.search(self.REMIX_ENTRY_REGEX, line)
            if match:
                self.ocrs.append(self.BASE_DOMAIN + match.group(1))
                count += 1

        # recurse, page by page, until a page turns up no new entries
        if count > 0:
            self.build_ocr_list(offset + count)

        if offset == 0:
            # dedupe and print success at the end
            len_all = len(self.ocrs)
            self.ocrs = list(set(self.ocrs))
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all - len(self.ocrs)))
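
# Usage sketch for OCRListScraper on its own (the example entry is made up;
# the real list depends on the live site):
#   scraper = OCRListScraper()
#   scraper.build_ocr_list()
#   scraper.ocrs  # e.g. ['http://ocremix.org/remix/OCR01234/', ...]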


class OCRScraperWorker(threading.Thread):

    """
    Visit OCR URLs, analyze the links, check the local filesystem, and
    optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down."""
        pass
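
    # OCR instances are plain attribute bags; _identify_ocr_data_from_url()
    # fills in num, title, checksum, urls, and filename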

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""

        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them."""

        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # scraping failed; skip rather than crash on ocr.filename below
                continue

            # check if the file exists already
            do_download = False
            filename = os.path.join(os.path.abspath(self.directory), ocr.filename)
            if os.path.isfile(filename):
                if self._file_checksum_matches(filename, ocr):
                    os.utime(filename, None)
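                    # the \033[9Xm ... \033[0m wrappers below are ANSI color
                    # escapes: green '*' = already current, yellow '!' =
                    # checksum mismatch, red 'x' = not yet downloaded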
                    sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                else:
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                    do_download = True
            else:
                # not downloaded yet
                sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
                do_download = True

            if do_download:
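                # pick one of the listed mirror URLs at random to spread load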
                urllib.urlretrieve(random.choice(ocr.urls), filename)
                if not self._file_checksum_matches(filename, ocr):
                    sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later checking and downloading."""

        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if not match:
            print('WARNING: couldn\'t recognize the URL, but will try plowing ahead blindly')
        else:
            ocr.num = match.group(1)

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(e.reason))
            return None

        soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        title_tag = soup.find('title').contents[0]
        match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag)
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None

        # undo the HTML entity escaping in the page title
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross...
        download = soup.find(id='panel-download')
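        # NOTE: the positional .contents[n] walks below are tied to the 2011
        # page markup; if the download panel layout changes, they will break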

        # get the checksum
        ocr.checksum = download.contents[1].contents[3].contents[3].contents[1]

        # get the download url(s)
        ocr.urls = []
        hrefs = download.contents[1].contents[7].findAll('a')
        for link in hrefs:
            ocr.urls.append(urllib.unquote_plus(link['href']))

        # store the desired local filename
        ocr.filename = ocr.urls[0].rsplit('/', 1)[1]

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return whether the file's MD5 checksum matches the one in the OCR listing."""

        if os.path.isfile(filename):
            # read and check md5sum
            md5 = hashlib.md5()
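            # hash in 8 KiB chunks; the '' sentinel to iter() stops at EOF,
            # which works because python2 file.read() returns a byte string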
            with open(filename, 'rb') as local_file:
                for chunk in iter(lambda: local_file.read(8192), ''):
                    md5.update(chunk)
            return md5.hexdigest() == ocr.checksum
        return False


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()

    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        sys.exit(2)

    ocrs = OCRListScraper()
    ocrs.build_ocr_list(0)

    # fork off a number of worker threads based on the list of OCRs

    # split the list into fourths
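    # the extended slices stripe the list: [0::4] takes items 0, 4, 8, ...,
    # [1::4] takes items 1, 5, 9, ..., so each worker gets roughly a quarter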
    thread1 = OCRScraperWorker('T1', ocrs.ocrs[0::4], args.directory)
    thread2 = OCRScraperWorker('T2', ocrs.ocrs[1::4], args.directory)
    thread3 = OCRScraperWorker('T3', ocrs.ocrs[2::4], args.directory)
    thread4 = OCRScraperWorker('T4', ocrs.ocrs[3::4], args.directory)

    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()
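
    # the workers are not join()ed; they are non-daemon threads, so the
    # interpreter waits for all four to finish before exiting anyway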

# vi:tabstop=4:expandtab:autoindent