ocrscrape.py - pull down all non-album OCRs on ocremix.org

This commit is contained in:
Brian S. Stephan 2011-12-06 16:18:10 -06:00
commit 0559a51b52
1 changed files with 209 additions and 0 deletions

ocrscrape.py Normal file
View File

@ -0,0 +1,209 @@
ocrscrape.py --- Download all non-album OCRs from ocremix.org
Copyright (C) 2011 Brian S. Stephan
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Example usage:
% python2 ocrscrape.py --directory ~/Music/ocremix.org/OCRs
import argparse
import BeautifulSoup
import hashlib
import os
import random
import re
import sys
import threading
import time
import urllib
import urllib2
class OCRListScraper:

    """Pull down the URLs for all non-album OCRemixes.

    This doesn't actually download them, it just provides a list of URLs to go
    to and scrape.
    """

    def __init__(self):
        """Set up some constants and variables."""
        self.BASE_DOMAIN = 'http://ocremix.org'
        self.REMIX_LIST = '/remixes/?%s'
        self.REMIX_ENTRY_REGEX = '(/remix/OCR([0-9]+)/)'
        self.ocrs = []

    def build_ocr_list(self, offset=0):
        """Populate the list of OCRs to download.

        Requests the remix listing page by page, starting at ``offset``, and
        appends the absolute URL of every remix link found to ``self.ocrs``.
        Paging stops at the first page that yields no remix links, or at the
        first request that fails (the error is printed and whatever was
        gathered so far is kept).

        When called with the default ``offset`` of 0, progress is printed and
        ``self.ocrs`` is de-duplicated once all pages have been read.
        """
        announce = offset == 0
        if announce:
            sys.stdout.write('building list of OCRs, please wait...')
            # no newline was written, so flush or the message sits in the buffer
            sys.stdout.flush()

        while True:
            params = urllib.urlencode({'offset': offset, 'sort': 'datedesc'})
            try:
                ocr_index = urllib2.urlopen(self.BASE_DOMAIN + self.REMIX_LIST % params)
            except urllib2.URLError as e:
                # e.reason is not guaranteed to be a string, so convert it
                # before concatenating
                print('ERROR: couldn\'t get complete list of OCRs, reason: ' + str(getattr(e, 'reason', e)))
                break

            try:
                found = self._extract_ocr_urls(ocr_index)
            finally:
                ocr_index.close()

            if not found:
                # a page without any remix links means we ran off the end
                break

            self.ocrs.extend(found)
            # NOTE(review): the listing's page size is not known here; stepping
            # by the number of distinct remixes on the page assumes every remix
            # link on a page belongs to the listing itself -- confirm against
            # the site's paging.
            offset += len(set(found))

        if announce:
            # print success at the end
            len_all = len(self.ocrs)
            self.ocrs = self._unique_in_order(self.ocrs)
            print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs)))

    def _extract_ocr_urls(self, lines):
        """Return the absolute remix URL for each line that contains a remix link.

        Only the first link on a line is used; the same URL can be returned
        more than once.
        """
        pattern = re.compile(self.REMIX_ENTRY_REGEX)
        urls = []
        for line in lines:
            match = pattern.search(line)
            if match:
                urls.append(self.BASE_DOMAIN + match.group(1))
        return urls

    def _unique_in_order(self, urls):
        """Return ``urls`` without duplicates, keeping first-seen order."""
        seen = set()
        unique = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique.append(url)
        return unique
class OCRScraperWorker(threading.Thread):

    """Visit OCR URLs, analyze the links, check the local filesystem,
    and optionally download the MP3.

    Will check/download all given URLs when the thread is started.
    """

    class OCR():
        """Track the details of an OCR to pull down."""

    def __init__(self, thread_id, ocrs, directory):
        """Configure the worker thread's list of OCR URLs and target directory."""
        # Thread.start() refuses to run unless the base initializer was called
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.ocrs = ocrs
        self.directory = directory

    def run(self):
        """Given a list of OCRs, identify their data and potentially fetch them.

        A file that already exists with a matching checksum only has its
        timestamps refreshed; a missing or mismatching file is downloaded
        from one of the listed mirrors, picked at random, and then verified.
        Failures on one OCR are reported and the worker moves on to the next.
        """
        for ocr_str in self.ocrs:
            ocr = self._identify_ocr_data_from_url(ocr_str)
            if ocr is None:
                # the page could not be fetched or parsed; already reported
                continue

            title = ocr.title.encode('utf-8')
            filename = os.path.abspath(os.path.join(self.directory, ocr.filename))

            # check if the file exists already
            if not os.path.isfile(filename):
                self._status('\033[91mx\033[0m', 'downloading {0:s}'.format(title))
                do_download = True
            elif self._file_checksum_matches(filename, ocr):
                os.utime(filename, None)
                self._status('\033[92m*\033[0m', '{0:s} already downloaded'.format(title))
                do_download = False
            else:
                self._status('\033[93m!\033[0m', 'downloading {0:s} (checksum failure)'.format(title))
                do_download = True

            if do_download:
                try:
                    urllib.urlretrieve(random.choice(ocr.urls), filename)
                except IOError as e:
                    # one bad mirror or a full disk must not kill the thread
                    self._status('\033[91mx\033[0m', 'download of {0:s} failed: {1:s}'.format(title, str(e)))
                    continue
                if not self._file_checksum_matches(filename, ocr):
                    self._status('\033[93m!\033[0m', '{0:s} still mismatches, probably OCR listing problem'.format(title))

    def _status(self, marker, message):
        """Write one status line tagged with this worker's thread id."""
        sys.stdout.write('[{0:s}] {1:s} {2:s}\n'.format(self.thread_id, marker, message))

    def _identify_ocr_data_from_url(self, url):
        """Given the input URL, turn the relevant data into an OCR object for later checking and downloading.

        Returns an OCR with ``num``, ``title``, ``checksum``, ``urls`` and
        ``filename`` set, or None (after printing an error) when the page
        cannot be fetched or does not have the expected layout.
        """
        ocr = self.OCR()
        ocr.num = '0'
        match = re.search('/(OCR([0-9]+))/$', url)
        if match:
            ocr.num = match.group(1)
        else:
            print('WARNING: i couldn\'t recognize the url, but i\'ll try plowing ahead blindly')

        try:
            ocr_data = urllib2.urlopen(url)
        except urllib2.URLError as e:
            # e.reason is not guaranteed to be a string, so convert it first
            print('ERROR: couldn\'t get data for ' + url + ', reason: ' + str(getattr(e, 'reason', e)))
            return None
        try:
            soup = BeautifulSoup.BeautifulSoup(''.join(ocr_data))
        finally:
            ocr_data.close()

        title_tag = soup.find('title')
        match = None
        if title_tag is not None and title_tag.contents:
            match = re.search('^ReMix: (.*) - OverClocked ReMix$', title_tag.contents[0])
        if not match:
            print('ERROR: couldn\'t find the OCR name from the title')
            return None
        # '&amp;' is replaced last so '&amp;lt;' does not turn into '<'
        ocr.title = match.group(1).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

        # this is about to get gross: the download panel is walked by child
        # position, so any change in the page layout ends up in the except
        try:
            panel = soup.find(id='panel-download').contents[1]
            # get the checksum
            ocr.checksum = panel.contents[3].contents[3].contents[1]
            # get the download url(s)
            ocr.urls = [link['href'] for link in panel.contents[7].findAll('a')]
            # store the desired local filename
            ocr.filename = ocr.urls[0].rsplit('/', 1)[1]
        except (AttributeError, IndexError, KeyError, TypeError):
            print('ERROR: couldn\'t find the download details for ' + url)
            return None

        return ocr

    def _file_checksum_matches(self, filename, ocr):
        """Return if the MD5 checksum on a file matches that in the OCR listing.

        Returns False when ``filename`` is not an existing regular file.
        """
        if not os.path.isfile(filename):
            return False

        # read and check md5sum
        md5 = hashlib.md5()
        with open(filename, 'rb') as local_file:
            # the file is opened in binary mode, so end-of-file is b''
            for chunk in iter(lambda: local_file.read(8192), b''):
                md5.update(chunk)
        return md5.hexdigest() == ocr.checksum
if __name__ == '__main__':
    # Entry point: build the list of remix pages, then download them with a
    # small pool of worker threads.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory')
    args = parser.parse_args()

    if args.directory is None:
        print('script needs -d/--directory [DIRECTORY] to use for OCRs')
        # nothing useful can be done without a target directory
        sys.exit(1)

    ocrs = OCRListScraper()
    # the scraper starts out empty; the list has to be built before splitting
    ocrs.build_ocr_list()

    # fork off a number of worker threads based on the list of OCRs
    # split the list into fourths: worker N takes every fourth URL, starting
    # at index N
    worker_count = 4
    workers = [
        OCRScraperWorker('T{0:d}'.format(index + 1), ocrs.ocrs[index::worker_count], args.directory)
        for index in range(worker_count)
    ]
    for worker in workers:
        worker.start()
# vi:tabstop=4:expandtab:autoindent