ocrscrape.py --- fiddling with long lines

This commit is contained in:
Brian S. Stephan 2011-12-06 16:34:58 -06:00
parent 0559a51b52
commit 943279ef0b
1 changed file with 12 additions and 6 deletions

View File

@@ -78,7 +78,8 @@ class OCRListScraper:
len_all = len(self.ocrs) len_all = len(self.ocrs)
ocrs_set = set(self.ocrs) ocrs_set = set(self.ocrs)
self.ocrs = list(ocrs_set) self.ocrs = list(ocrs_set)
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs))) print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs),
len_all-len(self.ocrs)))
class OCRScraperWorker(threading.Thread): class OCRScraperWorker(threading.Thread):
@@ -112,22 +113,27 @@ class OCRScraperWorker(threading.Thread):
if os.path.isfile(filename): if os.path.isfile(filename):
if self._file_checksum_matches(filename, ocr): if self._file_checksum_matches(filename, ocr):
os.utime(filename, None) os.utime(filename, None)
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8'))) sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
else: else:
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8'))) sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
do_download = True do_download = True
else: else:
# download # download
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8'))) sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
ocr.title.encode('utf-8')))
do_download = True do_download = True
if do_download: if do_download:
urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename) urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename)
if not self._file_checksum_matches(filename, ocr): if not self._file_checksum_matches(filename, ocr):
sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8'))) sys.stdout.write(
'[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
def _identify_ocr_data_from_url(self, url): def _identify_ocr_data_from_url(self, url):
"""Given the input URL, turn the relevant data into an OCR object for later checking and downloading.""" """Given the input URL, turn the relevant data into an OCR object for later use. """
ocr = self.OCR() ocr = self.OCR()
ocr.num = '0' ocr.num = '0'