ocrscrape.py --- fiddling with long lines

This commit is contained in:
Brian S. Stephan 2011-12-06 16:34:58 -06:00
parent 0559a51b52
commit 943279ef0b
1 changed file with 12 additions and 6 deletions

View File

@@ -78,7 +78,8 @@ class OCRListScraper:
len_all = len(self.ocrs)
ocrs_set = set(self.ocrs)
self.ocrs = list(ocrs_set)
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs)))
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs),
len_all-len(self.ocrs)))
class OCRScraperWorker(threading.Thread):
@@ -112,22 +113,27 @@ class OCRScraperWorker(threading.Thread):
if os.path.isfile(filename):
if self._file_checksum_matches(filename, ocr):
os.utime(filename, None)
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
else:
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
do_download = True
else:
# download
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
ocr.title.encode('utf-8')))
do_download = True
if do_download:
urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename)
if not self._file_checksum_matches(filename, ocr):
sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))
sys.stdout.write(
'[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
.format(self.thread_id, ocr.title.encode('utf-8')))
def _identify_ocr_data_from_url(self, url):
"""Given the input URL, turn the relevant data into an OCR object for later checking and downloading."""
"""Given the input URL, turn the relevant data into an OCR object for later use. """
ocr = self.OCR()
ocr.num = '0'