From 943279ef0bf68a1a20f0f5a360fdd0a209024a65 Mon Sep 17 00:00:00 2001 From: "Brian S. Stephan" Date: Tue, 6 Dec 2011 16:34:58 -0600 Subject: [PATCH] ocrscrape.py --- fiddling with long lines --- ocrscrape.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ocrscrape.py b/ocrscrape.py index b5ce62c..2f2d6ed 100644 --- a/ocrscrape.py +++ b/ocrscrape.py @@ -78,7 +78,8 @@ class OCRListScraper: len_all = len(self.ocrs) ocrs_set = set(self.ocrs) self.ocrs = list(ocrs_set) - print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs))) + print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), + len_all-len(self.ocrs))) class OCRScraperWorker(threading.Thread): @@ -112,22 +113,27 @@ class OCRScraperWorker(threading.Thread): if os.path.isfile(filename): if self._file_checksum_matches(filename, ocr): os.utime(filename, None) - sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8'))) + sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n' + .format(self.thread_id, ocr.title.encode('utf-8'))) else: - sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8'))) + sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n' + .format(self.thread_id, ocr.title.encode('utf-8'))) do_download = True else: # download - sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8'))) + sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, + ocr.title.encode('utf-8'))) do_download = True if do_download: urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename) if not self._file_checksum_matches(filename, ocr): - sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8'))) + sys.stdout.write( + '[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n' + .format(self.thread_id, ocr.title.encode('utf-8'))) def _identify_ocr_data_from_url(self, url): - """Given the input URL, turn the relevant data into an OCR object for later checking and downloading.""" + """Given the input URL, turn the relevant data into an OCR object for later use. """ ocr = self.OCR() ocr.num = '0'