ocrscrape.py --- fiddling with long lines
This commit is contained in:
parent
0559a51b52
commit
943279ef0b
18
ocrscrape.py
18
ocrscrape.py
|
@@ -78,7 +78,8 @@ class OCRListScraper:
|
|||
len_all = len(self.ocrs)
|
||||
ocrs_set = set(self.ocrs)
|
||||
self.ocrs = list(ocrs_set)
|
||||
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs)))
|
||||
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs),
|
||||
len_all-len(self.ocrs)))
|
||||
|
||||
class OCRScraperWorker(threading.Thread):
|
||||
|
||||
|
@@ -112,22 +113,27 @@ class OCRScraperWorker(threading.Thread):
|
|||
if os.path.isfile(filename):
|
||||
if self._file_checksum_matches(filename, ocr):
|
||||
os.utime(filename, None)
|
||||
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
|
||||
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
else:
|
||||
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
|
||||
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
do_download = True
|
||||
else:
|
||||
# download
|
||||
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
|
||||
ocr.title.encode('utf-8')))
|
||||
do_download = True
|
||||
|
||||
if do_download:
|
||||
urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename)
|
||||
if not self._file_checksum_matches(filename, ocr):
|
||||
sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
sys.stdout.write(
|
||||
'[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
|
||||
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||
|
||||
def _identify_ocr_data_from_url(self, url):
|
||||
"""Given the input URL, turn the relevant data into an OCR object for later checking and downloading."""
|
||||
"""Given the input URL, turn the relevant data into an OCR object for later use. """
|
||||
|
||||
ocr = self.OCR()
|
||||
ocr.num = '0'
|
||||
|
|
Loading…
Reference in New Issue