ocrscrape.py --- fiddling with long lines
This commit is contained in:
parent
0559a51b52
commit
943279ef0b
18
ocrscrape.py
18
ocrscrape.py
|
@ -78,7 +78,8 @@ class OCRListScraper:
|
||||||
len_all = len(self.ocrs)
|
len_all = len(self.ocrs)
|
||||||
ocrs_set = set(self.ocrs)
|
ocrs_set = set(self.ocrs)
|
||||||
self.ocrs = list(ocrs_set)
|
self.ocrs = list(ocrs_set)
|
||||||
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs), len_all-len(self.ocrs)))
|
print(' done! ({0:d} found ({1:d} duplicates))\n'.format(len(self.ocrs),
|
||||||
|
len_all-len(self.ocrs)))
|
||||||
|
|
||||||
class OCRScraperWorker(threading.Thread):
|
class OCRScraperWorker(threading.Thread):
|
||||||
|
|
||||||
|
@ -112,22 +113,27 @@ class OCRScraperWorker(threading.Thread):
|
||||||
if os.path.isfile(filename):
|
if os.path.isfile(filename):
|
||||||
if self._file_checksum_matches(filename, ocr):
|
if self._file_checksum_matches(filename, ocr):
|
||||||
os.utime(filename, None)
|
os.utime(filename, None)
|
||||||
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
sys.stdout.write('[{0:s}] \033[92m*\033[0m {1:s} already downloaded\n'
|
||||||
|
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||||
else:
|
else:
|
||||||
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
sys.stdout.write('[{0:s}] \033[93m!\033[0m downloading {1:s} (checksum failure)\n'
|
||||||
|
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||||
do_download = True
|
do_download = True
|
||||||
else:
|
else:
|
||||||
# download
|
# download
|
||||||
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
sys.stdout.write('[{0:s}] \033[91mx\033[0m downloading {1:s}\n'.format(self.thread_id,
|
||||||
|
ocr.title.encode('utf-8')))
|
||||||
do_download = True
|
do_download = True
|
||||||
|
|
||||||
if do_download:
|
if do_download:
|
||||||
urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename)
|
urllib.urlretrieve(ocr.urls[random.randint(0, len(ocr.urls)-1)], filename)
|
||||||
if not self._file_checksum_matches(filename, ocr):
|
if not self._file_checksum_matches(filename, ocr):
|
||||||
sys.stdout.write('[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'.format(self.thread_id, ocr.title.encode('utf-8')))
|
sys.stdout.write(
|
||||||
|
'[{0:s}] \033[93m!\033[0m {1:s} still mismatches, probably OCR listing problem\n'
|
||||||
|
.format(self.thread_id, ocr.title.encode('utf-8')))
|
||||||
|
|
||||||
def _identify_ocr_data_from_url(self, url):
|
def _identify_ocr_data_from_url(self, url):
|
||||||
"""Given the input URL, turn the relevant data into an OCR object for later checking and downloading."""
|
"""Given the input URL, turn the relevant data into an OCR object for later use. """
|
||||||
|
|
||||||
ocr = self.OCR()
|
ocr = self.OCR()
|
||||||
ocr.num = '0'
|
ocr.num = '0'
|
||||||
|
|
Loading…
Reference in New Issue