Last active
September 10, 2024 18:27
-
-
Save aboucaud/1208dbbcb75093834d8bdce15a7fd0fd to your computer and use it in GitHub Desktop.
Download images from urls given in a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: utf-8 -*- | |
| # ----------------------------------------------------------------------------------------------- | |
| # Original script from @anokas: https://www.kaggle.com/anokas/py3-image-downloader-w-progress-bar | |
| # ----------------------------------------------------------------------------------------------- | |
| # Note: requires the tqdm package (pip install tqdm) | |
| # Note to Kagglers: This script will not run directly in Kaggle kernels. You | |
| # need to download it and run it on your local machine. | |
| # Downloads images from the Google Landmarks dataset using multiple worker processes. | |
| # Images that already exist will not be downloaded again, so the script can | |
| # resume a partially completed download. All images will be saved in the JPG | |
| # format at quality level 90. | |
| # Thanks to @maxwell: https://www.kaggle.com/maxwell110/python3-version-image-downloader | |
| import os | |
| import sys | |
| import csv | |
| import tqdm | |
| import multiprocessing | |
| from urllib import error | |
| from urllib import request | |
| from PIL import Image | |
| from io import BytesIO | |
def parse_data(data_file):
    """Read the (key, url) pairs listed in a CSV file.

    The first row is treated as a header and discarded. Only the first two
    columns of every remaining row are kept. Rows with fewer than two
    columns (for example blank lines) are skipped, because they cannot be
    unpacked into a key and a URL by the download step.

    Args:
        data_file: Path to the CSV file to read.

    Returns:
        A list of two-element lists ``[key, url]``, in file order.
    """
    # The context manager closes the file even if parsing raises.
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(data_file, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # Chop off header (no-op on an empty file)
        return [row[:2] for row in csvreader if len(row) >= 2]
def download_image(key_url, timeout=30):
    """Download one image and store it as ``<key>.jpg`` in the output directory.

    The output directory is read from ``sys.argv[2]``. If the target file
    already exists the download is skipped, which is what allows an
    interrupted run to be resumed.

    Args:
        key_url: A ``(key, url)`` pair; ``key`` names the output file and
            ``url`` is the address to fetch.
        timeout: Seconds to wait on the network before giving up, so a
            stalled connection cannot block a worker forever.

    Returns:
        0 if the image already existed or was saved successfully,
        1 if downloading, parsing, converting or saving failed.
    """
    out_dir = sys.argv[2]
    (key, url) = key_url
    filename = os.path.join(out_dir, '{}.jpg'.format(key))

    if os.path.exists(filename):
        print('Image {} already exists. Skipping download.'.format(filename))
        return 0

    # `except Exception` (rather than a bare `except`) keeps the best-effort
    # behaviour for ordinary failures while letting KeyboardInterrupt and
    # SystemExit propagate so the run can actually be stopped.
    try:
        with request.urlopen(url, timeout=timeout) as response:
            image_data = response.read()
    except Exception:
        print('Warning: Could not download image {} from {}'.format(key, url))
        return 1

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image {}'.format(key))
        return 1

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image {} to RGB'.format(key))
        return 1

    # Write to a per-process temporary name and rename afterwards: a failed
    # or interrupted save must never leave a truncated <key>.jpg behind,
    # because the existence check above would then skip it on every rerun.
    tmp_filename = '{}.{}.part'.format(filename, os.getpid())
    try:
        pil_image_rgb.save(tmp_filename, format='JPEG', quality=90)
        os.replace(tmp_filename, filename)
    except Exception:
        print('Warning: Failed to save image {}'.format(filename))
        try:
            os.remove(tmp_filename)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return 1

    return 0
def loader(num_processes=20):
    """Command-line entry point: download every image listed in a CSV file.

    Expects exactly two command-line arguments, the CSV data file and the
    output directory. The output directory is created if it is missing.
    The total number of failed downloads is printed at the end.

    Args:
        num_processes: Number of worker processes in the download pool.

    Raises:
        SystemExit: With status 1 when the command-line arguments are wrong.
    """
    if len(sys.argv) != 3:
        print('Syntax: {} <data_file.csv> <output_dir/>'.format(sys.argv[0]))
        # A usage error is a failure; report it with a non-zero status.
        sys.exit(1)
    (data_file, out_dir) = sys.argv[1:]

    # makedirs handles nested paths and has no check-then-create race.
    os.makedirs(out_dir, exist_ok=True)

    key_url_list = parse_data(data_file)

    # The context manager tears the pool down even if iteration raises
    # (for example on Ctrl-C), so worker processes are never left behind.
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.imap_unordered(download_image, key_url_list)
        # Each worker returns 0 on success and 1 on failure, so the sum is
        # the failure count; tqdm only adds the progress bar.
        failures = sum(tqdm.tqdm(results, total=len(key_url_list)))
    print('Total number of download failures:', failures)
# Command-line usage:
#   arg1 : data_file.csv  (CSV file; the first two columns of each row are read)
#   arg2 : output_dir     (directory where the .jpg files are written)
if __name__ == '__main__':
    loader()
Author
Thanks, the tqdm import was missing. Gist updated
I tried using it with a municipal historic archive website that I'm scraping because it is too slow to use directly. The URLs look like this, and the script doesn't work with them:
http://www10.ava.es/amv/knobj?bd=9FBA3EC3CB10&t=0&obj=TUR_01620.jpg
Author
Hi @xdanielc, the URL shows that the images are accessed through a database, not an http server.
That's why it won't work.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
root@mail:/importcsv# pip3 install tqdm
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.57.0)
root@mail:/importcsv# python3 py3k_image_downloader.py test.csv /importcsv/
Traceback (most recent call last):
File "py3k_image_downloader.py", line 94, in <module>
loader()
File "py3k_image_downloader.py", line 85, in loader
failures = sum(tqdm.tqdm(pool.imap_unordered(download_image, key_url_list), total=len(key_url_list)))
NameError: name 'tqdm' is not defined
It doesn't work?