Last active
September 10, 2024 18:27
-
-
Save aboucaud/1208dbbcb75093834d8bdce15a7fd0fd to your computer and use it in GitHub Desktop.
Download images from urls given in a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # !/usr/bin/python | |
| # -*- coding: utf-8 -*- | |
| # ----------------------------------------------------------------------------------------------- | |
| # Original script from @anokas: https://www.kaggle.com/anokas/py3-image-downloader-w-progress-bar | |
| # ----------------------------------------------------------------------------------------------- | |
| # Note: requires the tqdm package (pip install tqdm) | |
| # Note to Kagglers: This script will not run directly in Kaggle kernels. You | |
| # need to download it and run it on your local machine. | |
| # Downloads images from the Google Landmarks dataset using multiple worker | |
| # processes. Images that already exist will not be downloaded again, so the | |
| # script can resume a partially completed download. All images will be saved | |
| # in the JPEG format with the quality setting at 90. | |
| # Thanks to @maxwell: https://www.kaggle.com/maxwell110/python3-version-image-downloader | |
| import os | |
| import sys | |
| import csv | |
| import tqdm | |
| import multiprocessing | |
| from urllib import error | |
| from urllib import request | |
| from PIL import Image | |
| from io import BytesIO | |
def parse_data(data_file):
    """Read the (key, url) pairs listed in a CSV file.

    The first row is treated as a header and discarded. For every other row
    only the first two columns are kept.

    Args:
        data_file: Path of the CSV file to read.

    Returns:
        A list with one entry per data row; each entry is a list holding at
        most the first two fields of that row.
    """
    # The with-block guarantees the handle is closed (the previous bare
    # open() leaked it), and newline='' lets the csv module do its own
    # line-ending handling, as its documentation requires.
    with open(data_file, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # Chop off header (no-op on an empty file)
        return [line[:2] for line in csvreader]
def download_image(key_url, out_dir=None, timeout=30):
    """Download one image and store it as ``<out_dir>/<key>.jpg``.

    The image is fetched from its URL, decoded with PIL, converted to RGB
    and written as a JPEG with quality 90. If the target file already
    exists nothing is downloaded.

    Args:
        key_url: A ``(key, url)`` pair; ``key`` names the output file and
            ``url`` is where the image is fetched from.
        out_dir: Directory to save into. Defaults to ``sys.argv[2]`` so the
            function can still be called with a single argument from a
            process pool.
        timeout: Seconds to wait on the network before giving up, so that a
            stalled server cannot block a worker forever.

    Returns:
        0 if the image already existed or was saved successfully, 1 if any
        step (download, parse, convert, save) failed. A warning is printed
        for each failure.
    """
    if out_dir is None:
        out_dir = sys.argv[2]
    (key, url) = key_url
    filename = os.path.join(out_dir, '{}.jpg'.format(key))

    if os.path.exists(filename):
        print('Image {} already exists. Skipping download.'.format(filename))
        return 0

    # `except Exception` (rather than a bare `except`) keeps the best-effort
    # behaviour while letting KeyboardInterrupt / SystemExit propagate.
    try:
        # The with-block closes the HTTP response even if read() fails.
        with request.urlopen(url, timeout=timeout) as response:
            image_data = response.read()
    except Exception:
        print('Warning: Could not download image {} from {}'.format(key, url))
        return 1

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image {}'.format(key))
        return 1

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image {} to RGB'.format(key))
        return 1

    # Write to a per-process temporary name and rename it into place only
    # once the save has completed. Otherwise an interrupted or failed save
    # would leave a truncated .jpg that the "already exists" check above
    # would wrongly treat as a finished download on the next run.
    partial = '{}.{}.part'.format(filename, os.getpid())
    try:
        pil_image_rgb.save(partial, format='JPEG', quality=90)
        os.replace(partial, filename)
    except Exception:
        print('Warning: Failed to save image {}'.format(filename))
        try:
            os.remove(partial)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return 1

    return 0
def loader():
    """Command-line entry point: download every image listed in a CSV file.

    Expects exactly two command-line arguments, the CSV data file and the
    output directory. The output directory is created if missing, the CSV
    is parsed with ``parse_data`` and each entry is handed to
    ``download_image`` through a pool of worker processes, with a tqdm
    progress bar. The total number of failed downloads is printed at the
    end.

    Raises:
        SystemExit: With status 2 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('Syntax: {} <data_file.csv> <output_dir/>'.format(sys.argv[0]))
        # A usage error must not report success to the calling shell.
        sys.exit(2)
    (data_file, out_dir) = sys.argv[1:]

    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the race between checking for the directory and creating it.
    os.makedirs(out_dir, exist_ok=True)

    key_url_list = parse_data(data_file)

    # 20 is the number of worker processes, not the number of CPUs.
    # The context manager tears the pool down even if iterating the
    # results raises.
    with multiprocessing.Pool(processes=20) as pool:
        results = pool.imap_unordered(download_image, key_url_list)
        failures = sum(tqdm.tqdm(results, total=len(key_url_list)))
    print('Total number of download failures:', failures)
# Command-line arguments:
#   arg1 : data_file.csv
#   arg2 : output_dir
if __name__ == "__main__":
    # Run the downloader only when executed as a script, not on import
    # (worker processes may re-import this module).
    loader()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @xdanielc, the URL shows that the images are accessed through a database, not an http server.
That's why it won't work.