-
-
Save bertcarremans/679624f369ed9270472e37f8333244f5 to your computer and use it in GitHub Desktop.
| # Copyright 2014-2017 Bert Carremans | |
| # Author: Bert Carremans <bertcarremans.be> | |
| # | |
| # License: BSD 3 clause | |
| import os | |
| import random | |
| from shutil import copyfile | |
def img_train_test_split(img_source_dir, train_size):
    """
    Randomly splits images over a train and validation folder, while preserving the folder structure

    Every immediate subdirectory of ``img_source_dir`` is mirrored under
    ``data/train`` and ``data/validation`` (relative to the current working
    directory). Files ending in ``.jpg`` or ``.png`` are copied, not moved,
    and renamed to a running counter (``0.jpg``, ``1.jpg``, ...) that starts
    at zero in each destination folder.

    Parameters
    ----------
    img_source_dir : string
        Path to the folder with the images to be split. Can be absolute or relative path

    train_size : float
        Proportion of the original images that need to be copied in the subdirectory in the train folder

    Raises
    ------
    AttributeError
        If ``img_source_dir`` is not a string or ``train_size`` is not a float.
    OSError
        If ``img_source_dir`` does not exist.
    """
    if not isinstance(img_source_dir, str):
        raise AttributeError('img_source_dir must be a string')

    if not os.path.exists(img_source_dir):
        raise OSError('img_source_dir does not exist')

    if not isinstance(train_size, float):
        raise AttributeError('train_size must be a float')

    # Set up the output folder structure if it does not exist yet.
    # os.makedirs also creates the parent 'data' folder when needed.
    for output_root in ('data/train', 'data/validation'):
        if not os.path.exists(output_root):
            os.makedirs(output_root)

    # Get the subdirectories in the main image folder
    subdirs = [subdir for subdir in os.listdir(img_source_dir)
               if os.path.isdir(os.path.join(img_source_dir, subdir))]

    for subdir in subdirs:
        subdir_fullpath = os.path.join(img_source_dir, subdir)
        entries = os.listdir(subdir_fullpath)
        if not entries:
            print(subdir_fullpath + ' is empty')
            # Skip only this folder; the remaining folders must still be split.
            continue

        train_subdir = os.path.join('data/train', subdir)
        validation_subdir = os.path.join('data/validation', subdir)

        # Create subdirectories in train and validation folders
        if not os.path.exists(train_subdir):
            os.makedirs(train_subdir)

        if not os.path.exists(validation_subdir):
            os.makedirs(validation_subdir)

        # Sort before shuffling so that seeding the `random` module gives a
        # reproducible split regardless of the order os.listdir returns.
        images = sorted(name for name in entries
                        if name.endswith(('.jpg', '.png')))
        random.shuffle(images)

        # Cut the shuffled list at an exact position instead of flipping a
        # coin per file, so the train share matches train_size (rounded to
        # the nearest whole image) rather than merely approximating it.
        n_train = int(round(train_size * len(images)))
        n_train = max(0, min(len(images), n_train))

        train_counter = 0
        validation_counter = 0

        for position, filename in enumerate(images):
            # splitext keeps the real extension even when the name contains
            # several dots (e.g. 'cat.0.jpg' -> '.jpg').
            extension = os.path.splitext(filename)[1]
            source_path = os.path.join(subdir_fullpath, filename)
            if position < n_train:
                copyfile(source_path,
                         os.path.join(train_subdir, str(train_counter) + extension))
                train_counter += 1
            else:
                copyfile(source_path,
                         os.path.join(validation_subdir, str(validation_counter) + extension))
                validation_counter += 1

        print('Copied ' + str(train_counter) + ' images to data/train/' + subdir)
        print('Copied ' + str(validation_counter) + ' images to data/validation/' + subdir)
Very nice work, but I can see a discrepancy in the result. I have two folders, cats and dogs, each containing 12,500 images. When I run this program with a train size of 0.8, it shows the following result:
Copied 10005 images to data/train/dogs
Copied 2495 images to data/validation/dogs
Copied 9955 images to data/train/cats
Copied 2545 images to data/validation/cats
The dogs and cats train folders should each contain 10,000 images, and the validation folders 2,500 images each.
Please check
Thanks
Much better solution
!pip install split_folders
import splitfolders
or import split_folders
Split with a ratio.
To only split into training and validation set, set a tuple to ratio, i.e, (.8, .2).
splitfolders.ratio("train", output="output", seed=68, ratio=(0.8, 0.2, 0.0), group_prefix=None) # default values
I tried splitfolders.ratio, but it just says that ratio is not an attribute.
import splitfolders
splitfolders.ratio("/Users/mavaylon/Research/Research_Gambier/Data_P/BP", output="/Users/mavaylon/Research/Research_Gambier/Data_P/output", seed=1337, ratio=(.7, .3), group_prefix=None) # default values
mavaylon1:
The library is correct. You can try the kaggle dataset. I give you the example as follows.
Download the furniture dataset:
https://www.kaggle.com/akkithetechie/furniture-detector
import splitfolders
# The path to the directory where the original dataset was uncompressed
input_folder = 'home/user/datasets/kaggle/furniture_pictures'
# The directory where we will store our smaller dataset
output_folder = 'home/user/Documents/YOLO/furniture_pictures'
splitfolders.ratio(input_folder, output_folder, seed=1337, ratio=(.8, .1, .1), group_prefix=None)
Cheers!
Doesn't work !! @mikechen66
Dear @mikechen66 I got this output! Any help?
saved a lot of time
thanks

helpful man