"""
This script allows for creation of a validation set from the training set.
"""
# Utils
import argparse
import os
import shutil
import sys
import re
import numpy as np
import random
# Torch related stuff
import torchvision.datasets as datasets
from sklearn.model_selection import train_test_split
from util.misc import make_folder_if_not_exists
def split_dataset(dataset_folder, split, symbolic, debug=False):
    """
    Partition a dataset into train/val splits on the filesystem.

    Parameters
    ----------
    dataset_folder : str
        Path to the dataset folder. Must contain a 'train' sub-folder in
        torchvision ImageFolder layout (train/<class>/<image>).
    split : float
        Specifies how much of the training set should be converted into the
        validation set.
    symbolic : bool
        Does not make a copy of the data, but only symbolic links to the
        original data.
    debug : bool
        Prints additional debug statements (per-class element counts).

    Returns
    -------
    None
    """
    # Getting the train dir
    traindir = os.path.join(dataset_folder, 'train')
    # Sanity check on the training folder BEFORE renaming it, so a missing
    # folder fails cleanly instead of crashing inside shutil.move
    if not os.path.isdir(traindir):
        print("Train folder not found in the args.dataset_folder={}".format(dataset_folder))
        sys.exit(-1)
    # Rename the original train dir
    shutil.move(traindir, os.path.join(dataset_folder, 'original_train'))
    traindir = os.path.join(dataset_folder, 'original_train')

    # Load the dataset file names
    train_ds = datasets.ImageFolder(traindir)

    # Extract the actual file names and labels as entries
    fileNames = np.asarray([item[0] for item in train_ds.imgs])
    labels = np.asarray([item[1] for item in train_ds.imgs])

    # Split the data into two sets, stratified so each class keeps its ratio
    X_train, X_val, y_train, y_val = train_test_split(fileNames, labels,
                                                      test_size=split,
                                                      random_state=42,
                                                      stratify=labels)

    if debug:
        # Print number of elements for each class
        # BUGFIX: the "labels" loop previously counted y_train instead of the
        # full label array, duplicating the "split_train" output
        for c in train_ds.classes:
            print("labels ({}) {}".format(c, np.size(np.where(labels == train_ds.class_to_idx[c]))))
        for c in train_ds.classes:
            print("split_train ({}) {}".format(c, np.size(np.where(y_train == train_ds.class_to_idx[c]))))
        for c in train_ds.classes:
            print("split_val ({}) {}".format(c, np.size(np.where(y_val == train_ds.class_to_idx[c]))))

    # Create the folder structure to accommodate the two new splits.
    # After rmtree of the split root, the class folders cannot pre-exist,
    # so a single makedirs per class is sufficient.
    split_train_dir = os.path.join(dataset_folder, "train")
    split_val_dir = os.path.join(dataset_folder, "val")
    for split_dir in (split_train_dir, split_val_dir):
        if os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        for class_label in train_ds.classes:
            os.makedirs(os.path.join(split_dir, class_label))

    # Copying (or linking) the splits into their folders
    for dest_dir, X_split, y_split in ((split_train_dir, X_train, y_train),
                                       (split_val_dir, X_val, y_val)):
        for src, y in zip(X_split, y_split):
            dest = os.path.join(dest_dir, train_ds.classes[y], os.path.basename(src))
            if symbolic:
                # Use an absolute target so the link stays valid no matter
                # from which working directory it is later resolved
                os.symlink(os.path.abspath(src), dest)
            else:
                shutil.copy(src, dest)
    return
def _get_file_with_parents(filepath, levels=1):
common = filepath
for i in range(levels + 1):
common = os.path.dirname(common)
return os.path.relpath(filepath, common)
def _copy_writer_folders(writer_ids, src_root, dst_root):
    """Copy every page of each writer in *writer_ids* from ``src_root/<writer>``
    into a freshly created ``dst_root/<writer>`` folder."""
    for writer in writer_ids:
        dst = os.path.join(dst_root, writer)
        if os.path.exists(dst):
            shutil.rmtree(dst)
        os.makedirs(dst)
        src = os.path.join(src_root, writer)
        for file_name in os.listdir(src):
            shutil.copy(os.path.join(src, file_name), dst)


def split_dataset_writerIdentification(dataset_folder, split):
    """
    Partition a writer-identification dataset into train/val splits on the filesystem.

    The dataset folder must contain a 'BinarizedDataset' and a 'ColoredDataset'
    sub-folder, each with a 'train' folder holding one sub-folder per writer.
    The split is made per writer, and the same writers end up in the same split
    for both the binarized and the colored dataset.

    Parameters
    ----------
    dataset_folder : str
        Path to the dataset folder.
    split : float
        Specifies how much of the training set should be converted into the
        validation set.

    Returns
    -------
    None
    """
    print("Data Splitting for Writer Identification\n")
    # Getting the train dirs
    binarized_dataset = os.path.join(dataset_folder, "BinarizedDataset")
    colored_dataset = os.path.join(dataset_folder, "ColoredDataset")
    binarized_traindir = os.path.join(binarized_dataset, 'train')
    colored_traindir = os.path.join(colored_dataset, 'train')
    # Sanity check on the training folders BEFORE renaming them, so a missing
    # folder fails cleanly instead of crashing inside shutil.move
    for traindir, dataset in ((binarized_traindir, binarized_dataset),
                              (colored_traindir, colored_dataset)):
        if not os.path.isdir(traindir):
            print("Train folder not found in the args.dataset_folder={}".format(dataset))
            sys.exit(-1)
    # Rename the original train dirs
    shutil.move(binarized_traindir, os.path.join(binarized_dataset, 'original_train'))
    shutil.move(colored_traindir, os.path.join(colored_dataset, 'original_train'))
    binarized_traindir = os.path.join(binarized_dataset, 'original_train')
    colored_traindir = os.path.join(colored_dataset, 'original_train')
    # Load the writer folder names (one sub-folder per writer)
    print("Loading dataset filenames\n")
    fileNames = os.listdir(binarized_traindir)
    print("Training set size: " + str(len(fileNames)))
    validation_size = int(len(fileNames) * split)
    print("Validation set size: " + str(validation_size))
    # Deterministic random selection of the validation writers
    random.seed(42)
    random.shuffle(fileNames)
    validation_files = random.sample(fileNames, validation_size)
    # BUGFIX: membership test against a set instead of a list (was O(n^2))
    validation_set = set(validation_files)
    training_files = [f for f in fileNames if f not in validation_set]
    # Create fresh train/val folders for both datasets
    for dataset in (binarized_dataset, colored_dataset):
        for split_name in ("train", "val"):
            split_dir = os.path.join(dataset, split_name)
            if os.path.exists(split_dir):
                shutil.rmtree(split_dir)
            os.makedirs(split_dir)
    # Copy the selected writers; binarized and colored share the same partition
    print("Copying files to train folder\n")
    _copy_writer_folders(training_files, binarized_traindir,
                         os.path.join(binarized_dataset, "train"))
    _copy_writer_folders(training_files, colored_traindir,
                         os.path.join(colored_dataset, "train"))
    print("Copying files to val folder\n")
    _copy_writer_folders(validation_files, binarized_traindir,
                         os.path.join(binarized_dataset, "val"))
    _copy_writer_folders(validation_files, colored_traindir,
                         os.path.join(colored_dataset, "val"))
    print("Splitting is done!")
    return
def split_dataset_segmentation(dataset_folder, split, symbolic, test=False):
    """
    Partition a dataset into train/val(/test) splits on the filesystem for
    segmentation datasets organized as dataset/data with the images and
    dataset/gt for the ground truth. The corresponding images need to have the
    same name (images .jpg, ground truth .png).

    Parameters
    ----------
    dataset_folder : str
        Path to the dataset folder (see datasets.image_folder_dataset.load_dataset for details).
    split : float
        Specifies how much of the training set should be converted into the
        validation set.
    symbolic : bool
        Does not make a copy of the data, but only symbolic links to the
        original data.
    test : bool
        If true, the validation set is split again (1:1) into a val and test
        set. Default false.

    Returns
    -------
    None
    """
    # Getting the train dir; sanity check BEFORE renaming it, so a missing
    # folder fails cleanly instead of crashing inside shutil.move
    orig_dir = os.path.join(dataset_folder, 'train')
    if not os.path.isdir(orig_dir):
        print("Train folder not found in the args.dataset_folder={}".format(dataset_folder))
        sys.exit(-1)
    # Rename the original train dir
    shutil.move(orig_dir, os.path.join(dataset_folder, 'original_train'))
    orig_dir = os.path.join(dataset_folder, 'original_train')

    # Collect the image and ground-truth file names (sorted so they pair up)
    path_data = os.path.join(orig_dir, "data")
    path_gt = os.path.join(orig_dir, "gt")
    file_names_data = sorted(f for f in os.listdir(path_data)
                             if os.path.isfile(os.path.join(path_data, f)))
    file_names_gt = sorted(f for f in os.listdir(path_gt)
                           if os.path.isfile(os.path.join(path_gt, f)))

    # Check data and ensure every image has a matching ground-truth file
    assert len(file_names_data) == len(file_names_gt)
    for data, gt in zip(file_names_data, file_names_gt):
        assert data[:-3] == gt[:-3]  # exclude the extension which should be jpg and png
        assert gt[-3:] == "png"

    # Split the (image, gt) pairs into train and val sets; a single input
    # array is enough, train_test_split returns the two partitions directly
    file_names = list(zip(file_names_data, file_names_gt))
    filenames_train, filenames_val = train_test_split(file_names,
                                                      test_size=split,
                                                      random_state=42)
    if test:
        # Split the validation set again, 1:1, into val and test sets
        filenames_val, filenames_test = train_test_split(filenames_val,
                                                         test_size=0.5,
                                                         random_state=42)

    # Make output folders
    train_folder = os.path.join(dataset_folder, 'train')
    val_folder = os.path.join(dataset_folder, 'val')
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    folders = [train_folder, val_folder]
    file_splits = [filenames_train, filenames_val]
    if test:
        test_folder = os.path.join(dataset_folder, 'test')
        make_folder_if_not_exists(test_folder)
        folders.append(test_folder)
        file_splits.append(filenames_test)

    # Copying (or linking) the splits into their folders
    for folder, split_files in zip(folders, file_splits):
        make_folder_if_not_exists(os.path.join(folder, 'data'))
        make_folder_if_not_exists(os.path.join(folder, 'gt'))
        transfer = os.symlink if symbolic else shutil.copy
        for fdata, fgt in split_files:
            transfer(os.path.join(path_data, fdata), os.path.join(folder, 'data', fdata))
            transfer(os.path.join(path_gt, fgt), os.path.join(folder, 'gt', fgt))
    return
if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='This script creates train/val splits '
                                                 'from a specified dataset folder.')
    parser.add_argument('--dataset-folder',
                        help='path to root of the dataset.',
                        required=True,
                        type=str,
                        default=None)
    # NOTE: argparse %-formats help strings (to substitute e.g. %(default)s),
    # so literal percent signs must be escaped as '%%' or --help crashes
    # with a ValueError.
    parser.add_argument('--split',
                        help='Ratio of the split for validation set. '
                             'Example: if 0.2 the training set will be 80%% and val 20%%.',
                        type=float,
                        default=0.2)
    parser.add_argument('--symbolic',
                        help='Make symbolic links instead of copies.',
                        action='store_true',
                        default=False)
    parser.add_argument('--test',
                        help='Split val set into half to make a test set.',
                        action='store_true',
                        default=False)
    parser.add_argument('--debug',
                        help='Print additional debug statements.',
                        action='store_true',
                        default=False)
    args = parser.parse_args()
    # BUGFIX: forward the parsed --debug flag (it was parsed but never used)
    split_dataset(dataset_folder=args.dataset_folder, split=args.split,
                  symbolic=args.symbolic, debug=args.debug)