import argparse
import codecs
import csv
import fnmatch
import gzip
import inspect
import io
import os
import re
import shutil
import sys
import tarfile
import urllib
import zipfile

import numpy as np
import pandas as pd
import rarfile
import requests
import scipy
import torch
import torchvision
import wget
from PIL import Image
from scipy.io import loadmat as _loadmat
from sklearn.model_selection import train_test_split as _train_test_split
from tqdm import tqdm

from util.data.dataset_splitter import split_dataset, split_dataset_writerIdentification
from util.misc import get_all_files_in_folders_and_subfolders \
    as _get_all_files_in_folders_and_subfolders, pil_loader, make_folder_if_not_exists
def mnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the MNIST dataset to the
    location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.MNIST(root=args.output_folder, download=True)

    # Load the data into memory.
    # NOTE(review): this reads the 'processed/training.pt' layout of older torchvision
    # releases; newer versions no longer create these files -- confirm the installed
    # torchvision matches this layout.
    train_data, train_labels = torch.load(os.path.join(args.output_folder,
                                                       'MNIST',
                                                       'processed',
                                                       'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder,
                                                     'MNIST',
                                                     'processed',
                                                     'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # Save each image as <index>.png inside a sub-folder named after its label
        for i, (img, label) in enumerate(zip(arr, labels.detach().numpy())):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up torchvision's artifacts and carve a validation split out of train
    shutil.rmtree(os.path.join(args.output_folder, 'MNIST', 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'MNIST', 'processed'))
    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to the
    location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder, split='train', download=True)
    torchvision.datasets.SVHN(root=args.output_folder, split='test', download=True)

    # Load the data into memory
    train = _loadmat(os.path.join(args.output_folder,
                                  'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(np.int64).squeeze()
    # SVHN encodes digit 0 with label 10; remap it back to 0
    np.place(train_labels, train_labels == 10, 0)
    # (H, W, C, N) -> (N, H, W, C)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = _loadmat(os.path.join(args.output_folder,
                                 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # Save each image as <index>.png inside a sub-folder named after its label
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up the downloaded .mat files and carve a validation split out of train
    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))
    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def cifar10(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR dataset to the
    location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=args.output_folder, train=True, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=args.output_folder, train=False, download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets
    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # Save each image as <index>.png inside a sub-folder named after its label
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up torchvision's download artifacts and carve a validation split out of train
    os.remove(os.path.join(args.output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(args.output_folder, 'cifar-10-batches-py'))
    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def diva_hisdb(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the DIVA HisDB-all dataset for
    semantic segmentation to the location specified on the file system.

    See also: https://diuf.unifr.ch/main/hisdoc/diva-hisdb

    Output folder structure: ../HisDB/CB55/train
                             ../HisDB/CB55/val
                             ../HisDB/CB55/test
                             ../HisDB/CB55/test/data -> images
                             ../HisDB/CB55/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # make the root folder
    dataset_root = os.path.join(args.output_folder, 'HisDB')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_public = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/all.zip')
    link_test_private = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/private-test/all-privateTest.zip')
    download_path_public = os.path.join(dataset_root, link_public.geturl().rsplit('/', 1)[-1])
    download_path_private = os.path.join(dataset_root, link_test_private.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_public.geturl()))
    urllib.request.urlretrieve(link_public.geturl(), download_path_public)
    print('Downloading {}...'.format(link_test_private.geturl()))
    urllib.request.urlretrieve(link_test_private.geturl(), download_path_private)
    print('Download complete. Unpacking files...')

    # unpack relevant folders
    zip_file = zipfile.ZipFile(download_path_public)

    # unpack imgs and gt: pair every img-* archive with its pixel-level-gt-* counterpart
    data_gt_zip = {f: re.sub(r'img', 'pixel-level-gt', f) for f in zip_file.namelist() if 'img' in f}
    dataset_folders = [data_file.split('-')[-1][:-4] for data_file in data_gt_zip.keys()]
    for data_file, gt_file in data_gt_zip.items():
        dataset_name = data_file.split('-')[-1][:-4]
        dataset_folder = os.path.join(dataset_root, dataset_name)
        make_folder_if_not_exists(dataset_folder)
        for file in [data_file, gt_file]:
            # extract the inner zip, unpack it in place, then discard it
            zip_file.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(dataset_folder)
            # delete zips
            os.remove(os.path.join(dataset_folder, file))
        # create folder structure
        for partition in ['train', 'val', 'test', 'test-public']:
            for folder in ['data', 'gt']:
                make_folder_if_not_exists(os.path.join(dataset_folder, partition, folder))

    # move the files to the correct place
    for folder in dataset_folders:
        for k1, v1 in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            for k2, v2 in {'public-test': 'test-public', 'training': 'train', 'validation': 'val'}.items():
                current_path = os.path.join(dataset_root, folder, k1, k2)
                new_path = os.path.join(dataset_root, folder, v2, v1)
                for f in [f for f in os.listdir(current_path) if os.path.isfile(os.path.join(current_path, f))]:
                    shutil.move(os.path.join(current_path, f), os.path.join(new_path, f))
            # remove old folders
            shutil.rmtree(os.path.join(dataset_root, folder, k1))

    # fix naming issue
    for old, new in {'CS18': 'CSG18', 'CS863': 'CSG863'}.items():
        os.rename(os.path.join(dataset_root, old), os.path.join(dataset_root, new))

    # unpack private test folders
    zip_file_private = zipfile.ZipFile(download_path_private)
    data_gt_zip_private = {f: re.sub(r'img', 'pixel-level-gt', f) for f in zip_file_private.namelist() if 'img' in f}
    for data_file, gt_file in data_gt_zip_private.items():
        dataset_name = re.search('-(.*)-', data_file).group(1)
        dataset_folder = os.path.join(dataset_root, dataset_name)
        for file in [data_file, gt_file]:
            zip_file_private.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(os.path.join(dataset_folder, file[:-4]))
            # delete zip
            os.remove(os.path.join(dataset_folder, file))
        # create folder structure
        for folder in ['data', 'gt']:
            make_folder_if_not_exists(os.path.join(dataset_folder, 'test', folder))
        for old, new in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            current_path = os.path.join(dataset_folder, "{}-{}-privateTest".format(old, dataset_name), dataset_name)
            new_path = os.path.join(dataset_folder, "test", new)
            for f in [f for f in os.listdir(current_path) if os.path.isfile(os.path.join(current_path, f))]:
                # the ground truth files in the private test set have an additional
                # '_gt' suffix, which needs to be removed
                if new == "gt":
                    f_new = re.sub('_gt', r'', f)
                else:
                    f_new = f
                shutil.move(os.path.join(current_path, f), os.path.join(new_path, f_new))
            # remove old folders
            shutil.rmtree(os.path.dirname(current_path))
    print('Finished. Data set up at {}.'.format(dataset_root))
def icdar2017_clamm(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 CLaMM dataset
    for style classification and manuscript dating to the location specified on the
    file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_Training.zip"
    print("Downloading " + url)
    zip_name = "ICDAR2017_CLaMM_Training.zip"
    local_filename, headers = urllib.request.urlretrieve(url, zip_name)
    zfile = zipfile.ZipFile(local_filename)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'ICDAR2017-CLAMM')
    dataset_manuscriptDating = os.path.join(dataset_root, 'ManuscriptDating')
    dataset_md_train = os.path.join(dataset_manuscriptDating, 'train')
    dataset_styleClassification = os.path.join(dataset_root, 'StyleClassification')
    dataset_sc_train = os.path.join(dataset_styleClassification, 'train')
    test_sc_folder = os.path.join(dataset_styleClassification, 'test')
    test_md_folder = os.path.join(dataset_manuscriptDating, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(dataset_manuscriptDating)
    make_folder_if_not_exists(dataset_styleClassification)
    make_folder_if_not_exists(test_sc_folder)

    def _write_data_to_folder(archive, filenames, labels, folder, start_index, isTest):
        # Save every image of `archive` as <index>.png inside a sub-folder of `folder`
        # named after its label; `start_index` strips the archive's top-level directory
        # from member names when looking labels up in `filenames`.
        print("Writing data\n")
        sorted_labels = [None] * len(labels)
        if isTest == 1:
            # NOTE(review): the entry dropped here ('IRHT_P_009793.tif') differs from the
            # filename skipped in getLabels ('IRHT_P_009783.tif') -- confirm which is intended.
            for entry in archive.infolist():
                if "IRHT_P_009793.tif" in entry.filename:
                    archive.infolist().remove(entry)
                    break
        # align the labels with the order of the archive members (skip the dir entry)
        zip_infolist = archive.infolist()[1:]
        for i in range(len(zip_infolist)):
            entry = zip_infolist[i]
            entry_index_infilenames = filenames.index(entry.filename[start_index:])
            sorted_labels[i] = labels[entry_index_infilenames]
        for i, (entry, label) in enumerate(zip(archive.infolist()[1:], sorted_labels)):
            with archive.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                img.save(os.path.join(dest, str(i) + '.png'), "PNG", quality=100)

    def getLabels(zfile):
        # Parse the CSV shipped inside the archive; returns the image filenames, the
        # style-classification labels and the manuscript-dating labels, in parallel lists.
        print("Extracting labels\n")
        filenames, md_labels, sc_labels = [], [], []
        zip_infolist = zfile.infolist()[1:]
        for entry in zip_infolist:
            if '.csv' in entry.filename:
                with zfile.open(entry) as file:
                    cf = file.read()
                    # BUGFIX: `csv.StringIO` does not exist (AttributeError at runtime);
                    # wrap the decoded text in io.StringIO instead.
                    c = io.StringIO(cf.decode())
                    next(c)  # Skip the first line which is the header of csv file
                    for row in c:
                        # rows are ';'-separated and end with ';<md_label>\r\n'
                        md_label_strt_ind = row.rfind(';')
                        md_label_end_ind = row.rfind("\r")
                        md_labels.append(row[md_label_strt_ind + 1:md_label_end_ind])
                        sc_labels_strt_ind = row[:md_label_strt_ind].rfind(';')
                        sc_labels.append(row[sc_labels_strt_ind + 1:md_label_strt_ind])
                        filename_ind = row[:sc_labels_strt_ind].rfind(';')
                        if filename_ind > -1:
                            f_name = row[filename_ind + 1:sc_labels_strt_ind]
                        else:
                            f_name = row[:sc_labels_strt_ind]
                        # isTest is a free variable from the enclosing scope, set before each call
                        if isTest == 1 and f_name == 'IRHT_P_009783.tif':
                            print('No file named ' + f_name + ". This filename will not be added!")
                        else:
                            filenames.append(f_name)
                zfile.infolist().remove(entry)  # remove the csv file from infolist
            if '.db' in entry.filename:  # remove the db file from infolist
                zfile.infolist().remove(entry)
        return filenames, sc_labels, md_labels

    isTest = 0
    filenames, sc_labels, md_labels = getLabels(zfile)
    start_index_training = len("ICDAR2017_CLaMM_Training/")
    print("Training data is being prepared for style classification!\n")
    _write_data_to_folder(zfile, filenames, sc_labels, dataset_sc_train, start_index_training, isTest)
    print("Training data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile, filenames, md_labels, dataset_md_train, start_index_training, isTest)
    os.remove(os.path.join(zfile.filename))

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_task1_task3.zip"
    print("Downloading " + url)
    zip_name_test = "ICDAR2017_CLaMM_task1_task3.zip"
    local_filename_test, headers_test = urllib.request.urlretrieve(url, zip_name_test)
    zfile_test = zipfile.ZipFile(local_filename_test)
    isTest = 1
    filenames_test, sc_test_labels, md_test_labels = getLabels(zfile_test)
    start_index_test = len("ICDAR2017_CLaMM_task1_task3/")
    print("Test data is being prepared for style classification!\n")
    _write_data_to_folder(zfile_test, filenames_test, sc_test_labels, test_sc_folder, start_index_test, 1)
    print("Test data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile_test, filenames_test, md_test_labels, test_md_folder, start_index_test, 1)
    os.remove(os.path.join(zfile_test.filename))

    print("Training-Validation splitting\n")
    split_dataset(dataset_folder=dataset_manuscriptDating, split=0.2, symbolic=False)
    split_dataset(dataset_folder=dataset_styleClassification, split=0.2, symbolic=False)
    print("ICDAR2017 CLaMM data is ready!")
def historical_wi(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 Historical Writer
    Identification dataset (binarized and color variants, train and test) to the
    location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    train_binarized_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-binarized.zip"
    train_colored_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-color.zip"
    test_binarized_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-binarized.zip?download=1"
    test_colored_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-color.zip?download=1"
    urls = [train_binarized_url, train_colored_url, test_binarized_url, test_colored_url]

    zip_name_train_binarized = "icdar17-historicalwi-training-binarized.zip"
    zip_name_train_color = "icdar17-historicalwi-training-color.zip"
    zip_name_test_binarized = "ScriptNet-HistoricalWI-2017-binarized.zip"
    zip_name_test_color = "ScriptNet-HistoricalWI-2017-color.zip"
    zip_names = [zip_name_train_binarized, zip_name_train_color, zip_name_test_binarized, zip_name_test_color]

    # offset of the writer ID inside each archive's member names
    start_indices = [len("icdar2017-training-binary/"), len("icdar2017-training-color/"),
                     len("ScriptNet-HistoricalWI-2017-binarized/"), len("ScriptNet-HistoricalWI-2017-color/")]

    # Make output folders
    dataset_root = os.path.join(os.path.join(args.output_folder, 'historical_wi'))
    binarized_dataset = os.path.join(dataset_root, "BinarizedDataset")
    train_binarized_folder = os.path.join(binarized_dataset, 'train')
    test_binarized_folder = os.path.join(binarized_dataset, 'test')
    colored_dataset = os.path.join(dataset_root, "ColoredDataset")
    train_colored_folder = os.path.join(colored_dataset, 'train')
    test_colored_folder = os.path.join(colored_dataset, 'test')
    folders = [train_binarized_folder, train_colored_folder, test_binarized_folder, test_colored_folder]
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(binarized_dataset)
    make_folder_if_not_exists(colored_dataset)
    make_folder_if_not_exists(train_binarized_folder)
    make_folder_if_not_exists(train_colored_folder)
    make_folder_if_not_exists(test_binarized_folder)
    make_folder_if_not_exists(test_colored_folder)

    def _write_data_to_folder(archive, labels, folder, isTrainingset):
        # Save each image into a sub-folder of `folder` named after its writer label;
        # training images are written as .png, test images as .jpg.
        print("Writing data to folder\n")
        for i, (entry, label) in enumerate(zip(archive.infolist()[1:], labels)):
            with archive.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                if isTrainingset == 1:
                    img.save(os.path.join(dest, str(i) + '.png'))
                else:
                    img.save(os.path.join(dest, str(i) + '.jpg'))

    def _get_labels(archive, start_index):
        # The writer ID is the token between `start_index` and the following '-'
        print("Extracting labels\n")
        labels = []
        for zipinfo in archive.infolist()[1:]:
            file_name = zipinfo.filename
            ind = file_name.find("-", start_index)
            labels.append(file_name[start_index:ind])
        return labels

    # Prepare Datasets
    for i in range(len(urls)):
        # the first two archives are training sets, the last two are test sets
        if i < 2:
            isTrainingset = 1
        else:
            isTrainingset = 0
        print("Downloading " + urls[i])
        local_filename, headers = urllib.request.urlretrieve(urls[i], zip_names[i])
        zfile = zipfile.ZipFile(local_filename)
        labels = _get_labels(zfile, start_indices[i])
        _write_data_to_folder(zfile, labels, folders[i], isTrainingset)
        os.remove(os.path.join(zfile.filename))
        if i == 0:
            print("Binary training data is ready!")
        elif i == 1:
            print("Colored training data is ready!")
        elif i == 2:
            print("Binary test data is ready!")
        else:
            print("Colored test data is ready!")

    split_dataset_writerIdentification(dataset_folder=dataset_root, split=0.2)
    print("Historical WI dataset is ready!")
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to the
    location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    def get_int(b):
        # big-endian bytes -> int (IDX header fields)
        return int(codecs.encode(b, 'hex'), 16)

    def read_image_file(path):
        # Parse an IDX3 image file into a (N, rows, cols) uint8 tensor
        with open(path, 'rb') as f:
            data = f.read()
        assert get_int(data[:4]) == 2051  # IDX3 magic number
        length = get_int(data[4:8])
        num_rows = get_int(data[8:12])
        num_cols = get_int(data[12:16])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
        return torch.from_numpy(parsed).view(length, num_rows, num_cols)

    def read_label_file(path):
        # Parse an IDX1 label file into a (N,) int64 tensor
        with open(path, 'rb') as f:
            data = f.read()
        assert get_int(data[:4]) == 2049  # IDX1 magic number
        length = get_int(data[4:8])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        return torch.from_numpy(parsed).view(length).long()

    try:
        torchvision.datasets.KMNIST(root=args.output_folder, download=True)
    except AttributeError:
        # Older torchvision releases have no KMNIST dataset; download and process manually
        url_list = ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz']
        raw_folder = os.path.join(args.output_folder, 'raw')
        processed_folder = os.path.join(args.output_folder, 'processed')
        make_folder_if_not_exists(raw_folder)
        make_folder_if_not_exists(processed_folder)
        training_file = 'training.pt'
        test_file = 'test.pt'
        for url in url_list:
            print('Downloading ' + url)
            data = urllib.request.urlopen(url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_folder, filename)
            with open(file_path, 'wb') as f:
                f.write(data.read())
            # gunzip next to the archive, then drop the archive
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')
        training_set = (
            read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_folder, training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_folder, test_file), 'wb') as f:
            torch.save(test_set, f)
        print('Done!')

    # Load the data into memory.
    # NOTE(review): this reads <output_folder>/processed, the layout produced by the
    # manual fallback above; torchvision itself saves under <output_folder>/KMNIST --
    # confirm which path applies with the installed torchvision version.
    train_data, train_labels = torch.load(os.path.join(args.output_folder,
                                                       'processed',
                                                       'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder,
                                                     'processed',
                                                     'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # Save each image as <index>.png inside a sub-folder named after its label
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up intermediate files and carve a validation split out of train
    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))
    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
    print("The KMNIST dataset is ready for you at {}".format(dataset_root))
# def kuzushiji_kanji(args):
# """
# Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to the location specified
# on the file system
#
# Parameters
# ----------
# args : dict
# List of arguments necessary to run this routine. In particular its necessary to provide
# output_folder as String containing the path where the dataset will be downloaded
#
# Returns
# -------
# None
# """
# url = 'http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar'
# dataset_root = os.path.join(args.output_folder, 'kkanji')
#
# path = os.path.join(dataset_root, url.split('/')[-1])
# r = requests.get(url, stream=True)
# with open(path, 'wb') as f:
# total_length = int(r.headers.get('content-length'))
# print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))
#
# for chunk in tqdm(r.iter_content(chunk_size=1024), total=int(total_length / 1024) + 1, unit="KB"):
# if chunk:
# f.write(chunk)
#
# print('All dataset files downloaded!')
#
# with tarfile.open(os.path.join(dataset_root, "kkanji.tar")) as f:
# f.extractall()
# shutil.rmtree(os.path.join(dataset_root, "kkanji.tar"))
#
#
# # Make output folders
# train_folder = os.path.join(dataset_root, 'train')
# test_folder = os.path.join(dataset_root, 'test')
#
# make_folder_if_not_exists(dataset_root)
# make_folder_if_not_exists(train_folder)
# make_folder_if_not_exists(test_folder)
#
# split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
# print("The kkanji dataset is ready for you at {}".format(dataset_root))
def fashion_mnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Fashion-MNIST dataset to
    the location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.FashionMNIST(root=args.output_folder, download=True)

    # Load the data into memory.
    # NOTE(review): this reads <output_folder>/processed, the layout of older
    # torchvision releases; newer versions save under <output_folder>/FashionMNIST --
    # confirm against the installed torchvision.
    train_data, train_labels = torch.load(os.path.join(args.output_folder,
                                                       'processed',
                                                       'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder,
                                                     'processed',
                                                     'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'Fashion-MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # Save each image as <index>.png inside a sub-folder named after its label
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up intermediate files and carve a validation split out of train
    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))
    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance Multi-Label
    Image Dataset on the file system.
    Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files (the outer archive holds two nested archives)
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]
    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder, sorted by numeric basename
    images = [item for item in _get_all_files_in_folders_and_subfolders(path_to_output)
              if item.endswith('jpg')]
    images = sorted(images, key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits (fixed seed keeps them reproducible across runs)
    train_data, test_data, train_labels, test_labels = _train_test_split(images, targets, test_size=0.2,
                                                                         random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(train_data, train_labels, test_size=0.2,
                                                                      random_state=42)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')
    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        # Copy images into <folder>/images and write labels.csv with one row per image
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)
        rows = np.column_stack(([os.path.join('images', os.path.basename(item)) for item in data], labels))
        rows = sorted(rows, key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'), header=classes, index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    # Clean up the downloaded archive and the temporary extraction folder
    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the tubule dataset (from the
    GlaS challenge) for semantic segmentation to the location specified on the file system.

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../HisDB/GlaS/train
                             ../HisDB/GlaS/val
                             ../HisDB/GlaS/test
                             ../HisDB/GlaS/test/data -> images
                             ../HisDB/GlaS/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_folder`` as a string containing the path where the
        dataset will be downloaded.

    Returns
    -------
    None
    """
    def groupby_patient(list_to_group, index=3):
        """
        split images by patient
        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain patient ID
        :return: dictionary where keys are patient IDs and values are lists of images that are from that patient
        """
        return {
            '-'.join(filename.split('-')[:index]): [file for file in list_to_group if '-'.join(file.split('-')[:index])
                                                    == '-'.join(filename.split('-')[:index])] for filename in
            list_to_group}

    def convert_gt(img_path):
        # Re-encode a ground-truth image: background -> 1, glands -> 2 (blue channel).
        # NOTE(review): `img.shape` and `img != 0` assume pil_loader returns an
        # array-like (a plain PIL.Image has `.size`, not `.shape`) -- confirm the
        # return type of util.misc.pil_loader.
        img = pil_loader(img_path)
        out_img = np.zeros((*img.shape, 3), dtype=np.uint8)
        out_img[:, :, 2] = 1  # set everything to background in blue channel
        out_img[:, :, 2][img != 0] = 2  # set glands to 2 in blue channel
        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_tubules = urllib.parse.urlparse(
        'http://andrewjanowczyk.com/wp-static/tubule.tgz')
    download_path_tubules = os.path.join(dataset_root, link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)
    print('Download complete. Unpacking files...')

    # unpack tubule folder that contains images, annotations and text files with lists
    # of benign and malignant samples
    tar_file = tarfile.open(download_path_tubules)
    tar_file.extractall(path=dataset_root)

    sets_dict = {}
    # 20 benign + 20 malignant images
    train_ids_b = ['09-1339-01',
                   '09-16566-03',
                   '09-21631-03',
                   '09-23232-02',
                   'm9_10741F-12T2N0', '10-13799-05']  # 4*5
    train_ids_m = ['09-322-02',
                   '09-16566-02',
                   '10-13799-06',
                   '10-15247-02',
                   'm6_10719 T3N2a', 'm17_1421 IE-11 T3N2a', 'm18_1421 IE-11 1-86', 'm39_10-1273']  # 5*4
    sets_dict['train'] = train_ids_b + train_ids_m
    # validation has 29 images
    val_ids_b = ['10-12813-05',
                 '10-13799-02',
                 'm2_10449-11E-T3N1b']  # 2*4 + 1 = 9
    val_ids_m = ['09-1339-02',
                 '09-1339-05',
                 '09-1646-01',
                 '09-1646-02',
                 '09-23757-01']  # 5*4 = 20
    sets_dict['val'] = val_ids_b + val_ids_m
    # test has equal mal and ben and 16 img
    test_ids_m = ['09-1646-03', '09-1646-05']  # 2*4 = 8
    test_ids_b = ['10-12813-01', '10-13799-01']  # 2*4 = 8
    sets_dict['test'] = test_ids_b + test_ids_m

    print('Splitting the dataset into train, val and test')
    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))
        print('CREATING {} SET'.format(s))
        for patient in sets_dict[s]:
            for img_file in os.listdir(dataset_root):
                if patient in img_file:
                    if 'anno' in img_file:
                        # convert gt into correct data format
                        convert_gt(os.path.join(dataset_root, img_file))
                        out_file = os.path.join('gt', img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)
                    shutil.move(os.path.join(dataset_root, img_file), os.path.join(dataset_root, s, out_file))
if __name__ == "__main__":
    # Every public function in this module is a downloadable dataset routine
    downloadable_datasets = [name[0] for name in inspect.getmembers(sys.modules[__name__],
                                                                    inspect.isfunction) if not name[0].startswith('_')]
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='This script can be used to download some '
                                                 'datasets and prepare them in a standard format')
    parser.add_argument('--dataset',
                        help='name of the dataset',
                        type=str,
                        # BUGFIX: without required=True an omitted --dataset reached
                        # getattr(module, None) and crashed with a TypeError
                        required=True,
                        choices=downloadable_datasets)
    parser.add_argument('--output-folder',
                        help='path to where the dataset should be generated.',
                        required=False,
                        type=str,
                        default='./data/')
    args = parser.parse_args()
    # Dispatch to the routine named by --dataset
    getattr(sys.modules[__name__], args.dataset)(args)