Source code for util.data.get_a_dataset

import argparse
import codecs
import csv
import gzip
import inspect
import io
import os
import re
import shutil
import sys
import tarfile
import urllib.parse
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import rarfile
import requests
import torch
import torchvision
import wget
from PIL import Image
from scipy.io import loadmat as _loadmat
from sklearn.model_selection import train_test_split as _train_test_split
from tqdm import tqdm

from util.data.dataset_splitter import split_dataset, split_dataset_writerIdentification
from util.misc import get_all_files_in_folders_and_subfolders \
    as _get_all_files_in_folders_and_subfolders, pil_loader, make_folder_if_not_exists


def mnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the MNIST dataset to the location specified
    on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.MNIST(root=args.output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(os.path.join(args.output_folder, 'MNIST', 'processed', 'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder, 'MNIST', 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels.detach().numpy())):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(args.output_folder, 'MNIST', 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'MNIST', 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)

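# Added usage sketch (not part of the original module): every routine in this file
# takes an argparse-style object exposing an `output_folder` attribute, so it can
# also be invoked programmatically, e.g.:
#
#     from types import SimpleNamespace
#     mnist(SimpleNamespace(output_folder='./data/'))  # path is illustrative
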
def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to the location specified
    on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder, split='train', download=True)
    torchvision.datasets.SVHN(root=args.output_folder, split='test', download=True)

    # Load the data into memory
    train = _loadmat(os.path.join(args.output_folder, 'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(np.int64).squeeze()
    # In the .mat files the digit '0' is stored with label 10; remap it to 0
    np.place(train_labels, train_labels == 10, 0)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = _loadmat(os.path.join(args.output_folder, 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)

def cifar10(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to the location specified
    on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=args.output_folder, train=True, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=args.output_folder, train=False, download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets
    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(args.output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(args.output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)

def diva_hisdb(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the DIVA HisDB-all dataset for semantic
    segmentation to the location specified on the file system

    See also: https://diuf.unifr.ch/main/hisdoc/diva-hisdb

    Output folder structure: ../HisDB/CB55/train
                             ../HisDB/CB55/val
                             ../HisDB/CB55/test

                             ../HisDB/CB55/test/data -> images
                             ../HisDB/CB55/test/gt -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # make the root folder
    dataset_root = os.path.join(args.output_folder, 'HisDB')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_public = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/all.zip')
    link_test_private = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/private-test/all-privateTest.zip')
    download_path_public = os.path.join(dataset_root, link_public.geturl().rsplit('/', 1)[-1])
    download_path_private = os.path.join(dataset_root, link_test_private.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_public.geturl()))
    urllib.request.urlretrieve(link_public.geturl(), download_path_public)
    print('Downloading {}...'.format(link_test_private.geturl()))
    urllib.request.urlretrieve(link_test_private.geturl(), download_path_private)
    print('Download complete. Unpacking files...')

    # unpack relevant folders
    zip_file = zipfile.ZipFile(download_path_public)

    # unpack imgs and gt
    data_gt_zip = {f: re.sub(r'img', 'pixel-level-gt', f) for f in zip_file.namelist() if 'img' in f}
    dataset_folders = [data_file.split('-')[-1][:-4] for data_file in data_gt_zip.keys()]
    for data_file, gt_file in data_gt_zip.items():
        dataset_name = data_file.split('-')[-1][:-4]
        dataset_folder = os.path.join(dataset_root, dataset_name)
        make_folder_if_not_exists(dataset_folder)

        for file in [data_file, gt_file]:
            zip_file.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(dataset_folder)
            # delete zips
            os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for partition in ['train', 'val', 'test', 'test-public']:
            for folder in ['data', 'gt']:
                make_folder_if_not_exists(os.path.join(dataset_folder, partition, folder))

    # move the files to the correct place
    for folder in dataset_folders:
        for k1, v1 in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            for k2, v2 in {'public-test': 'test-public', 'training': 'train', 'validation': 'val'}.items():
                current_path = os.path.join(dataset_root, folder, k1, k2)
                new_path = os.path.join(dataset_root, folder, v2, v1)
                for f in [f for f in os.listdir(current_path) if os.path.isfile(os.path.join(current_path, f))]:
                    shutil.move(os.path.join(current_path, f), os.path.join(new_path, f))
            # remove old folders
            shutil.rmtree(os.path.join(dataset_root, folder, k1))

    # fix naming issue
    for old, new in {'CS18': 'CSG18', 'CS863': 'CSG863'}.items():
        os.rename(os.path.join(dataset_root, old), os.path.join(dataset_root, new))

    # unpack private test folders
    zip_file_private = zipfile.ZipFile(download_path_private)
    data_gt_zip_private = {f: re.sub(r'img', 'pixel-level-gt', f) for f in zip_file_private.namelist() if 'img' in f}

    for data_file, gt_file in data_gt_zip_private.items():
        dataset_name = re.search('-(.*)-', data_file).group(1)
        dataset_folder = os.path.join(dataset_root, dataset_name)

        for file in [data_file, gt_file]:
            zip_file_private.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(os.path.join(dataset_folder, file[:-4]))
            # delete zip
            os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for folder in ['data', 'gt']:
            make_folder_if_not_exists(os.path.join(dataset_folder, 'test', folder))

        for old, new in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            current_path = os.path.join(dataset_folder, "{}-{}-privateTest".format(old, dataset_name), dataset_name)
            new_path = os.path.join(dataset_folder, "test", new)
            for f in [f for f in os.listdir(current_path) if os.path.isfile(os.path.join(current_path, f))]:
                # the ground truth files in the private test set have an additional ending, which needs to be removed
                if new == "gt":
                    f_new = re.sub('_gt', r'', f)
                else:
                    f_new = f
                shutil.move(os.path.join(current_path, f), os.path.join(new_path, f_new))
            # remove old folders
            shutil.rmtree(os.path.dirname(current_path))

    print('Finished. Data set up at {}.'.format(dataset_root))

def icdar2017_clamm(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 CLaMM dataset
    (manuscript dating and style classification tasks) to the location specified on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_Training.zip"
    print("Downloading " + url)
    zip_name = "ICDAR2017_CLaMM_Training.zip"
    local_filename, headers = urllib.request.urlretrieve(url, zip_name)
    zfile = zipfile.ZipFile(local_filename)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'ICDAR2017-CLAMM')
    dataset_manuscriptDating = os.path.join(dataset_root, 'ManuscriptDating')
    dataset_md_train = os.path.join(dataset_manuscriptDating, 'train')
    dataset_styleClassification = os.path.join(dataset_root, 'StyleClassification')
    dataset_sc_train = os.path.join(dataset_styleClassification, 'train')
    test_sc_folder = os.path.join(dataset_styleClassification, 'test')
    test_md_folder = os.path.join(dataset_manuscriptDating, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(dataset_manuscriptDating)
    make_folder_if_not_exists(dataset_styleClassification)
    make_folder_if_not_exists(test_sc_folder)

    def _write_data_to_folder(zip_file, filenames, labels, folder, start_index, isTest):
        print("Writing data\n")
        sorted_labels = [None] * len(labels)
        if isTest == 1:
            for i in range(len(zip_file.infolist())):
                entry = zip_file.infolist()[i]
                if "IRHT_P_009793.tif" in entry.filename:
                    zip_file.infolist().remove(entry)
                    break
        zip_infolist = zip_file.infolist()[1:]
        for i in range(len(zip_infolist)):
            entry = zip_infolist[i]
            entry_index_infilenames = filenames.index(entry.filename[start_index:])
            sorted_labels[i] = labels[entry_index_infilenames]
        for i, (entry, label) in enumerate(zip(zip_file.infolist()[1:], sorted_labels)):
            with zip_file.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                img.save(os.path.join(dest, str(i) + '.png'), "PNG", quality=100)

    def getLabels(zfile):
        print("Extracting labels\n")
        filenames, md_labels, sc_labels = [], [], []
        zip_infolist = zfile.infolist()[1:]
        for entry in zip_infolist:
            if '.csv' in entry.filename:
                with zfile.open(entry) as file:
                    cf = file.read()
                    c = io.StringIO(cf.decode())
                    next(c)  # Skip the first line which is the header of csv file
                    for row in c:
                        md_label_strt_ind = row.rfind(';')
                        md_label_end_ind = row.rfind("\r")
                        md_labels.append(row[md_label_strt_ind + 1:md_label_end_ind])
                        sc_labels_strt_ind = row[:md_label_strt_ind].rfind(';')
                        sc_labels.append(row[sc_labels_strt_ind + 1:md_label_strt_ind])
                        filename_ind = row[:sc_labels_strt_ind].rfind(';')
                        if filename_ind > -1:
                            f_name = row[filename_ind + 1:sc_labels_strt_ind]
                        else:
                            f_name = row[:sc_labels_strt_ind]
                        if isTest == 1 and f_name == 'IRHT_P_009783.tif':
                            print('No file named ' + f_name + ". This filename will not be added!")
                        else:
                            filenames.append(f_name)
                zfile.infolist().remove(entry)  # remove the csv file from infolist
            if '.db' in entry.filename:
                # remove the db file from infolist
                zfile.infolist().remove(entry)
        return filenames, sc_labels, md_labels

    isTest = 0
    filenames, sc_labels, md_labels = getLabels(zfile)
    start_index_training = len("ICDAR2017_CLaMM_Training/")
    print("Training data is being prepared for style classification!\n")
    _write_data_to_folder(zfile, filenames, sc_labels, dataset_sc_train, start_index_training, isTest)
    print("Training data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile, filenames, md_labels, dataset_md_train, start_index_training, isTest)
    os.remove(zfile.filename)

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_task1_task3.zip"
    print("Downloading " + url)
    zip_name_test = "ICDAR2017_CLaMM_task1_task3.zip"
    local_filename_test, headers_test = urllib.request.urlretrieve(url, zip_name_test)
    zfile_test = zipfile.ZipFile(local_filename_test)

    isTest = 1
    filenames_test, sc_test_labels, md_test_labels = getLabels(zfile_test)
    start_index_test = len("ICDAR2017_CLaMM_task1_task3/")
    print("Test data is being prepared for style classification!\n")
    _write_data_to_folder(zfile_test, filenames_test, sc_test_labels, test_sc_folder, start_index_test, 1)
    print("Test data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile_test, filenames_test, md_test_labels, test_md_folder, start_index_test, 1)
    os.remove(zfile_test.filename)

    print("Training-Validation splitting\n")
    split_dataset(dataset_folder=dataset_manuscriptDating, split=0.2, symbolic=False)
    split_dataset(dataset_folder=dataset_styleClassification, split=0.2, symbolic=False)
    print("ICDAR2017 CLaMM data is ready!")

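# Added note: the routine above lays the data out as
#     ICDAR2017-CLAMM/ManuscriptDating/{train,test}
#     ICDAR2017-CLAMM/StyleClassification/{train,test}
# and split_dataset() then derives a validation split from each train folder
# (split=0.2), as in the other routines of this module.
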
def historical_wi(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 Historical-WI writer
    identification dataset (binarized and color variants) to the location specified on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    train_binarized_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-binarized.zip"
    train_colored_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-color.zip"
    test_binarized_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-binarized.zip?download=1"
    test_colored_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-color.zip?download=1"
    urls = [train_binarized_url, train_colored_url, test_binarized_url, test_colored_url]

    zip_name_train_binarized = "icdar17-historicalwi-training-binarized.zip"
    zip_name_train_color = "icdar17-historicalwi-training-color.zip"
    zip_name_test_binarized = "ScriptNet-HistoricalWI-2017-binarized.zip"
    zip_name_test_color = "ScriptNet-HistoricalWI-2017-color.zip"
    zip_names = [zip_name_train_binarized, zip_name_train_color, zip_name_test_binarized, zip_name_test_color]

    start_indices = [len("icdar2017-training-binary/"), len("icdar2017-training-color/"),
                     len("ScriptNet-HistoricalWI-2017-binarized/"), len("ScriptNet-HistoricalWI-2017-color/")]

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'historical_wi')
    binarized_dataset = os.path.join(dataset_root, "BinarizedDataset")
    train_binarized_folder = os.path.join(binarized_dataset, 'train')
    test_binarized_folder = os.path.join(binarized_dataset, 'test')
    colored_dataset = os.path.join(dataset_root, "ColoredDataset")
    train_colored_folder = os.path.join(colored_dataset, 'train')
    test_colored_folder = os.path.join(colored_dataset, 'test')
    folders = [train_binarized_folder, train_colored_folder, test_binarized_folder, test_colored_folder]

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(binarized_dataset)
    make_folder_if_not_exists(colored_dataset)
    make_folder_if_not_exists(train_binarized_folder)
    make_folder_if_not_exists(train_colored_folder)
    make_folder_if_not_exists(test_binarized_folder)
    make_folder_if_not_exists(test_colored_folder)

    def _write_data_to_folder(zip_file, labels, folder, isTrainingset):
        print("Writing data to folder\n")
        for i, (entry, label) in enumerate(zip(zip_file.infolist()[1:], labels)):
            with zip_file.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                if isTrainingset == 1:
                    img.save(os.path.join(dest, str(i) + '.png'))
                else:
                    img.save(os.path.join(dest, str(i) + '.jpg'))

    def _get_labels(zip_file, start_index):
        print("Extracting labels\n")
        labels = []
        for zipinfo in zip_file.infolist()[1:]:
            file_name = zipinfo.filename
            ind = file_name.find("-", start_index)
            labels.append(file_name[start_index:ind])
        return labels

    # Prepare Datasets
    for i in range(len(urls)):
        if i < 2:
            isTrainingset = 1
        else:
            isTrainingset = 0
        print("Downloading " + urls[i])
        local_filename, headers = urllib.request.urlretrieve(urls[i], zip_names[i])
        zfile = zipfile.ZipFile(local_filename)
        labels = _get_labels(zfile, start_indices[i])
        _write_data_to_folder(zfile, labels, folders[i], isTrainingset)
        os.remove(zfile.filename)
        if i == 0:
            print("Binary training data is ready!")
        elif i == 1:
            print("Colored training data is ready!")
        elif i == 2:
            print("Binary test data is ready!")
        else:
            print("Colored test data is ready!")

    split_dataset_writerIdentification(dataset_folder=dataset_root, split=0.2)
    print("Historical WI dataset is ready!")

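# Added note (assumption about the archive layout, not from the original source):
# _get_labels() above derives the writer label from each entry's filename by taking
# the characters between the archive's top-level folder prefix and the first '-',
# so an entry like 'icdar2017-training-color/123-IMG_MAX_45678.jpg' would yield
# the label '123'. The concrete filename shown here is illustrative only.
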
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to the location specified
    on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    def get_int(b):
        return int(codecs.encode(b, 'hex'), 16)

    def read_image_file(path):
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2051  # IDX magic number for image files
            length = get_int(data[4:8])
            num_rows = get_int(data[8:12])
            num_cols = get_int(data[12:16])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
            return torch.from_numpy(parsed).view(length, num_rows, num_cols)

    def read_label_file(path):
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2049  # IDX magic number for label files
            length = get_int(data[4:8])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
            return torch.from_numpy(parsed).view(length).long()

    try:
        torchvision.datasets.KMNIST(root=args.output_folder, download=True)
    except AttributeError:
        # Older torchvision versions do not ship KMNIST; download and process it manually
        url_list = ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
                    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz']

        raw_folder = os.path.join(args.output_folder, 'raw')
        processed_folder = os.path.join(args.output_folder, 'processed')
        make_folder_if_not_exists(raw_folder)
        make_folder_if_not_exists(processed_folder)
        training_file = 'training.pt'
        test_file = 'test.pt'

        for url in url_list:
            print('Downloading ' + url)
            data = urllib.request.urlopen(url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_folder, filename)
            with open(file_path, 'wb') as f:
                f.write(data.read())
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')
        training_set = (
            read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_folder, training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_folder, test_file), 'wb') as f:
            torch.save(test_set, f)
        print('Done!')

    # Load the data into memory
    train_data, train_labels = torch.load(os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
    print("The KMNIST dataset is ready for you at {}".format(dataset_root))

# def kuzushiji_kanji(args):
#     """
#     Fetches and prepares (in a DeepDIVA friendly format) the Kuzushiji-Kanji (kkanji) dataset to the
#     location specified on the file system
#
#     Parameters
#     ----------
#     args : argparse.Namespace
#         List of arguments necessary to run this routine. In particular it is necessary to provide
#         output_folder as a String containing the path where the dataset will be downloaded
#
#     Returns
#     -------
#     None
#     """
#     url = 'http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar'
#     dataset_root = os.path.join(args.output_folder, 'kkanji')
#
#     path = os.path.join(dataset_root, url.split('/')[-1])
#     r = requests.get(url, stream=True)
#     with open(path, 'wb') as f:
#         total_length = int(r.headers.get('content-length'))
#         print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))
#
#         for chunk in tqdm(r.iter_content(chunk_size=1024), total=int(total_length / 1024) + 1, unit="KB"):
#             if chunk:
#                 f.write(chunk)
#
#     print('All dataset files downloaded!')
#
#     with tarfile.open(os.path.join(dataset_root, "kkanji.tar")) as f:
#         f.extractall()
#     shutil.rmtree(os.path.join(dataset_root, "kkanji.tar"))
#
#     # Make output folders
#     train_folder = os.path.join(dataset_root, 'train')
#     test_folder = os.path.join(dataset_root, 'test')
#
#     make_folder_if_not_exists(dataset_root)
#     make_folder_if_not_exists(train_folder)
#     make_folder_if_not_exists(test_folder)
#
#     split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
#     print("The kkanji dataset is ready for you at {}".format(dataset_root))

def fashion_mnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Fashion-MNIST dataset to the location
    specified on the file system

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.FashionMNIST(root=args.output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'Fashion-MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)

def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance Multi-Label Image Dataset
    on the file system. Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]

    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder
    images = [item for item in _get_all_files_in_folders_and_subfolders(path_to_output)
              if item.endswith('jpg')]
    images = sorted(images, key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(images, targets,
                                                                         test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(train_data, train_labels,
                                                                       test_size=0.2, random_state=42)

    # print('Size of splits\ntrain:{}\nval:{}\ntest:{}'.format(len(train_data),
    #                                                          len(val_data),
    #                                                          len(test_data)))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)
        rows = np.column_stack(([os.path.join('images', os.path.basename(item)) for item in data], labels))
        rows = sorted(rows, key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'), header=classes, index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return

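# Added sketch of consuming the labels.csv written by miml() above: each row holds
# a relative image path followed by one column per class (values as stored in the
# original .mat targets). With pandas (already imported) this might look like:
#
#     df = pd.read_csv('./data/MIML/train/labels.csv')  # path is illustrative
#     image_paths, label_matrix = df['filename'], df.iloc[:, 1:].values
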
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the tubule dataset (from the GlaS challenge)
    for semantic segmentation to the location specified on the file system

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../GlaS/train
                             ../GlaS/val
                             ../GlaS/test

                             ../GlaS/test/data -> images
                             ../GlaS/test/gt -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as a String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    def groupby_patient(list_to_group, index=3):
        """
        split images by patient

        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain patient ID
        :return: dictionary where keys are patient IDs and values are lists of images
                 that are from that patient
        """
        return {'-'.join(filename.split('-')[:index]):
                    [file for file in list_to_group
                     if '-'.join(file.split('-')[:index]) == '-'.join(filename.split('-')[:index])]
                for filename in list_to_group}

    def convert_gt(img_path):
        img = pil_loader(img_path)
        out_img = np.zeros((*img.shape, 3), dtype=np.uint8)
        out_img[:, :, 2] = 1  # set everything to background in blue channel
        out_img[:, :, 2][img != 0] = 2  # set glands to 2 in blue channel
        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # link to the tubule data set
    link_tubules = urllib.parse.urlparse('http://andrewjanowczyk.com/wp-static/tubule.tgz')
    download_path_tubules = os.path.join(dataset_root, link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)
    print('Download complete. Unpacking files...')

    # unpack tubule folder that contains images, annotations and text files with lists of
    # benign and malignant samples
    tar_file = tarfile.open(download_path_tubules)
    tar_file.extractall(path=dataset_root)

    sets_dict = {}
    # 20 benign + 20 malignant images
    train_ids_b = ['09-1339-01', '09-16566-03', '09-21631-03', '09-23232-02',
                   'm9_10741F-12T2N0', '10-13799-05']  # 4*5
    train_ids_m = ['09-322-02', '09-16566-02', '10-13799-06', '10-15247-02', 'm6_10719 T3N2a',
                   'm17_1421 IE-11 T3N2a', 'm18_1421 IE-11 1-86', 'm39_10-1273']  # 5*4
    sets_dict['train'] = train_ids_b + train_ids_m

    # validation has 29 images
    val_ids_b = ['10-12813-05', '10-13799-02', 'm2_10449-11E-T3N1b']  # 2*4 + 1 = 9
    val_ids_m = ['09-1339-02', '09-1339-05', '09-1646-01', '09-1646-02', '09-23757-01']  # 5*4 = 20
    sets_dict['val'] = val_ids_b + val_ids_m

    # test has equal mal and ben and 16 img
    test_ids_m = ['09-1646-03', '09-1646-05']  # 2*4 = 8
    test_ids_b = ['10-12813-01', '10-13799-01']  # 2*4 = 8
    sets_dict['test'] = test_ids_b + test_ids_m

    print('Splitting the dataset into train, val and test')
    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))
        print('CREATING {} SET'.format(s))
        for patient in sets_dict[s]:
            for img_file in os.listdir(dataset_root):
                if patient in img_file:
                    if 'anno' in img_file:
                        # convert gt into correct data format
                        convert_gt(os.path.join(dataset_root, img_file))
                        out_file = os.path.join('gt', img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)
                    shutil.move(os.path.join(dataset_root, img_file),
                                os.path.join(dataset_root, s, out_file))

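# Added sketch for sanity-checking the ground truth written by convert_gt() above:
# classes live in the blue channel (1 = background, 2 = gland), so the class map of
# a gt image can be recovered with (filename is illustrative):
#
#     gt = np.array(Image.open('./data/GlaS/test/gt/example.png'))
#     class_map = gt[:, :, 2]  # values in {1, 2}
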
if __name__ == "__main__":
    downloadable_datasets = [name[0] for name in inspect.getmembers(sys.modules[__name__], inspect.isfunction)
                             if not name[0].startswith('_')]

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='This script can be used to download some '
                                                 'datasets and prepare them in a standard format')
    parser.add_argument('--dataset',
                        help='name of the dataset',
                        type=str,
                        choices=downloadable_datasets)
    parser.add_argument('--output-folder',
                        help='path to where the dataset should be generated.',
                        required=False,
                        type=str,
                        default='./data/')
    args = parser.parse_args()
    getattr(sys.modules[__name__], args.dataset)(args)
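# Example command-line invocation (output path is illustrative):
#
#     python util/data/get_a_dataset.py --dataset mnist --output-folder ./data/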