Source code for template.setup

# Utils
import colorlog
import inspect
import json
import logging
import numpy as np
import os
import pandas as pd
import random
import shutil
import subprocess
import sys
import tarfile
import tempfile
import time
# Torch related stuff
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from tensorboardX import SummaryWriter

# DeepDIVA
import models
from datasets import image_folder_dataset, bidimensional_dataset
from util.data.dataset_analytics import compute_mean_std, compute_mean_std_segmentation
from util.data.dataset_integrity import verify_integrity_quick, verify_integrity_deep
from util.misc import get_all_files_in_folders_and_subfolders


def set_up_model(output_channels, model_name, pretrained, no_cuda, resume, load_model, disable_databalancing,
                 dataset_folder, inmem, workers, optimizer_name=None, criterion_name=None, num_classes=None,
                 ablate=False, **kwargs):
    """
    Instantiate model, optimizer, criterion. Load a pretrained model or resume from a checkpoint.

    Parameters
    ----------
    output_channels : int
        Specify shape of final layer of network. Only used if num_classes is not specified.
    model_name : string
        Name of the model
    pretrained : bool
        Specify whether to load a pretrained model or not
    optimizer_name : string
        Name of the optimizer
    criterion_name : string
        Name of the criterion
    no_cuda : bool
        Specify whether to use the GPU or not
    resume : string
        Path to a saved checkpoint
    load_model : string
        Path to a saved model
    start_epoch : int
        Epoch from which to resume training. If not resuming a previous experiment the value is 0
    disable_databalancing : boolean
        If True the criterion will not be fed with the class frequencies. Use with care.
    dataset_folder : String
        Location of the dataset on the file system
    inmem : boolean
        Load the whole dataset in memory. If False, only file names are stored and images are loaded
        on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the dataloaders
    num_classes : int
        Number of classes for the model
    ablate : boolean
        If True, remove the final layer of the given model.

    Returns
    -------
    model : nn.Module
        The actual model
    criterion : nn.loss
        The criterion for the network
    optimizer : torch.optim
        The optimizer for the model
    best_value : float
        Specifies the former best value obtained by the model.
        Relevant only if you are resuming training.
    """
    # Initialize the model
    logging.info('Setting up model {}'.format(model_name))

    output_channels = output_channels if num_classes is None else num_classes
    model = models.__dict__[model_name](output_channels=output_channels, pretrained=pretrained,
                                        ablate=ablate, **kwargs)

    # Get the optimizer created with the specified parameters in kwargs (such as lr, momentum, ... )
    if optimizer_name:
        optimizer = _get_optimizer(optimizer_name, model, **kwargs)

    if criterion_name:
        criterion = _get_criterion(criterion_name, disable_databalancing, dataset_folder, inmem, workers, **kwargs)

    # Transfer model to GPU (if desired)
    if not no_cuda:
        logging.info('Transfer model to GPU')
        model = torch.nn.DataParallel(model).cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # Load saved model
    if load_model:
        if os.path.isfile(load_model):
            # TODO: Remove or make param: map_location
            model_dict = torch.load(load_model, map_location='cpu')
            logging.info('Loading a saved model')
            try:
                model.load_state_dict(model_dict['state_dict'], strict=False)
            except Exception as exp:
                logging.warning(exp)
        else:
            logging.error("No model dict found at '{}'".format(load_model))
            sys.exit(-1)

    # Resume from checkpoint
    if resume:
        if os.path.isfile(resume):
            logging.info("Loading checkpoint '{}'".format(resume))
            checkpoint = torch.load(resume)
            best_value = checkpoint['best_value']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # val_losses = [checkpoint['val_loss']]  # not used?
            logging.info("Loaded checkpoint '{}' (epoch {})".format(resume, checkpoint['epoch']))
        else:
            logging.error("No checkpoint found at '{}'".format(resume))
            sys.exit(-1)
    else:
        best_value = 0.0

    # Some of these might be None depending on the input parameters
    return model, criterion, optimizer, best_value

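
# Illustrative usage sketch (not part of the original DeepDIVA module): how a runner could
# call set_up_model(). The model name 'CNN_basic', the dataset path and the hyper-parameter
# values below are placeholders and assume a matching entry in models.__dict__.
def _example_set_up_model():
    model, criterion, optimizer, best_value = set_up_model(
        output_channels=10,
        model_name='CNN_basic',          # assumed to be registered in models.__dict__
        pretrained=False,
        optimizer_name='SGD',            # looked up in torch.optim.__dict__
        criterion_name='CrossEntropyLoss',  # looked up in torch.nn.__dict__
        no_cuda=True,                    # keep this sketch CPU-only
        resume=None,
        load_model=None,
        disable_databalancing=True,      # skip reading class frequencies from analytics.csv
        dataset_folder='/path/to/data/mnist',
        inmem=False,
        workers=4,
        lr=0.01, momentum=0.9,           # picked up by _get_optimizer via **kwargs
    )
    return model, criterion, optimizer, best_value
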
def _load_class_frequencies_weights_from_file(dataset_folder, inmem, workers, runner_class, **kwargs):
    """
    This function simply recovers class_frequencies_weights from the analytics.csv file

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the mean/std computation
    runner_class : string
        Specifies the runner class (mean and std have to be computed differently for semantic segmentation)

    Returns
    -------
    ndarray[double]
        Class frequencies for the selected dataset, contained in the analytics.csv file.
    """
    csv_file = _load_analytics_csv(dataset_folder, inmem, workers, runner_class, **kwargs)
    return np.array([x for x in csv_file.iloc[2, 1:].values if str(x) != 'nan']).astype(float)


def _load_class_encodings(dataset_folder, inmem, workers, runner_class, **kwargs):
    """
    This function simply recovers class_encodings from the analytics.csv file

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the mean/std computation
    runner_class : string
        Specifies the runner class (mean and std have to be computed differently for semantic segmentation)

    Returns
    -------
    ndarray[double]
        Class encodings for the selected dataset, contained in the analytics.csv file.
    """
    csv_file = _load_analytics_csv(dataset_folder, inmem, workers, runner_class, **kwargs)
    return np.array([x for x in csv_file.iloc[3, 1:].values if str(x) != 'nan']).astype(int)


def _get_optimizer(optimizer_name, model, **kwargs):
    """
    This function serves as an interface between the command line and the optimizer.
    Each optimizer has a different set of parameters, so one can switch optimizers between
    experiments simply by changing the parameters passed to the entry point.

    Parameters
    ----------
    optimizer_name:
        Name of the optimizer. See torch.optim for a list of possible values
    model:
        The model with which the training will be done
    kwargs:
        List of all arguments to be used to init the optimizer

    Returns
    -------
    torch.optim
        The optimizer initialized with the provided parameters
    """
    # Verify the optimizer exists
    assert optimizer_name in torch.optim.__dict__

    params = {}
    # For all arguments declared in the constructor signature of the selected optimizer
    for p in inspect.getfullargspec(torch.optim.__dict__[optimizer_name].__init__).args:
        # Add it to the dictionary if a corresponding value exists in kwargs
        if p in kwargs:
            params.update({p: kwargs[p]})
    # Create and return the optimizer with the correct list of parameters
    return torch.optim.__dict__[optimizer_name](model.parameters(), **params)


def _get_criterion(criterion_name, disable_databalancing, dataset_folder, inmem, workers, **kwargs):
    """
    This function serves as an interface between the command line and the criterion.

    Parameters
    ----------
    criterion_name : string
        Name of the criterion
    disable_databalancing : boolean
        If True the criterion will not be fed with the class frequencies. Use with care.
    dataset_folder : String
        Location of the dataset on the file system
    inmem : boolean
        Load the whole dataset in memory. If False, only file names are stored and images are loaded
        on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the dataloaders

    Returns
    -------
    torch.nn
        The initialized criterion
    """
    # Verify that the criterion exists
    assert criterion_name in torch.nn.__dict__
    # Instantiate the criterion
    criterion = torch.nn.__dict__[criterion_name]()
    if not disable_databalancing:
        try:
            logging.info('Loading weights for data balancing')
            weights = _load_class_frequencies_weights_from_file(dataset_folder, inmem, workers, **kwargs)
            criterion.weight = torch.from_numpy(weights).type(torch.FloatTensor)
        except Exception:
            logging.warning('Unable to load information for data balancing. Using normal criterion')
    return criterion

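
# Illustrative sketch (not part of the original module): _get_optimizer() keeps only the kwargs
# that appear in the optimizer constructor's signature, so unrelated command-line arguments can
# be passed through safely. The model and the argument values below are placeholders.
def _example_get_optimizer():
    model = torch.nn.Linear(10, 2)  # any nn.Module with parameters will do
    # 'epochs' and 'batch_size' are silently dropped because SGD.__init__ does not declare them
    optimizer = _get_optimizer('SGD', model,
                               lr=0.01, momentum=0.9, weight_decay=1e-4,
                               epochs=20, batch_size=64)
    return optimizer
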
def set_up_dataloaders(model_expected_input_size, dataset_folder, batch_size, workers,
                       disable_dataset_integrity, enable_deep_dataset_integrity, inmem=False, **kwargs):
    """
    Set up the dataloaders for the specified datasets.

    Parameters
    ----------
    model_expected_input_size : tuple
        Specify the height and width that the model expects.
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    batch_size : int
        Number of datapoints to process at once
    workers : int
        Number of workers to use for the dataloaders
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.

    Returns
    -------
    train_loader : torch.utils.data.DataLoader
    val_loader : torch.utils.data.DataLoader
    test_loader : torch.utils.data.DataLoader
        Dataloaders for train, val and test.
    int
        Number of classes for the model.
    """
    # Recover dataset name
    dataset = os.path.basename(os.path.normpath(dataset_folder))
    logging.info('Loading {} from:{}'.format(dataset, dataset_folder))

    ###############################################################################################
    # Load the dataset splits as images
    try:
        logging.debug("Try to load dataset as images")
        train_ds, val_ds, test_ds = image_folder_dataset.load_dataset(dataset_folder, inmem, workers)

        # Loads the analytics csv and extract mean and std
        mean, std = _load_mean_std_from_file(dataset_folder, inmem, workers, kwargs['runner_class'])

        # Set up dataset transforms
        logging.debug('Setting up dataset transforms')
        transform = transforms.Compose([
            transforms.Resize(model_expected_input_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

        train_ds.transform = transform
        val_ds.transform = transform
        test_ds.transform = transform

        train_loader, val_loader, test_loader = _dataloaders_from_datasets(batch_size, train_ds, val_ds,
                                                                           test_ds, workers)
        logging.info("Dataset loaded as images")
        _verify_dataset_integrity(dataset_folder, disable_dataset_integrity, enable_deep_dataset_integrity)
        return train_loader, val_loader, test_loader, len(train_ds.classes)

    except RuntimeError:
        logging.debug("No images found in dataset folder provided")

    ###############################################################################################
    # Load the dataset splits as bidimensional
    try:
        logging.debug("Try to load dataset as bidimensional")
        train_ds, val_ds, test_ds = bidimensional_dataset.load_dataset(dataset_folder)

        # Loads the analytics csv and extract mean and std
        # TODO: update bidimensional to work with new load_mean_std functions
        mean, std = _load_mean_std_from_file(dataset_folder, inmem, workers, kwargs['runner_class'])

        # Bring mean and std into range [0:1] from original domain
        mean = np.divide((mean - train_ds.min_coords), np.subtract(train_ds.max_coords, train_ds.min_coords))
        std = np.divide((std - train_ds.min_coords), np.subtract(train_ds.max_coords, train_ds.min_coords))

        # Set up dataset transforms
        logging.debug('Setting up dataset transforms')
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

        train_ds.transform = transform
        val_ds.transform = transform
        test_ds.transform = transform

        train_loader, val_loader, test_loader = _dataloaders_from_datasets(batch_size, train_ds, val_ds,
                                                                           test_ds, workers)
        logging.info("Dataset loaded as bidimensional data")
        _verify_dataset_integrity(dataset_folder, disable_dataset_integrity, enable_deep_dataset_integrity)
        return train_loader, val_loader, test_loader, len(train_ds.classes)

    except RuntimeError:
        logging.debug("No bidimensional data found in dataset folder provided")

    ###############################################################################################
    # Verify that eventually a dataset has been correctly loaded
    logging.error("No datasets have been loaded. Verify dataset folder location or dataset folder structure")
    sys.exit(-1)

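
# Illustrative usage sketch (not part of the original module): loading an image-folder dataset
# organised as <dataset_folder>/{train,val,test}/<class>/*.png. The path and the runner_class
# value are placeholders.
def _example_set_up_dataloaders():
    train_loader, val_loader, test_loader, num_classes = set_up_dataloaders(
        model_expected_input_size=(32, 32),
        dataset_folder='/path/to/data/mnist',
        batch_size=64,
        workers=4,
        disable_dataset_integrity=True,       # skip the footprint.json check in this sketch
        enable_deep_dataset_integrity=False,
        inmem=False,
        runner_class='image_classification',  # forwarded via **kwargs to the analytics helpers
    )
    return train_loader, val_loader, test_loader, num_classes
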
def _verify_dataset_integrity(dataset_folder, disable_dataset_integrity, enable_deep_dataset_integrity):
    """
    Verifies dataset integrity by looking at the footprint.json in the dataset folder.
    If the deep check is enabled, the program is stopped when the check does not pass.

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    disable_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity
    enable_deep_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity in a deep fashion (check the hashes of all files)

    Returns
    -------
    None
    """
    if not disable_dataset_integrity:
        if enable_deep_dataset_integrity:
            if not verify_integrity_deep(dataset_folder):
                sys.exit(-1)
        else:
            verify_integrity_quick(dataset_folder)


def _load_mean_std_from_file(dataset_folder, inmem, workers, runner_class):
    """
    This function simply recovers mean and std from the analytics.csv file

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the mean/std computation
    runner_class : string
        Specifies the runner class (mean and std have to be computed differently for semantic segmentation)

    Returns
    -------
    ndarray[double], ndarray[double]
        Mean and std of the selected dataset, contained in the analytics.csv file.
    """
    # Loads the analytics csv and extract mean and std
    try:
        csv_file = _load_analytics_csv(dataset_folder, inmem, workers, runner_class)
        # Columns 1-3 hold the per-channel values (iloc upper bound is exclusive)
        mean = csv_file.iloc[0, 1:4].values.astype(float)
        std = csv_file.iloc[1, 1:4].values.astype(float)
    except KeyError:
        logging.error('analytics.csv located in {} incorrectly formed. '
                      'Try to delete it and run again'.format(dataset_folder))
        sys.exit(-1)
    return mean, std


def _load_analytics_csv(dataset_folder, inmem, workers, runner_class, **kwargs):
    """
    This function loads the analytics.csv file and attempts creating it, if it is missing

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.
    workers : int
        Number of workers to use for the mean/std computation
    runner_class : string
        Specifies the runner class (mean and std have to be computed differently for semantic segmentation)

    Returns
    -------
    file
        The csv file
    """
    # If analytics.csv file not present, run the analytics on the dataset
    if not os.path.exists(os.path.join(dataset_folder, "analytics.csv")):
        logging.warning('Missing analytics.csv file for dataset located at {}'.format(dataset_folder))
        try:
            logging.warning('Attempt creating analytics.csv file for dataset located at {}'.format(dataset_folder))
            if runner_class is not None and 'segmentation' in runner_class:
                compute_mean_std_segmentation(dataset_folder=dataset_folder, inmem=inmem, workers=workers,
                                              filter_boundaries=True if 'divahisdb' in runner_class else False)
            else:
                compute_mean_std(dataset_folder=dataset_folder, inmem=inmem, workers=workers)
            logging.warning('Created analytics.csv file for dataset located at {} '.format(dataset_folder))
        except Exception:
            logging.error('Creation of analytics.csv failed.')
            sys.exit(-1)
    # Loads the analytics csv
    return pd.read_csv(os.path.join(dataset_folder, "analytics.csv"), header=None)


def _dataloaders_from_datasets(batch_size, train_ds, val_ds, test_ds, workers):
    """
    This function creates (and returns) dataloaders from the dataset objects

    Parameters
    ----------
    batch_size : int
        The size of the mini batch
    train_ds : data.Dataset
    val_ds : data.Dataset
    test_ds : data.Dataset
        Train, validation and test splits
    workers : int
        Number of workers to use to load the data.

    Returns
    -------
    train_loader : torch.utils.data.DataLoader
    val_loader : torch.utils.data.DataLoader
    test_loader : torch.utils.data.DataLoader
        The dataloaders for each split passed
    """
    # Setup dataloaders
    logging.debug('Setting up dataloaders')
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               shuffle=True,
                                               batch_size=batch_size,
                                               num_workers=workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_ds,
                                             batch_size=batch_size,
                                             num_workers=workers,
                                             pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_ds,
                                              batch_size=batch_size,
                                              num_workers=workers,
                                              pin_memory=True)
    return train_loader, val_loader, test_loader

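
# Illustrative sketch (not part of the original module): the row layout that the _load_* helpers
# above assume for analytics.csv (read with header=None). Row 0 holds the per-channel mean,
# row 1 the std, row 2 the class frequencies and row 3 the class encodings; the first column is
# a free-form label. All labels and numbers below are made up.
def _example_write_minimal_analytics_csv(dataset_folder):
    import csv
    with open(os.path.join(dataset_folder, 'analytics.csv'), 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(['mean', 0.13, 0.13, 0.13])               # read by _load_mean_std_from_file
        w.writerow(['std', 0.30, 0.30, 0.30])
        w.writerow(['class_frequencies', 0.1, 0.2, 0.7])     # read by _load_class_frequencies_weights_from_file
        w.writerow(['class_encodings', 0, 1, 2])             # read by _load_class_encodings
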
########################################################################################################################


def set_up_logging(parser, experiment_name, output_folder, quiet, args_dict, debug, **kwargs):
    """
    Set up a logger for the experiment

    Parameters
    ----------
    parser : parser
        The argument parser
    experiment_name : string
        Name of the experiment. If not specified, it is requested from the command line.
    output_folder : string
        Path to where all experiment logs are stored.
    quiet : bool
        Specify whether to print log to console or only to text file
    debug : bool
        Specify the logging level
    args_dict : dict
        Contains the entire argument dictionary specified via command line.

    Returns
    -------
    log_folder : String
        The final logging folder tree
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the tensorboard visualization.
    """
    LOG_FILE = 'logs.txt'

    # Experiment name override
    if experiment_name is None:
        experiment_name = input("Experiment name:")

    # Recover dataset name
    dataset = os.path.basename(os.path.normpath(kwargs['dataset_folder']))

    """
    We extract the TRAIN parameter names (such as model_name, lr, ... ) from the parser directly.
    This is a somewhat risky operation because we access _private_variables of parser classes.
    However, within our context this can be regarded as safe.
    Should we be wrong, a quick fix is writing a list of possible parameters such as:

        train_param_list = ['model_name', 'lr', ...]

    and maintaining it manually (boring!).

    Resources:
    https://stackoverflow.com/questions/31519997/is-it-possible-to-only-parse-one-argument-groups-parameters-with-argparse
    """

    # Fetch all non-default parameters
    non_default_parameters = []
    for group in parser._action_groups[2:]:
        if group.title not in ['GENERAL', 'DATA']:
            for action in group._group_actions:
                if (kwargs[action.dest] is not None) and (
                        kwargs[action.dest] != action.default) and action.dest != 'load_model':
                    non_default_parameters.append(str(action.dest) + "=" + str(kwargs[action.dest]))

    # Build up final logging folder tree with the non-default training parameters
    log_folder = os.path.join(*[output_folder, experiment_name, dataset, *non_default_parameters,
                                '{}'.format(time.strftime('%d-%m-%y-%Hh-%Mm-%Ss'))])
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)

    # Setup logging
    root = logging.getLogger()
    log_level = logging.DEBUG if debug else logging.INFO
    root.setLevel(log_level)
    format = "[%(asctime)s] [%(levelname)8s] --- %(message)s (%(filename)s:%(lineno)s)"
    date_format = '%Y-%m-%d %H:%M:%S'

    if os.isatty(2):
        cformat = '%(log_color)s' + format
        formatter = colorlog.ColoredFormatter(cformat, date_format,
                                              log_colors={
                                                  'DEBUG': 'cyan',
                                                  'INFO': 'white',
                                                  'WARNING': 'yellow',
                                                  'ERROR': 'red',
                                                  'CRITICAL': 'red,bg_white',
                                              })
    else:
        formatter = logging.Formatter(format, date_format)

    if not quiet:
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        root.addHandler(ch)

    fh = logging.FileHandler(os.path.join(log_folder, LOG_FILE))
    fh.setFormatter(logging.Formatter(format, date_format))
    root.addHandler(fh)

    logging.info('Setup logging. Log file: {}'.format(os.path.join(log_folder, LOG_FILE)))

    # Save args to logs_folder
    logging.info('Arguments saved to: {}'.format(os.path.join(log_folder, 'args.txt')))
    with open(os.path.join(log_folder, 'args.txt'), 'w') as f:
        f.write(json.dumps(args_dict))

    # Save all environment packages to logs_folder
    environment_yml = os.path.join(log_folder, 'environment.yml')
    subprocess.call('conda env export > {}'.format(environment_yml), shell=True)

    # Define Tensorboard SummaryWriter
    logging.info('Initialize Tensorboard SummaryWriter')

    # Add all parameters to Tensorboard
    writer = SummaryWriter(log_dir=log_folder)
    writer.add_text('Args', json.dumps(args_dict))

    return log_folder, writer

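
# Illustrative usage sketch (not part of the original module): how a runner script might combine
# set_up_logging() with its parsed arguments. The parser built here is a stand-in; in DeepDIVA the
# parser normally comes from the template argument groups, and all paths/values are placeholders.
def _example_set_up_logging():
    import argparse
    parser = argparse.ArgumentParser()
    train = parser.add_argument_group('TRAIN')  # the first two groups are argparse defaults, hence [2:] above
    train.add_argument('--model-name', default='CNN_basic')
    train.add_argument('--lr', type=float, default=0.001)
    args = vars(parser.parse_args(['--lr', '0.1']))

    log_folder, writer = set_up_logging(parser=parser,
                                        experiment_name='demo_run',
                                        output_folder='./output',
                                        quiet=False,
                                        args_dict=args,
                                        debug=False,
                                        dataset_folder='/path/to/data/mnist',
                                        **args)
    # Only non-default values end up in the folder tree, e.g. ./output/demo_run/mnist/lr=0.1/<timestamp>
    writer.close()
    return log_folder
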
def copy_code(output_folder):
    """
    Makes a tar file with the DeepDIVA code as it exists at runtime.

    Parameters
    ----------
    output_folder : str
        Path to output directory

    Returns
    -------
    None
    """
    # All file extensions to be saved by copy-code.
    FILE_TYPES = ['.sh', '.py']

    # Get DeepDIVA root
    cwd = os.getcwd()
    dd_root = os.path.join(cwd.split('DeepDIVA')[0], 'DeepDIVA')

    files = get_all_files_in_folders_and_subfolders(dd_root)

    # Get all file types in DeepDIVA as specified in FILE_TYPES
    code_files = [item for item in files if item.endswith(tuple(FILE_TYPES))]

    tmp_dir = tempfile.mkdtemp()

    for item in code_files:
        dest = os.path.join(tmp_dir, 'DeepDIVA', item.split('DeepDIVA')[1][1:])
        if not os.path.exists(os.path.dirname(dest)):
            os.makedirs(os.path.dirname(dest))
        shutil.copy(item, dest)

    # TODO: make it save a zipfile instead of a tarfile.
    with tarfile.open(os.path.join(output_folder, 'DeepDIVA.tar.gz'), 'w:gz') as tar:
        tar.add(tmp_dir, arcname='DeepDIVA')

    # Clean up all temporary files
    shutil.rmtree(tmp_dir)

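
# Illustrative usage sketch (not part of the original module): copy_code() can be pointed at the
# log folder returned by set_up_logging() so the archived code sits next to the experiment logs.
# The path is a placeholder; the function expects to run from inside a checkout whose path
# contains 'DeepDIVA'.
def _example_copy_code():
    copy_code(output_folder='./output/demo_run')  # writes ./output/demo_run/DeepDIVA.tar.gz
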
def set_up_env(gpu_id, seed, multi_run, no_cuda, **kwargs):
    """
    Set up the execution environment.

    Parameters
    ----------
    gpu_id : string
        Specify the GPUs to be used
    seed : int
        Seed all possible seeds for deterministic run
    multi_run : int
        Number of runs over the same code to produce mean-variance graph.
    no_cuda : bool
        Specify whether to use the GPU or not

    Returns
    -------
    None
    """
    # Set visible GPUs
    if gpu_id is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id

    # Check if GPUs are available
    gpu_available = torch.cuda.is_available()
    if not gpu_available and not no_cuda:
        logging.warning('There are no GPUs available on this system, or your NVIDIA drivers are outdated.')
        logging.warning('Switch to CPU-only computation using --no-cuda.')
        sys.exit(-1)

    # Seed the random
    if seed is None:
        # If seed is not specified by user, select a random value for the seed and then log it.
        seed = np.random.randint(2 ** 32 - 1)
        logging.info('Randomly chosen seed is: {}'.format(seed))
    else:
        try:
            assert multi_run is None
        except AssertionError:
            logging.warning('Arguments for seed AND multi-run should not be active at the same time!')
            raise SystemExit
        # Disable CuDNN only if seed is specified by user. Otherwise we can assume that the user does not want to
        # sacrifice speed for deterministic behaviour.
        # TODO: Check if setting torch.backends.cudnn.deterministic=True will ensure deterministic behavior.
        # Initial tests show torch.backends.cudnn.deterministic=True does not work correctly.
        if not no_cuda:
            torch.backends.cudnn.enabled = False

    # Python
    random.seed(seed)
    # Numpy random
    np.random.seed(seed)
    # Torch random
    torch.manual_seed(seed)
    if not no_cuda:
        torch.cuda.manual_seed_all(seed)

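
# Illustrative usage sketch (not part of the original module): a typical CPU-only, reproducible
# setup. The GPU id string and seed value are placeholders.
def _example_set_up_env():
    set_up_env(gpu_id=None,     # or e.g. '0,1' to restrict CUDA_VISIBLE_DEVICES
               seed=42,         # fixing the seed also disables CuDNN when CUDA is in use
               multi_run=None,  # must be None whenever a seed is given
               no_cuda=True)
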