Source code for util.data.dataset_analytics

"""
This script performs some analysis on the provided dataset.
In particular, it computes the std and mean (to be used to center/normalize your dataset).

Structure of the dataset expected:

Split folders
-------------
'args.dataset-folder' has to point to the parent of the train folder.
Example:

        ~/../../data/svhn

where the dataset_folder contains the train sub-folder as follows:

    args.dataset_folder/train

Classes folders
---------------
The train split should have each class in a separate folder named after the class.
The file names can be arbitrary (e.g. they do not have to be 0-* for class 0 of MNIST).
Example:

    train/dog/whatever.png
    train/dog/you.png
    train/dog/like.png

    train/cat/123.png
    train/cat/nsdf3.png
    train/cat/asd932_.png
"""

# Utils
import argparse
import logging
import os
from multiprocessing import Pool
import numpy as np
import pandas as pd
from PIL import Image

# Torch related stuff
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from util.misc import load_numpy_image

def compute_mean_std(dataset_folder, inmem, workers):
    """
    Computes mean and std of a dataset. Saves the results as CSV file in the dataset folder.

    Parameters
    ----------
    dataset_folder : String (path)
        Path to the dataset folder (see above for details)
    inmem : Boolean
        Specifies whether it should be computed in an offline (in-memory) or online fashion.
    workers : int
        Number of workers to use for the mean/std computation

    Returns
    -------
    None
    """
    # Getting the train dir
    traindir = os.path.join(dataset_folder, 'train')

    # Sanity check on the training folder
    if not os.path.isdir(traindir):
        logging.warning("Train folder not found in the args.dataset_folder={}".format(dataset_folder))
        return

    # Load the dataset file names
    train_ds = datasets.ImageFolder(traindir, transform=transforms.Compose([transforms.ToTensor()]))

    # Extract the actual file names and labels as entries
    file_names = np.asarray([item[0] for item in train_ds.imgs])

    # Compute mean and std
    if inmem:
        mean, std = cms_inmem(file_names)
    else:
        mean, std = cms_online(file_names, workers)

    # Check if the dataset is a multi-label dataset
    if not os.path.exists(os.path.join(traindir, 'labels.csv')):
        # Use normal class frequency computation
        class_frequencies_weights = _get_class_frequencies_weights(train_ds, workers)
    else:
        # Use multi-label class frequency computation
        class_frequencies_weights = _get_class_frequencies_weights_multilabel(os.path.join(traindir, 'labels.csv'))

    # Save results as CSV file in the dataset folder
    df = pd.DataFrame([mean, std, class_frequencies_weights])
    df.index = ['mean[RGB]', 'std[RGB]', 'class_frequencies_weights[num_classes]']
    df.to_csv(os.path.join(dataset_folder, 'analytics.csv'), header=False)

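# The resulting analytics.csv has no header and one row per statistic; for a 3-class RGB
# dataset it looks roughly like this (the numbers are made-up placeholders):
#
#     mean[RGB],0.457,0.443,0.408
#     std[RGB],0.229,0.224,0.225
#     class_frequencies_weights[num_classes],0.184,0.316,0.5
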
def compute_mean_std_segmentation(dataset_folder, inmem, workers, filter_boundaries):
    """
    Computes mean and std of a dataset for semantic segmentation. Saves the results as CSV file in the dataset folder.

    Parameters
    ----------
    dataset_folder : String (path)
        Path to the dataset folder (see above for details)
    inmem : Boolean
        Specifies whether it should be computed in an offline (in-memory) or online fashion.
    workers : int
        Number of workers to use for the mean/std computation
    filter_boundaries : bool
        Specifies whether the boundary pixels should be removed or not

    Returns
    -------
    None
    """
    # Getting the train dir
    traindir = os.path.join(dataset_folder, 'train')

    # Load the dataset file names
    train_ds = datasets.ImageFolder(traindir, transform=transforms.Compose([transforms.ToTensor()]))

    # Extract the actual file names and labels as entries
    file_names_all = np.asarray([item[0] for item in train_ds.imgs])
    file_names_gt = np.asarray([f for f in file_names_all if '/gt/' in f])
    file_names_data = np.asarray([f for f in file_names_all if '/data/' in f])

    # Compute mean and std
    if inmem:
        mean, std = cms_inmem(file_names_data)
    else:
        mean, std = cms_online(file_names_data, workers)

    # Compute class frequencies weights
    class_frequencies_weights, class_ints = _get_class_frequencies_weights_segmentation(file_names_gt, filter_boundaries)

    # Save results as CSV file in the dataset folder
    df = pd.DataFrame([mean, std, class_frequencies_weights, class_ints])
    df.index = ['mean[RGB]', 'std[RGB]', 'class_frequencies_weights[num_classes]', 'class_encodings']
    df.to_csv(os.path.join(dataset_folder, 'analytics.csv'), header=False)

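# Expected layout for the segmentation variant, inferred from the '/data/' and '/gt/'
# filters above (the file names are illustrative):
#
#     args.dataset_folder/train/data/img_0001.png   # input image
#     args.dataset_folder/train/gt/img_0001.png     # pixel-wise ground-truth labels
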
# Loads an image and returns the channel-wise means of the image.
def _return_mean(image_path):
    img = load_numpy_image(image_path)
    mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]) / 255.0
    return mean


# Loads an image and returns the channel-wise sum of squared deviations from the mean,
# together with the per-channel pixel count.
def _return_std(image_path, mean):
    img = load_numpy_image(image_path) / 255.0
    m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
    return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

def cms_online(file_names, workers):
    """
    Computes mean and standard deviation in an online fashion.
    This is useful when the dataset is too big to be allocated in memory.

    Parameters
    ----------
    file_names : List of String
        List of file names of the dataset
    workers : int
        Number of workers to use for the mean/std computation

    Returns
    -------
    mean : double
    std : double
    """
    # Set up a pool of workers
    pool = Pool(workers)

    logging.info('Begin computing the mean')

    # Online mean
    results = pool.map(_return_mean, file_names)
    mean_sum = np.sum(np.array(results), axis=0)

    # Divide by number of samples in train set
    mean = mean_sum / file_names.size

    logging.info('Finished computing the mean')
    logging.info('Begin computing the std')

    # Online standard deviation
    results = pool.starmap(_return_std, [[item, mean] for item in file_names])
    std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
    total_pixel_count = np.sum(np.array([item[1] for item in results]))
    std = np.sqrt(std_sum / total_pixel_count)

    logging.info('Finished computing the std')

    # Shut down the pool
    pool.close()

    return mean, std

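# Sanity-check sketch (hypothetical data, not part of the original module): for images
# of equal size, the two-pass aggregation used by cms_online is equivalent to computing
# the channel-wise mean and std over all pixels of all images at once.
#
#     import numpy as np
#     imgs = [np.random.rand(4, 4, 3), np.random.rand(4, 4, 3)]   # two fake RGB images in [0, 1]
#     per_image_means = np.array([im.reshape(-1, 3).mean(axis=0) for im in imgs])
#     mean = per_image_means.mean(axis=0)                          # pass 1: average of per-image channel means
#     sq_dev = sum(((im - mean) ** 2).reshape(-1, 3).sum(axis=0) for im in imgs)
#     pixels = sum(im.shape[0] * im.shape[1] for im in imgs)
#     std = np.sqrt(sq_dev / pixels)                               # pass 2: sqrt of the mean squared deviation
#     # matches np.concatenate([im.reshape(-1, 3) for im in imgs]).std(axis=0)
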
def cms_inmem(file_names):
    """
    Computes mean and standard deviation in an offline fashion.
    This is possible only when the dataset can be allocated in memory.

    Parameters
    ----------
    file_names : List of String
        List of file names of the dataset

    Returns
    -------
    mean : double
    std : double
    """
    img = np.zeros([file_names.size] + list(load_numpy_image(file_names[0]).shape))

    # Load all samples
    for i, sample in enumerate(file_names):
        img[i] = load_numpy_image(sample)

    mean = np.array([np.mean(img[:, :, :, 0]), np.mean(img[:, :, :, 1]), np.mean(img[:, :, :, 2])]) / 255.0
    std = np.array([np.std(img[:, :, :, 0]), np.std(img[:, :, :, 1]), np.std(img[:, :, :, 2])]) / 255.0

    return mean, std

def _get_class_frequencies_weights(dataset, workers):
    """
    Get the weights proportional to the inverse of their class frequencies.
    The vector sums up to 1

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Dataset for the training set
    workers : int
        Number of workers to use for the computation

    Returns
    -------
    ndarray[double] of size (num_classes)
        The weights vector as a 1D array normalized (sums up to 1)
    """
    logging.info('Begin computing class frequencies weights')

    # Try to read the labels directly from the dataset; otherwise fall back to iterating it
    all_labels = None
    try:
        all_labels = [item[1] for item in dataset.imgs]
    except AttributeError:
        try:
            all_labels = [item for item in dataset.labels]
        except AttributeError:
            pass
    if all_labels is None:
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=64, num_workers=workers)
        all_labels = []
        for _, label in data_loader:
            all_labels.append(label)
        all_labels = np.concatenate(all_labels).reshape(len(dataset))

    total_num_samples = len(all_labels)
    num_samples_per_class = np.unique(all_labels, return_counts=True)[1]
    class_frequencies = (num_samples_per_class / total_num_samples)
    logging.info('Finished computing class frequencies weights')
    logging.info('Class frequencies (rounded): {class_frequencies}'
                 .format(class_frequencies=np.around(class_frequencies * 100, decimals=2)))
    # Normalize vector to sum up to 1.0 (in case the Loss function does not do it)
    return (1 / num_samples_per_class) / ((1 / num_samples_per_class).sum())


def _get_class_frequencies_weights_segmentation(gt_images, filter_boundaries):
    """
    Get the weights proportional to the inverse of their class frequencies.
    The vector sums up to 1

    Parameters
    ----------
    gt_images : list of strings
        Paths to all ground truth images, which contain the pixel-wise labels
    filter_boundaries : bool
        Specifies whether the boundary pixels should be removed or not

    Returns
    -------
    ndarray[double] of size (num_classes)
        The weights vector as a 1D array normalized (sums up to 1)
    ndarray[int]
        The integer values the classes are encoded with
    """
    logging.info('Begin computing class frequencies weights')

    total_num_pixels = 0
    label_counter = {}

    for path in gt_images:
        img = np.array(Image.open(path))
        if filter_boundaries:
            mask = img[:, :, 0].astype(np.uint8) == 128
            img[mask, 2] = 1
        img = img[:, :, 2].flatten()
        total_num_pixels += len(img)
        for i, j in zip(*np.unique(img, return_counts=True)):
            label_counter[i] = label_counter.get(i, 0) + j

    classes = np.array(sorted(label_counter.keys()))
    num_samples_per_class = np.array([label_counter[k] for k in classes])
    class_frequencies = (num_samples_per_class / total_num_pixels)
    logging.info('Finished computing class frequencies weights')
    logging.info('Class frequencies (rounded): {class_frequencies}'
                 .format(class_frequencies=np.around(class_frequencies * 100, decimals=2)))
    # Normalize vector to sum up to 1.0 (in case the Loss function does not do it)
    return (1 / num_samples_per_class) / ((1 / num_samples_per_class).sum()), classes

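# Worked example (hypothetical counts, not part of the original module): with class
# counts [100, 50, 10] the inverse-frequency weights returned above are
# (1 / counts) / sum(1 / counts), so the rarest class gets the largest weight.
#
#     import numpy as np
#     num_samples_per_class = np.array([100, 50, 10])   # hypothetical per-class counts
#     weights = (1 / num_samples_per_class) / ((1 / num_samples_per_class).sum())
#     # -> array([0.07692308, 0.15384615, 0.76923077]), which sums to 1.0
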
def _get_class_frequencies_weights_coco(dataset, name_onehotindex, **kwargs):
    """
    Get the weights proportional to the inverse of their class frequencies.
    The vector sums up to 1

    Parameters
    ----------
    dataset : pycocotools.coco.COCO
        COCO dataset loaded with the pycocotools and the torchvision dataset loader
    name_onehotindex : dict
        Dictionary containing the class names and the corresponding index for argmax

    Returns
    -------
    ndarray[double] of size (num_classes)
        The weights vector as a 1D array normalized (sums up to 1)
    """
    logging.info('Begin computing class frequencies weights')

    count_labels = {v: 0 for v in name_onehotindex.values()}
    for (_, gt_mask) in dataset:
        for k, v in zip(*np.unique(np.array(gt_mask).flatten(), return_counts=True)):
            count_labels[k] += v

    total_num_samples = sum(count_labels.values())
    num_samples_per_class = np.array([count_labels[k] for k in sorted(count_labels.keys())])
    class_frequencies = (num_samples_per_class / total_num_samples)
    logging.info('Finished computing class frequencies weights')
    logging.info('Class frequencies (rounded): {class_frequencies}'
                 .format(class_frequencies=np.around(class_frequencies * 100, decimals=2)))
    # Normalize vector to sum up to 1.0 (in case the Loss function does not do it)
    return (1 / num_samples_per_class) / ((1 / num_samples_per_class).sum())


def _get_class_frequencies_weights_multilabel(dataset_labels):
    """
    Computes the weights for each class (as required by torch.nn.BCEWithLogitsLoss).
    The weight for each class is #neg_samples / #pos_samples.

    Parameters
    ----------
    dataset_labels : String (path)
        Path to a labels.csv file with labels for each training sample

    Returns
    -------
    ndarray[double] of size (num_classes)
        The weights vector as a 1D array
    """
    logging.info('Begin computing class weights')

    labels_df = pd.read_csv(dataset_labels)
    classes = labels_df.columns
    labels = labels_df.values

    # Replace all -1 with 0
    labels[labels == -1] = 0

    # Remove the filenames
    labels = labels[:, 1:]

    weights = []
    for i in range(len(labels[0])):
        pos = len(np.where(labels[:, i] == 1)[0])
        neg = len(labels) - pos
        weight = neg / pos
        weights.append(weight)
    weights = np.array(weights)

    logging.info('Finished computing class weights')
    logging.info('Class weights (rounded): {}'.format(np.around(weights, decimals=2)))
    return weights


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s - %(filename)s:%(funcName)s %(levelname)s: %(message)s',
        level=logging.INFO
    )

    ###############################################################################
    # Argument Parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This script performs some analysis on the provided dataset')

    parser.add_argument('--dataset-folder',
                        help='location of the dataset on the machine e.g root/data',
                        required=True,
                        type=str)

    parser.add_argument('--online',
                        action='store_true',
                        help='Compute the statistics in an online fashion (because the dataset probably will not fit in memory)')

    parser.add_argument('--workers',
                        type=int,
                        default=4,
                        help='Number of workers to use for the mean/std computation')

    args = parser.parse_args()

    # --online selects the out-of-core (online) computation, i.e. not in-memory
    compute_mean_std(dataset_folder=args.dataset_folder,
                     inmem=not args.online,
                     workers=args.workers)