Source code for datasets.bidimensional_dataset

"""
Load a dataset of bidimensional points by specifying the folder where it is located.
"""

# Utils
import logging
import os
import os.path
import sys
import numpy as np
import pandas

# Torch related stuff
import torch.utils.data as data


def load_dataset(dataset_folder):
    """
    Load the train, validation and test splits of a bidimensional dataset.

    The dataset is expected to be in the following structure, where
    'dataset_folder' has to point to the root of the three folders
    train/val/test, each of which contains a 'data.csv' file.

    Example:

        dataset_folder = "~/../../data/bd_xor"

    which contains the splits sub-folders as follows:

        'dataset_folder'/train/data.csv
        'dataset_folder'/val/data.csv
        'dataset_folder'/test/data.csv

    Parameters
    ----------
    dataset_folder : string
        Path to the dataset on the file system. A leading '~' is expanded
        to the user's home directory.

    Returns
    -------
    train_ds : data.Dataset
    val_ds : data.Dataset
    test_ds : data.Dataset
        Train, validation and test splits

    Raises
    ------
    SystemExit
        With exit code -1 if any of the three data.csv files is missing.
    """
    # Expand '~' before touching the file system: os.path.exists() does not
    # do it, so a path like the one in the example above would otherwise be
    # reported as missing even when the files are there.
    dataset_folder = os.path.expanduser(dataset_folder)

    # Label used in the error message -> location of the split's csv file
    split_files = [
        ("Train", os.path.join(dataset_folder, 'train', 'data.csv')),
        ("Val", os.path.join(dataset_folder, 'val', 'data.csv')),
        ("Test", os.path.join(dataset_folder, 'test', 'data.csv')),
    ]

    # Sanity check on the splits: abort on the first missing file
    for label, csv_path in split_files:
        if not os.path.exists(csv_path):
            logging.error("%s data.csv not found in the dataset_folder=%s", label, dataset_folder)
            sys.exit(-1)

    # Build the datasets in train, val, test order
    train_ds, val_ds, test_ds = [Bidimensional(csv_path) for _, csv_path in split_files]
    return train_ds, val_ds, test_ds
class Bidimensional(data.Dataset):
    """
    Dataset of labelled bidimensional points loaded from a data.csv file.

    Every row of the csv is unpacked as ``x, y, label``, so the file must
    provide exactly three columns.
    """

    def __init__(self, path, transform=None, target_transform=None):
        """
        Load the data.csv file and prepare it as a dataset.

        Parameters
        ----------
        path : string
            Path to the csv file on the file system ('~' is expanded)
        transform : torchvision.transforms
            Transformation to apply on the data
        target_transform : torchvision.transforms
            Transformation to apply on the labels
        """
        self.path = os.path.expanduser(path)
        self.transform = transform
        self.target_transform = target_transform

        # Read data from the csv file
        self.data = pandas.read_csv(self.path).values

        # Shuffle the rows once, in place (otherwise you get clusters of
        # samples of the same class in each minibatch for val and test)
        np.random.shuffle(self.data)

        # Per-coordinate bounds, used to rescale the points in __getitem__
        self.min_coords = np.min(self.data[:, 0]), np.min(self.data[:, 1])
        self.max_coords = np.max(self.data[:, 0]), np.max(self.data[:, 1])

        # Set expected class attributes (np.unique already returns the
        # sorted distinct labels, a single call is enough)
        self.classes = np.unique(self.data[:, 2])
        self.num_classes = len(self.classes)

    def __getitem__(self, index):
        """
        Retrieve a sample by index

        Parameters
        ----------
        index : int

        Returns
        -------
        point : numpy.ndarray or output of ``transform``
            The raw ``[x, y]`` array when no transform is set; otherwise the
            result of ``transform`` applied to the point rescaled into
            [0, 255] and reshaped to (1, 1, 2).
        target : int or output of ``target_transform``
            Label of the point
        """
        x, y, target = self.data[index]
        point = np.array([x, y])
        target = target.astype(np.int64)

        if self.transform is not None:
            # The transform is expected to behave like
            # torchvision.transforms.ToTensor(), which takes a numpy.ndarray
            # laid out as (H x W x C) with values in the range [0, 255].
            # NOTE(review): per the torchvision docs, ToTensor only rescales
            # to [0.0, 1.0] for uint8 input; the array built here is not
            # uint8 - confirm the transform pipeline handles this as intended.

            # Bring from domain range into [0;255]. A coordinate on which all
            # points agree has a zero span; divide by 1 instead so it maps to
            # 0 rather than NaN.
            span = np.subtract(self.max_coords, self.min_coords)
            span = np.where(span == 0, 1, span)
            point = np.divide((point - self.min_coords), span) * 255

            # Reshape into (H x W x C)
            point = point.reshape(1, 1, 2)

            # Apply transforms
            point = self.transform(point)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return point, target

    def __len__(self):
        """Return the number of points in the dataset."""
        return len(self.data)