Source code for util.visualization.embedding

"""
This script generates embedding visualization for features produced by the apply_model script.
"""
import argparse
import inspect
import os
import pickle
import sys
from multiprocessing import Pool

import matplotlib as mpl
import torch

# To facilitate plotting on a headless server
mpl.use('Agg')
import matplotlib.pyplot as plt
from tensorboardX import SummaryWriter
import numpy as np
from sklearn.manifold import TSNE, Isomap, MDS
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from util.misc import load_numpy_image, save_numpy_image
from PIL import Image
########################################################################################################################
[docs]def tsne(features, n_components=2):
    """
    Returns the embedded points for TSNE.
    Parameters
    ----------
    features: numpy.ndarray
        contains the input feature vectors.
    n_components: int
        number of components to transform the features into

    Returns
    -------
    embedding: numpy.ndarray
        x,y(z) points that the feature vectors have been transformed into
    """
    embedding = TSNE(n_components=n_components).fit_transform(features)
    return embedding


[docs]def isomap(features, n_components=2):
    """
    Returns the embedded points for Isomap.
    Parameters
    ----------
    features: numpy.ndarray
        contains the input feature vectors.
    n_components: int
        number of components to transform the features into

    Returns
    -------
    embedding: numpy.ndarray
        x,y(z) points that the feature vectors have been transformed into
    """
    embedding = Isomap(n_components=n_components, n_jobs=-1).fit_transform(features)
    return embedding

[docs]def mds(features, n_components=2):
    """
    Returns the embedded points for MDS.
    Parameters
    ----------
    features: numpy.ndarray
        contains the input feature vectors.
    n_components: int
        number of components to transform the features into

    Returns
    -------
    embedding: numpy.ndarray
        x,y(z) points that the feature vectors have been transformed into
    """
    embedding = MDS(n_components=n_components, n_jobs=-1).fit_transform(features)
    return embedding

[docs]def pca(features, n_components=2):
    """
    Returns the embedded points for PCA.
    Parameters
    ----------
    features: numpy.ndarray
        contains the input feature vectors.
    n_components: int
        number of components to transform the features into

    Returns
    -------
    embedding: numpy.ndarray
        x,y(z) points that the feature vectors have been transformed into
    """
    embedding = PCA(n_components=n_components).fit_transform(features)
    return embedding

########################################################################################################################
def _make_embedding(features, labels, embedding, three_d=False):
    """
    Generate an embedding image using features from a model.

    Adapted from https://indico.io/blog/visualizing-with-t-sne/
    Parameters
    ----------
    features: numpy.ndarray
        contains the feature array generated by the apply_model runner class
    labels:  numpy.ndarray
        contains labels for corresponding feature vectors
    embedding: str
        type of embedding to use
    three_d: bool
        specify whether to generate 2d or 3d visualization

    Returns
    -------
    data: numpy.ndarray
        contains an image of the plotted visualization

    """

    plt.style.use(['seaborn-white', 'seaborn-paper'])
    fig = plt.figure(figsize=(8, 8))
    plt.tight_layout()
    mpl.rc("font", family="Times New Roman")

    X = features

    le = LabelEncoder()

    labels = le.fit_transform(labels)

    cmap = plt.cm.get_cmap('jet', len(np.unique(labels)))

    if three_d:
        ax = plt.axes(projection='3d')
        X_embedded = getattr(sys.modules[__name__], embedding)(X, n_components=3)
        ax.scatter3D(X_embedded[:, 0], X_embedded[:, 1], X_embedded[:, 2], c=labels, cmap=cmap)
    else:
        X_embedded = getattr(sys.modules[__name__], embedding)(X)
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap=cmap)

    # plt.colorbar(ticks=range(len(np.unique(labels))))

    fig.canvas.draw()
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    fig.clf()
    plt.close()
    return data


def _load_thumbnail(path):
    """
    Return a thumbnail version of any image
    Parameters
    ----------
    path: str
        path to an image

    Returns
    -------
    img: numpy.ndarray
        resized image of size 16x16
    """
    img = Image.open(path).thumbnail((16, 16))
    img = np.array(img)
    return img

def _make_folder_if_not_exists(path):
    if not os.path.exists(path):
        os.makedirs(path)

def _main(args):
    """
    Main routine of script to generate embeddings.
    Parameters
    ----------
    args : argparse.Namespace
        contains all arguments parsed from input

    Returns
    -------
    None

    """
    with open(args.results_file, 'rb') as f:
        results = pickle.load(f)

    features, preds, labels, filenames = results

    _make_folder_if_not_exists(os.path.dirname(args.output_file))

    if args.tensorboard:
        if args.output.endwith('.png'):
            output_loc = os.path.dirname(args.output)
        else:
            output_loc = args.output
        writer = SummaryWriter(log_dir=output_loc)
        # with Pool(16) as pool:
        # images = pool.map(_load_thumbnail, filenames)
        writer.add_embedding(torch.from_numpy(features), metadata=labels,
                             # label_img=torch.from_numpy(np.array(images)).unsqueeze(1))
                             label_img=None)
        return
    else:
        viz_img = _make_embedding(features=features, labels=labels, embedding=args.embedding, three_d=args.three_d)
        save_numpy_image(args.output_file, viz_img)
        return


if __name__ == "__main__":
    # Embedding options:
    embedding_options = [name[0] for name in inspect.getmembers(sys.modules[__name__], inspect.isfunction)]

    parser = argparse.ArgumentParser()

    parser.add_argument('--results-file',
                        type=str,
                        help='path to a results pickle file')

    parser.add_argument('--embedding',
                        help='which embedding to use for the features',
                        choices=embedding_options,
                        type=str)
    parser.add_argument('--output',
                        type=str,
                        default='./output.png',
                        help='path to generate output image')
    parser.add_argument('--3d',
                        dest='three_d',
                        action='store_true',
                        default=False,
                        help='enable 3d plots')
    parser.add_argument('--tensorboard',
                        action='store_true',
                        default=False,
                        help='store embeddings to tensorboard')

    args = parser.parse_args()

    _main(args)