Source code for hypertools.tools.describe

#!/usr/bin/env python

from __future__ import division
from builtins import range
import warnings
import numpy as np
from scipy.stats import pearsonr
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
from .reduce import reduce as reducer
from .format_data import format_data as formatter
from .._shared.helpers import memoize


def describe(x, reduce='IncrementalPCA', max_dims=None, show=True, format_data=True):
    """
    Create a plot describing correlation with the raw data as a function of
    the number of dimensions

    This function correlates the raw data with reduced data to get a sense
    for how well the data can be summarized with n dimensions. Useful for
    evaluating the quality of dimensionality-reduced plots.

    Parameters
    ----------
    x : Numpy array, DataFrame or list of arrays/dfs
        A list of Numpy arrays or Pandas DataFrames

    reduce : str or dict
        Decomposition/manifold learning model to use. Models supported: PCA,
        IncrementalPCA, SparsePCA, MiniBatchSparsePCA, KernelPCA, FastICA,
        FactorAnalysis, TruncatedSVD, DictionaryLearning,
        MiniBatchDictionaryLearning, TSNE, Isomap, SpectralEmbedding,
        LocallyLinearEmbedding, and MDS. Can be passed as a string, but for
        finer control of the model parameters, pass as a dictionary, e.g.
        reduce={'model' : 'PCA', 'params' : {'whiten' : True}}. See
        scikit-learn specific model docs for details on parameters supported
        for each model.

    max_dims : int
        Maximum number of dimensions to consider

    show : bool
        Plot the result (default: True)

    format_data : bool
        Whether or not to first call the format_data function (default: True).

    Returns
    -------
    result : dict
        A dictionary with the analysis results. 'average' is the correlation
        by number of components for all data. 'individual' is a list of
        lists, where each list is a correlation by number of components
        vector (for each input list).
    """

    warnings.warn('When input data is large, this computation can take a long time.')

    def summary(x, max_dims=None):

        # if data is a list, stack it
        if type(x) is list:
            x = np.vstack(x)

        # if max_dims is not set, use the smaller of the two data dimensions
        if max_dims is None:
            if x.shape[1] > x.shape[0]:
                max_dims = x.shape[0]
            else:
                max_dims = x.shape[1]

        # pairwise distance matrix for the full-dimensional data
        alldims = get_cdist(x)

        # correlate the full-dimensional distances with distances computed on
        # reductions of the data at each candidate dimensionality
        corrs = []
        for dims in range(2, max_dims):
            reduced = get_cdist(reducer(x, ndims=dims, reduce=reduce))
            corrs.append(get_corr(alldims, reduced))
            del reduced
        return corrs

    # convert input to a common format
    if format_data:
        x = formatter(x, ppca=True)

    # a dictionary to store results
    result = {}
    result['average'] = summary(x, max_dims)
    result['individual'] = [summary(x_i, max_dims) for x_i in x]

    if max_dims is None:
        max_dims = len(result['average'])

    # if show, plot it
    if show:
        fig, ax = plt.subplots()
        ax = sns.tsplot(data=result['individual'],
                        time=[i for i in range(2, max_dims + 2)],
                        err_style="unit_traces")
        ax.set_title('Correlation with raw data by number of components')
        ax.set_ylabel('Correlation')
        ax.set_xlabel('Number of components')
        plt.show()

    return result
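
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; `_demo` is hypothetical and not part of the
# hypertools API). Random data is used, and show=False sidesteps the seaborn
# tsplot call above, which is deprecated in recent seaborn releases.
# ---------------------------------------------------------------------------
def _demo():
    data = [np.random.randn(100, 10) for _ in range(3)]  # three 100x10 datasets
    result = describe(data, reduce='PCA', max_dims=8, show=False)
    # result['average'] holds one correlation per n_components in 2..max_dims-1;
    # result['individual'] holds one such curve per input array (3 here)
    return result
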
@memoize
def get_corr(reduced, alldims):
    return pearsonr(alldims.ravel(), reduced.ravel())[0]


@memoize
def get_cdist(x):
    return cdist(x, x)
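
# ---------------------------------------------------------------------------
# `memoize` is imported from hypertools' private helpers and is not shown on
# this page. `_memoize_sketch` below is a hypothetical stand-in illustrating
# the general idea, not the actual helper; it assumes positional ndarray
# arguments, as in get_corr and get_cdist above.
# ---------------------------------------------------------------------------
import functools


def _memoize_sketch(fn):
    """Hypothetical stand-in for the imported memoize decorator."""
    cache = {}

    @functools.wraps(fn)
    def wrapper(*args):
        # ndarrays are unhashable, so key the cache on their raw bytes
        key = tuple(a.tobytes() if isinstance(a, np.ndarray) else a
                    for a in args)
        if key not in cache:
            cache[key] = fn(*args)
        return cache[key]

    return wrapper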