Source code for hypertools.tools.describe
#!/usr/bin/env python
from __future__ import division
from builtins import range
import warnings
import numpy as np
from scipy.stats.stats import pearsonr
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
from .reduce import reduce as reducer
from .format_data import format_data as formatter
from .._shared.helpers import memoize
def describe(x, reduce='IncrementalPCA', max_dims=None, show=True,
             format_data=True):
    """
    Plot how well reduced data tracks the raw data by number of dimensions

    For each candidate number of dimensions, the data are reduced, the
    pairwise distance matrix of the reduced data is computed, and that matrix
    is Pearson-correlated with the pairwise distance matrix of the raw data.
    Useful for evaluating the quality of dimensionality reduced plots.

    Parameters
    ----------

    x : Numpy array, DataFrame or list of arrays/dfs
        A list of Numpy arrays or Pandas Dataframes

    reduce : str or dict
        Decomposition/manifold learning model to use.  Models supported: PCA,
        IncrementalPCA, SparsePCA, MiniBatchSparsePCA, KernelPCA, FastICA,
        FactorAnalysis, TruncatedSVD, DictionaryLearning,
        MiniBatchDictionaryLearning, TSNE, Isomap, SpectralEmbedding,
        LocallyLinearEmbedding, and MDS. Can be passed as a string, but for
        finer control of the model parameters, pass as a dictionary, e.g.
        reduce={'model' : 'PCA', 'params' : {'whiten' : True}}.
        See scikit-learn specific model docs for details on parameters
        supported for each model.

    max_dims : int
        Exclusive upper bound on the number of dimensions to consider: the
        data are reduced to 2, 3, ..., max_dims - 1 dimensions. If None, the
        smaller of the number of rows and columns of the data being
        summarized is used.

    show : bool
        Plot the result (default : true)

    format_data : bool
        Whether or not to first call the format_data function (default: True).

    Returns
    ----------

    result : dict
        A dictionary with the analysis results. 'average' is the correlation
        by number of components computed on all data stacked together.
        'individual' is a list of lists, where each list is a correlation by
        number of components vector (one per element of x). When max_dims is
        None each element uses its own default bound, so the lists may differ
        in length.

    """

    warnings.warn('When input data is large, this computation can take a long time.')

    def summary(x, max_dims=None):
        """Return the correlation with the raw data for 2..max_dims-1 dimensions."""

        # if data is a list, stack it into a single array
        if isinstance(x, list):
            x = np.vstack(x)

        # if max dims is not set, use the smaller of the row and column counts
        if max_dims is None:
            max_dims = min(x.shape[0], x.shape[1])

        # pairwise distance matrix of the raw (full-dimensional) data
        alldims = get_cdist(x)

        corrs = []
        for dims in range(2, max_dims):
            reduced = get_cdist(reducer(x, ndims=dims, reduce=reduce))
            corrs.append(get_corr(alldims, reduced))
            # drop the reference to the distance matrix before the next pass
            del reduced
        return corrs

    # common format
    if format_data:
        x = formatter(x, ppca=True)

    # a dictionary to store results
    result = {}
    result['average'] = summary(x, max_dims)
    result['individual'] = [summary(x_i, max_dims) for x_i in x]

    # if show, plot it
    if show:
        # The curves start at 2 components and hold one value per evaluated
        # dimensionality, so the axis is derived from the number of values
        # actually computed. Deriving it from max_dims directly is only
        # correct when max_dims was left as None.
        n_points = len(result['average'])
        components = list(range(2, n_points + 2))

        _, ax = plt.subplots()
        ax = sns.tsplot(data=result['individual'], time=components,
                        err_style="unit_traces", ax=ax)
        ax.set_title('Correlation with raw data by number of components')
        ax.set_ylabel('Correlation')
        ax.set_xlabel('Number of components')
        plt.show()

    return result
# NOTE(review): memoize comes from _shared.helpers and is not visible here;
# confirm how it keys ndarray arguments so distinct arrays cannot collide.
@memoize
def get_corr(reduced, alldims):
    """
    Return the Pearson correlation between two arrays after flattening them

    Both arrays are flattened with ravel(), so they must hold the same number
    of elements. The argument order does not affect the result, since the
    Pearson coefficient is symmetric.
    """
    flat_all = alldims.ravel()
    flat_reduced = reduced.ravel()
    # pearsonr returns (coefficient, p-value); only the coefficient is used
    stats = pearsonr(flat_all, flat_reduced)
    return stats[0]
# NOTE(review): memoize comes from _shared.helpers and is not visible here;
# confirm how it keys ndarray arguments so distinct arrays cannot collide.
@memoize
def get_cdist(x):
    """
    Return the pairwise distance matrix between the rows of x

    Distances are computed by scipy's cdist with its default metric
    (Euclidean), comparing x against itself.
    """
    pairwise = cdist(x, x)
    return pairwise