Source code for hypertools.tools.reduce

#!/usr/bin/env python

import warnings
from sklearn.decomposition import PCA, FastICA, IncrementalPCA, KernelPCA, FactorAnalysis, TruncatedSVD, SparsePCA, MiniBatchSparsePCA, DictionaryLearning, MiniBatchDictionaryLearning
from sklearn.manifold import TSNE, MDS, SpectralEmbedding, LocallyLinearEmbedding, Isomap
from umap import UMAP
from .._shared.helpers import *
from .normalize import normalize as normalizer
from .align import align as aligner
from .format_data import format_data as formatter

# Registry of supported reduction models, keyed by class name.
# The keys are the strings accepted by the ``reduce`` argument of ``reduce()``.
_SUPPORTED_MODELS = (
    # matrix decomposition
    PCA,
    IncrementalPCA,
    SparsePCA,
    MiniBatchSparsePCA,
    KernelPCA,
    FastICA,
    FactorAnalysis,
    TruncatedSVD,
    DictionaryLearning,
    MiniBatchDictionaryLearning,
    # manifold learning
    TSNE,
    Isomap,
    SpectralEmbedding,
    LocallyLinearEmbedding,
    MDS,
    UMAP,
)
models = {model_cls.__name__: model_cls for model_cls in _SUPPORTED_MODELS}

# main function
@memoize
def reduce(x, reduce='IncrementalPCA', ndims=None, normalize=None, align=None,
           model=None, model_params=None, internal=False, format_data=True):
    """
    Reduces dimensionality of an array, or list of arrays

    Parameters
    ----------
    x : Numpy array or list of arrays
        Data whose dimensionality is reduced with the model selected by
        ``reduce``.

    reduce : str or dict
        Decomposition/manifold learning model to use.  Models supported: PCA,
        IncrementalPCA, SparsePCA, MiniBatchSparsePCA, KernelPCA, FastICA,
        FactorAnalysis, TruncatedSVD, DictionaryLearning,
        MiniBatchDictionaryLearning, TSNE, Isomap, SpectralEmbedding,
        LocallyLinearEmbedding, MDS and UMAP. Can be passed as a string, but
        for finer control of the model parameters, pass as a dictionary, e.g.
        reduce={'model' : 'PCA', 'params' : {'whiten' : True}}. The 'params'
        dictionary is copied, never modified. See scikit-learn specific model
        docs for details on parameters supported for each model. Any other
        object is accepted if it exposes ``fit_transform`` and
        ``n_components`` attributes. If None, ``x`` is returned untouched.

    ndims : int
        Number of dimensions to reduce to. Takes precedence over an
        'n_components' entry in the params dictionary (a warning is issued
        when the two disagree).

    format_data : bool
        Whether or not to first call the format_data function (default: True).

    internal : bool
        If True, always return a list, even when it holds a single array
        (default: False).

    model : None
        Deprecated argument.  Please use reduce.

    model_params : None
        Deprecated argument.  Please use reduce.

    align : None
        Deprecated argument.  Please use new analyze function to perform
        combinations of transformations

    normalize : None
        Deprecated argument.  Please use new analyze function to perform
        combinations of transformations

    Returns
    ----------
    x_reduced : Numpy array or list of arrays
        The reduced data with ndims dimensionality is returned.  If the input
        is a list, a list is returned. When neither ndims nor 'n_components'
        is given, or when every array already has at most that many columns,
        the (formatted) data is returned without being reduced.

    Raises
    ----------
    ValueError
        If a dictionary is passed without the "model" and "params" keys, or if
        the requested model is neither a supported name nor an object exposing
        ``fit_transform`` and ``n_components``.

    """

    # np.bytes_ is the type formerly aliased as np.string_; the alias was
    # removed in NumPy 2.0, so only the surviving name is used here.
    string_types = (str, np.bytes_)

    # deprecation warning: fold the legacy arguments into ``reduce``
    if (model is not None) or (model_params is not None):
        warnings.warn(
            'Model and model params will be deprecated. Please use the '
            'reduce keyword. See API docs for more info: '
            'http://hypertools.readthedocs.io/en/latest/'
            'hypertools.tools.reduce.html#hypertools.tools.reduce')
        reduce = {
            'model': model,
            'params': model_params
        }

    # if model is None, just return data
    if reduce is None:
        return x
    elif isinstance(reduce, string_types):
        model_name = reduce
        model_params = {
            'n_components': ndims
        }
    elif isinstance(reduce, dict):
        try:
            model_name = reduce['model']
            custom_params = reduce['params']
        except KeyError:
            raise ValueError(
                'If passing a dictionary, pass the model as the value of the '
                '"model" key and a dictionary of custom params as the value '
                'of the "params" key.')
        # work on a copy so the caller's dictionary is never mutated; a
        # missing (None) params value means "no custom parameters"
        model_params = {} if custom_params is None else dict(custom_params)
    else:
        # custom model object: validated below, no custom parameters
        model_name = reduce
        model_params = {}

    try:
        # if the model passed is a string, make sure it's one of the
        # supported options
        if isinstance(model_name, string_types):
            model = models[model_name]
        # otherwise check any custom object for necessary methods
        else:
            model = model_name
            getattr(model, 'fit_transform')
            getattr(model, 'n_components')
    except (KeyError, AttributeError):
        raise ValueError(
            'reduce must be one of the supported options or support '
            'n_components and fit_transform methods. See '
            'http://hypertools.readthedocs.io/en/latest/'
            'hypertools.tools.reduce.html#hypertools.tools.reduce '
            'for supported models')

    # check for multiple values from n_components & ndims args
    if 'n_components' not in model_params:
        model_params['n_components'] = ndims
    elif (ndims is not None) and (ndims != model_params['n_components']):
        warnings.warn('Unequal values passed to dims and n_components. '
                      'Using ndims parameter.')
        model_params['n_components'] = ndims

    # convert to common format
    if format_data:
        x = formatter(x, ppca=True)

    # if ndims/n_components is not passed or all data is < ndims-dimensional,
    # just return it
    n_components = model_params['n_components']
    if n_components is None or all(xi.shape[1] <= n_components for xi in x):
        return x

    n_rows = np.vstack(x).shape[0]
    if n_rows == 1:
        warnings.warn('Cannot reduce the dimensionality of a single row of'
                      ' data. Return zeros length of ndims')
        return [np.zeros((1, n_components))]
    elif n_rows < n_components:
        warnings.warn('The number of rows in your data is less than ndims.'
                      ' The data will be reduced to the number of rows.')
        # actually apply what the warning announces, rather than letting the
        # model reject n_components > number of samples
        model_params['n_components'] = n_rows

    # deprecation warnings
    if normalize is not None:
        warnings.warn(
            'The normalize argument will be deprecated for this function. '
            'Please use the analyze function to perform combinations of '
            'these transformations. See API docs for more info: '
            'http://hypertools.readthedocs.io/en/latest/'
            'hypertools.analyze.html#hypertools.analyze')
        x = normalizer(x, normalize=normalize)

    if align is not None:
        warnings.warn(
            'The align argument will be deprecated for this function. '
            'Please use the analyze function to perform combinations of '
            'these transformations. See API docs for more info: '
            'http://hypertools.readthedocs.io/en/latest/'
            'hypertools.analyze.html#hypertools.analyze')
        x = aligner(x, align=align)

    # initialize model
    model = model(**model_params)

    # reduce data
    x_reduced = reduce_list(x, model)

    # return data
    if internal or len(x_reduced) > 1:
        return x_reduced
    else:
        return x_reduced[0]
# sub functions
def reduce_list(x, model):
    """
    Fit ``model`` on all arrays at once and hand back one result per array.

    The arrays in ``x`` are stacked row-wise, passed through
    ``model.fit_transform`` in a single call, and the transformed rows are
    then cut back apart at the original array boundaries.

    Parameters
    ----------
    x : list of arrays
        Arrays to transform jointly; they are stacked with ``np.vstack``.

    model : object
        Any object providing ``fit_transform``.

    Returns
    ----------
    x_r : list of arrays
        Transformed arrays, in the same order as ``x``. Always a list, even
        for a single input array.
    """
    # row offsets at which one input array ends and the next begins
    boundaries = np.cumsum([len(block) for block in x])[:-1]
    embedded = model.fit_transform(np.vstack(x))
    pieces = np.vsplit(embedded, boundaries)
    return list(pieces) if len(x) > 1 else [pieces[0]]