Source code for hypertools.tools.normalize

#!/usr/bin/env python
from __future__ import division
from builtins import range

import numpy as np

from .format_data import format_data as formatter
from .._shared.helpers import memoize


@memoize
def normalize(x, normalize='across', internal=False, format_data=True):
    """
    Z-transform the columns or rows of an array, or list of arrays

    This function normalizes the rows or columns of the input array(s).  This
    can be useful because data reduction and machine learning techniques are
    sensitive to scaling differences between features. By default, the function
    is set to normalize 'across' the columns of all lists, but it can also
    normalize the columns 'within' each individual list, or alternatively, for
    each row in the array.

    Parameters
    ----------
    x : Numpy array or list of arrays
        This can either be a single array, or list of arrays

    normalize : str or False or None
        If set to 'across', the columns of the input data will be z-scored
        across lists (default). That is, the z-scores will be computed with
        respect to column n across all arrays passed in the list. If set
        to 'within', the columns will be z-scored within each list that is
        passed. If set to 'row', each row of the input data will be z-scored.
        If set to False or None, the input data will be returned unchanged,
        with no z-scoring and no formatting.

    internal : bool
        If True, always return a list of arrays, even when it holds a single
        array (default: False).

    format_data : bool
        Whether or not to first call the format_data function (default: True).

    Returns
    -------
    normalized_x : Numpy array or list of arrays
        The z-scored data. A list of arrays is returned when `internal` is
        True or when more than one array was normalized; otherwise the single
        normalized array is returned. Columns (or rows) whose reference data
        is constant are set to zero rather than divided by a zero standard
        deviation.

    Raises
    ------
    AssertionError
        If `normalize` is not one of 'across', 'within', 'row', False or None.

    """

    # Kept as an assert so callers that catch AssertionError keep working.
    assert normalize in ['across', 'within', 'row', False, None], \
        "normalize must be 'across', 'within', 'row', False or None."

    if normalize in [False, None]:
        return x

    if format_data:
        x = formatter(x, ppca=True)

    def _zscore(values, reference, axis):
        # Z-score `values` along `axis` using the mean/std of `reference`.
        values = np.asarray(values)
        reference = np.asarray(reference)
        if reference.size == 0:
            return np.zeros(values.shape)

        # Constancy is decided on the data that supplies the statistics, and
        # via an exact max/min comparison: a constant float column can have a
        # tiny non-zero std from rounding, which would otherwise blow up into
        # spurious +/-1 z-scores.
        varies = (reference.max(axis=axis, keepdims=True)
                  != reference.min(axis=axis, keepdims=True))
        mean = reference.mean(axis=axis, keepdims=True)
        std = reference.std(axis=axis, keepdims=True)

        centered = values - mean
        out = np.zeros(centered.shape, dtype=np.result_type(centered, std))
        # Positions excluded by `where` keep their initial value of zero.
        np.divide(centered, std, out=out, where=varies & (std > 0))
        return out

    if normalize == 'across':
        # Statistics come from every array stacked row-wise, so each column
        # is scaled consistently across the whole list.
        stacked = np.vstack(x)
        normalized_x = [_zscore(arr, stacked, 0) for arr in x]

    elif normalize == 'within':
        normalized_x = [_zscore(arr, arr, 0) for arr in x]

    elif normalize == 'row':
        normalized_x = [_zscore(arr, arr, 1) for arr in x]

    if internal or len(normalized_x) > 1:
        return normalized_x
    return normalized_x[0]