Source code for hypertools.tools.normalize
#!/usr/bin/env python
from __future__ import division
from builtins import range
import numpy as np
from .format_data import format_data as formatter
from .._shared.helpers import memoize
def _zscore(values, reference, axis):
    """
    Z-score ``values`` using statistics computed from ``reference``.

    The mean and (population, ``ddof=0``) standard deviation are taken along
    ``axis`` of ``reference`` and broadcast against ``values``. Entries whose
    reference standard deviation is exactly zero are set to 0 rather than
    being divided by zero.
    """
    center = np.mean(reference, axis=axis, keepdims=True)
    spread = np.std(reference, axis=axis, keepdims=True)
    constant = spread == 0
    # Swap zero spreads for 1 so the division below never evaluates 0/0; the
    # affected entries are overwritten with 0 by the final ``np.where``.
    safe_spread = np.where(constant, 1.0, spread)
    return np.where(constant, 0.0, (values - center) / safe_spread)


@memoize
def normalize(x, normalize='across', internal=False, format_data=True):
    """
    Z-transform the columns or rows of an array, or list of arrays

    This function normalizes the rows or columns of the input array(s). This
    can be useful because data reduction and machine learning techniques are
    sensitive to scaling differences between features. By default, the function
    is set to normalize 'across' the columns of all lists, but it can also
    normalize the columns 'within' each individual list, or alternatively, for
    each row in the array.

    Parameters
    ----------
    x : Numpy array or list of arrays
        This can either be a single array, or list of arrays

    normalize : str or False or None
        If set to 'across', the columns of the input data will be z-scored
        across lists (default). That is, the z-scores will be computed with
        respect to column n across all arrays passed in the list. If set
        to 'within', the columns will be z-scored within each list that is
        passed. If set to 'row', each row of the input data will be z-scored.
        If set to False or None, the input data will be returned unchanged,
        with no z-scoring (and without calling the format_data function).

    internal : bool
        If True, the result is always returned as a list, even when it holds
        a single array (default: False).

    format_data : bool
        Whether or not to first call the format_data function (default: True).

    Returns
    ----------
    normalized_x : Numpy array or list of arrays
        An array or list of arrays where the columns or rows are z-scored. A
        list is returned when `internal` is True or when more than one array
        was normalized. Otherwise, the single normalized array is returned.
        Columns (or rows) with zero standard deviation are returned as zeros.

    Raises
    ------
    AssertionError
        If `normalize` is not one of 'across', 'within', 'row', False or None.
    """
    # Raised explicitly (instead of via an ``assert`` statement) so that the
    # check still runs under ``python -O``; the exception type is kept as
    # AssertionError for backward compatibility with existing callers.
    if normalize not in ('across', 'within', 'row', False, None):
        raise AssertionError(
            "normalize must be 'across', 'within', 'row', False or None."
        )

    if normalize in (False, None):
        return x

    if format_data:
        x = formatter(x, ppca=True)

    if normalize == 'across':
        # Statistics come from all arrays stacked row-wise, so the
        # zero-variance test is made on the pooled column, not on each
        # individual array's column.
        stacked = np.vstack(x)
        normalized_x = [_zscore(arr, stacked, axis=0) for arr in x]
    elif normalize == 'within':
        normalized_x = [_zscore(arr, arr, axis=0) for arr in x]
    else:
        # Only 'row' can reach this branch after the validation above.
        normalized_x = [_zscore(arr, arr, axis=1) for arr in x]

    if internal or len(normalized_x) > 1:
        return normalized_x
    return normalized_x[0]