import warnings
import numpy as np
import six
from .._externals.ppca import PPCA
from .._shared.helpers import get_type
def format_data(x, vectorizer='CountVectorizer',
                semantic='LatentDirichletAllocation', corpus='wiki', ppca=True,
                text_align='hyper'):
"""
Formats data into a list of numpy arrays
This function is useful to identify rows of your array that contain missing
data or nans. The returned indices can be used to remove the rows with
missing data, or label the missing data points that are interpolated
using PPCA.
Parameters
----------
x : numpy array, dataframe, string or (mixed) list
The data to convert
    vectorizer : str, dict, class or class instance
        The vectorizer to use. Built-in options are 'CountVectorizer' or
        'TfidfVectorizer'. To change the default parameters, set to a
        dictionary, e.g. {'model' : 'CountVectorizer', 'params' :
        {'max_features' : 10}}. See
        http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
        for details. You can also specify your own vectorizer model as a
        class or class instance. With either option, the class must have a
        fit_transform method (see here:
        http://scikit-learn.org/stable/data_transforms.html). If a class,
        pass any parameters using the dictionary syntax described above. If
        a class instance, no parameters can be passed.
    semantic : str, dict, class or class instance
        Text model to use to transform text data. Built-in options are
        'LatentDirichletAllocation' or 'NMF' (default: LDA). To change the
        default parameters, set to a dictionary, e.g. {'model' : 'NMF',
        'params' : {'n_components' : 10}}. See
        http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
        for details on the two model options. You can also specify your own
        text model as a class or class instance. With either option, the
        class must have a fit_transform method (see here:
        http://scikit-learn.org/stable/data_transforms.html). If a class,
        pass any parameters using the dictionary syntax described above. If
        a class instance, no parameters can be passed.
    corpus : list (or list of lists) of text samples, or one of {'wiki', 'nips', 'sotus'}
        Text used to fit the semantic model (optional). If set to 'wiki',
        'nips' or 'sotus' and the default semantic and vectorizer models are
        used, a pretrained model will be loaded, which can save a lot of time.
    ppca : bool
        If True, uses probabilistic PCA (PPCA) to interpolate missing values
        (default: True)
    text_align : str
        Alignment algorithm to use when both text and numerical data are
        passed. If the numerical arrays have the same shape, and the text
        data contains the same number of samples, the text and numerical
        data are automatically aligned to a common space. Example use case:
        an array of movie frames (frames by pixels) and text descriptions of
        each frame. In this case, the movie and text will be automatically
        aligned to the same space (default: 'hyper', i.e. hyperalignment).
Returns
    -------
data : list of numpy arrays
A list of formatted arrays
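
    Examples
    --------
    A minimal sketch of typical usage (illustrative only; the exact values
    depend on the fitted vectorizer and semantic model):

    >>> from hypertools.tools import format_data
    >>> import numpy as np
    >>> arr = np.random.rand(10, 4)
    >>> text = ['some text', 'more text', 'and a third sample']
    >>> formatted = format_data([arr, text])
    >>> len(formatted)
    2
    >>> all(isinstance(f, np.ndarray) for f in formatted)
    True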
"""
    # imported here (rather than at the top of the module) to avoid
    # circular imports
from .df2mat import df2mat
from .text2mat import text2mat
from ..datageometry import DataGeometry
# if x is not a list, make it one
    if not isinstance(x, list):
x = [x]
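    # a bare list of strings is a single text dataset, so wrap it in an
    # outer list to match the list-of-datasets format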
if all([isinstance(xi, six.string_types) for xi in x]):
x = [x]
# check data type for each element in list
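    # (each label is one of the type strings handled below: 'str',
    # 'list_str', 'arr_str', 'list_num', 'arr_num', 'array', 'df' or 'geo')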
dtypes = list(map(get_type, x))
# handle text data:
    if any(map(lambda d: d in ['list_str', 'str', 'arr_str'], dtypes)):
# default text args
text_args = {
'vectorizer' : vectorizer,
'semantic' : semantic,
'corpus' : corpus
}
# filter text data
text_data = []
for i,j in zip(x, dtypes):
if j in ['list_str', 'str', 'arr_str']:
text_data.append(np.array(i).reshape(-1, 1))
# convert text to numerical matrices
text_data = text2mat(text_data, **text_args)
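        # text2mat returns one numeric array per text dataset, with each
        # text sample transformed into a row of model (e.g. topic) weights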
# replace the text data with transformed data
processed_x = []
textidx=0
for i, dtype in enumerate(dtypes):
if dtype in ['list_str', 'str', 'arr_str']:
processed_x.append(text_data[textidx])
textidx+=1
elif dtype == 'df':
processed_x.append(df2mat(x[i]))
elif dtype == 'geo':
text_args = {
'vectorizer' : vectorizer,
'semantic' : semantic,
'corpus' : corpus
}
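            # recursively format the data stored in the DataGeometry object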
for j in format_data(x[i].get_data(), **text_args):
processed_x.append(j)
else:
processed_x.append(x[i])
# reshape anything that is 1d
if any([i.ndim<=1 for i in processed_x]):
processed_x = [np.reshape(i,(i.shape[0],1)) if i.ndim==1 else i for i in processed_x]
contains_text = any([dtype in ['list_str', 'str', 'arr_str'] for dtype in dtypes])
contains_num = any([dtype in ['list_num', 'array', 'df', 'arr_num'] for dtype in dtypes])
# if there are any nans in any of the lists, use ppca
if ppca is True:
if contains_num:
num_data = []
for i,j in zip(processed_x, dtypes):
if j in ['list_num', 'array', 'df', 'arr_num']:
num_data.append(i)
if np.isnan(np.vstack(num_data)).any():
warnings.warn('Missing data: Inexact solution computed with PPCA (see https://github.com/allentran/pca-magic for details)')
num_data = fill_missing(num_data)
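                # rebuild the list in the original input order, drawing the
                # transformed text and interpolated numeric arrays in turn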
x_temp = []
for dtype in dtypes:
if dtype in ['list_str', 'str', 'arr_str']:
x_temp.append(text_data.pop(0))
elif dtype in ['list_num', 'array', 'df', 'arr_num']:
x_temp.append(num_data.pop(0))
processed_x = x_temp
# if input data contains both text and numerical data
if contains_num and contains_text:
# and if they have the same number of samples
        if np.unique([i.shape[0] for i in processed_x]).shape[0] == 1:
from .align import align as aligner
# align the data
warnings.warn('Numerical and text data with same number of '
'samples detected. Aligning data to a common space.')
processed_x = aligner(processed_x, align=text_align, format_data=False)
return processed_x
def fill_missing(x):
    """Fill missing values in a list of arrays by fitting PPCA to the stacked data."""
    # ppca if missing data
m = PPCA()
m.fit(data=np.vstack(x))
x_pca = m.transform()
# if the whole row is missing, return nans
    all_missing = [idx for idx, a in enumerate(np.vstack(x)) if np.all(np.isnan(a))]
if len(all_missing)>0:
for i in all_missing:
x_pca[i, :] = np.nan
# get the original lists back
if len(x)>1:
x_split = np.cumsum([i.shape[0] for i in x][:-1])
return list(np.split(x_pca, x_split, axis=0))
else:
return [x_pca]
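
# A minimal sketch of fill_missing (illustrative only; assumes PPCA converges
# on inputs this small):
#
#     a = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])
#     b = np.array([[7.0, 8.0], [9.0, 10.0]])
#     filled = fill_missing([a, b])
#
# filled is a list of two arrays with shapes (3, 2) and (2, 2); as the warning
# in format_data notes, the returned values are an inexact PPCA-based solution
# rather than the original data with only the nans replaced.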