Source code for hypertools.tools.format_data

import warnings

import numpy as np
import six

from .._externals.ppca import PPCA
from .._shared.helpers import get_type


def format_data(x, vectorizer='CountVectorizer',
                semantic='LatentDirichletAllocation', corpus='wiki',
                ppca=True, text_align='hyper'):
    """
    Formats data into a list of numpy arrays

    This function accepts numpy arrays, pandas dataframes, text data, or a
    (mixed) list of these types, and converts everything into a list of 2d
    numpy arrays. Text is transformed into numerical matrices using the
    vectorizer and semantic models specified below, and missing values can
    be interpolated using PPCA.

    Parameters
    ----------
    x : numpy array, dataframe, string or (mixed) list
        The data to convert

    vectorizer : str, dict, class or class instance
        The vectorizer to use. Built-in options are 'CountVectorizer' or
        'TfidfVectorizer'. To change default parameters, set to a dictionary
        e.g. {'model' : 'CountVectorizer', 'params' : {'max_features' : 10}}.
        See http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
        for details. You can also specify your own vectorizer model as a
        class, or class instance. With either option, the class must have a
        fit_transform method (see here:
        http://scikit-learn.org/stable/data_transforms.html). If a class,
        pass any parameters as a dictionary to vectorizer_params. If a class
        instance, no parameters can be passed.

    semantic : str, dict, class or class instance
        Text model to use to transform text data. Built-in options are
        'LatentDirichletAllocation' or 'NMF' (default: LDA). To change
        default parameters, set to a dictionary e.g. {'model' : 'NMF',
        'params' : {'n_components' : 10}}. See
        http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
        for details on the two model options. You can also specify your own
        text model as a class, or class instance. With either option, the
        class must have a fit_transform method (see here:
        http://scikit-learn.org/stable/data_transforms.html). If a class,
        pass any parameters as a dictionary to text_params. If a class
        instance, no parameters can be passed.

    corpus : list (or list of lists) of text samples, or 'wiki', 'nips', 'sotus'
        Text to use to fit the semantic model (optional). If set to 'wiki',
        'nips' or 'sotus' and the default semantic and vectorizer models are
        used, a pretrained model will be loaded, which can save a lot of
        time.

    ppca : bool
        Performs PPCA to fill in missing values (default: True)

    text_align : str
        Alignment algorithm to use when both text and numerical data are
        passed. If the numerical arrays have the same shape, and the text
        data contains the same number of samples, the text and numerical
        data are automatically aligned to a common space. Example use case:
        an array of movie frames (frames by pixels) and text descriptions of
        each frame. In this case, the movie and text will be automatically
        aligned to the same space (default: hyperalignment).

    Returns
    -------
    data : list of numpy arrays
        A list of formatted arrays
    """

    # imported here (rather than at the top of the module) to avoid a
    # circular import
    from .df2mat import df2mat
    from .text2mat import text2mat
    from ..datageometry import DataGeometry

    # if x is not a list, make it one
    if type(x) is not list:
        x = [x]

    # a bare list of strings is treated as a single text sample
    if all([isinstance(xi, six.string_types) for xi in x]):
        x = [x]

    # check the data type of each element in the list
    dtypes = list(map(get_type, x))

    # handle text data
    if any(map(lambda dtype: dtype in ['list_str', 'str', 'arr_str'], dtypes)):

        # default text args
        text_args = {
            'vectorizer' : vectorizer,
            'semantic' : semantic,
            'corpus' : corpus
        }

        # filter out the text data
        text_data = []
        for i, j in zip(x, dtypes):
            if j in ['list_str', 'str', 'arr_str']:
                text_data.append(np.array(i).reshape(-1, 1))

        # convert text to numerical matrices
        text_data = text2mat(text_data, **text_args)

    # replace the text data with the transformed data
    processed_x = []
    textidx = 0
    for i, dtype in enumerate(dtypes):
        if dtype in ['list_str', 'str', 'arr_str']:
            processed_x.append(text_data[textidx])
            textidx += 1
        elif dtype == 'df':
            processed_x.append(df2mat(x[i]))
        elif dtype == 'geo':
            text_args = {
                'vectorizer' : vectorizer,
                'semantic' : semantic,
                'corpus' : corpus
            }
            for j in format_data(x[i].get_data(), **text_args):
                processed_x.append(j)
        else:
            processed_x.append(x[i])

    # reshape anything that is 1d
    if any([i.ndim <= 1 for i in processed_x]):
        processed_x = [np.reshape(i, (i.shape[0], 1)) if i.ndim == 1 else i
                       for i in processed_x]

    contains_text = any([dtype in ['list_str', 'str', 'arr_str']
                         for dtype in dtypes])
    contains_num = any([dtype in ['list_num', 'array', 'df', 'arr_num']
                        for dtype in dtypes])

    # if there are any nans in any of the arrays, use ppca to fill them in
    if ppca is True:
        if contains_num:
            num_data = []
            for i, j in zip(processed_x, dtypes):
                if j in ['list_num', 'array', 'df', 'arr_num']:
                    num_data.append(i)
            if np.isnan(np.vstack(num_data)).any():
                warnings.warn('Missing data: Inexact solution computed with '
                              'PPCA (see https://github.com/allentran/pca-magic '
                              'for details)')
                num_data = fill_missing(num_data)
                x_temp = []
                for dtype in dtypes:
                    if dtype in ['list_str', 'str', 'arr_str']:
                        x_temp.append(text_data.pop(0))
                    elif dtype in ['list_num', 'array', 'df', 'arr_num']:
                        x_temp.append(num_data.pop(0))
                processed_x = x_temp

    # if the input contains both text and numerical data
    if contains_num and contains_text:

        # and the arrays all have the same number of samples
        if np.unique(np.array([i.shape[0] for i in processed_x])).shape[0] == 1:

            # imported here to avoid a circular import
            from .align import align as aligner

            # align the data to a common space
            warnings.warn('Numerical and text data with same number of '
                          'samples detected. Aligning data to a common space.')
            processed_x = aligner(processed_x, align=text_align,
                                  format_data=False)

    return processed_x
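
# A minimal usage sketch (an editorial addition, not part of the original
# module): calling format_data on mixed numeric and text input, using the
# dict form of the `semantic` argument described in the docstring above.
# The array shape, the sentences, and the tiny corpus are made-up values
# for illustration. Because the sample counts match (3 and 3), format_data
# will also warn and align the two matrices to a common space.
def _example_format_mixed():
    num = np.random.rand(3, 10)  # 3 samples x 10 features
    txt = ['the dog sat', 'the cat sat', 'the dog ran']
    return format_data([num, txt],
                       semantic={'model': 'NMF',
                                 'params': {'n_components': 2}},
                       corpus=txt)  # fit the topic model to our own text
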
def fill_missing(x):

    # use ppca to interpolate missing data
    m = PPCA()
    m.fit(data=np.vstack(x))
    x_pca = m.transform()

    # if an entire row is missing, return nans for that row
    all_missing = [idx for idx, a in enumerate(np.vstack(x))
                   if np.isnan(a).all()]
    if len(all_missing) > 0:
        for i in all_missing:
            x_pca[i, :] = np.nan

    # split the stacked result back into the original list of arrays
    if len(x) > 1:
        x_split = np.cumsum([i.shape[0] for i in x][:-1])
        return list(np.split(x_pca, x_split, axis=0))
    else:
        return [x_pca]
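
# A minimal sketch of fill_missing (an editorial addition, not part of the
# original module), with arbitrary made-up shapes: the two arrays are
# stacked, the missing cell is interpolated with PPCA, and the result is
# split back into a list with the original row counts. Note that PPCA may
# return the data projected onto its components, so only the row counts
# (not necessarily the column counts) are guaranteed to be preserved.
def _example_fill_missing():
    a = np.random.rand(10, 5)
    b = np.random.rand(8, 5)
    a[2, 3] = np.nan  # knock out a single cell
    filled = fill_missing([a, b])
    assert filled[0].shape[0] == 10 and filled[1].shape[0] == 8
    assert not np.isnan(filled[0]).any()  # the missing cell was filled in
    return filled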