# Source code for mFlow.Blocks.experimental_protocol

import sys, os
from mFlow.Workflow.compute_graph import node

import copy
import numpy as np
import pandas as pd
import copy
import itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, KFold, GroupShuffleSplit

import warnings
from sklearn.exceptions import ConvergenceWarning,UndefinedMetricWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import  time

def df_to_sk(df):
    """Split a data frame into scikit-learn style feature and target arrays.

    Every column other than ``"target"`` is treated as a feature. Feature
    columns keep the order they have in ``df`` so that the column layout of
    ``X`` is identical on every call and in every process; iterating over a
    ``set`` of column names (the previous approach) gives no such guarantee.

    Args:
        df: pandas DataFrame that contains a ``"target"`` column.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is ``df[features].values`` and ``Y`` is
        ``df["target"].values``.

    Raises:
        KeyError: if ``df`` has no ``"target"`` column.
    """
    # dict.fromkeys de-duplicates repeated column labels while preserving
    # their first-seen order.
    features = list(dict.fromkeys(c for c in df.columns if c != "target"))
    X = df[features].values
    Y = df["target"].values
    return X, Y
def addTarget(*args, **kwargs):
    """Create a workflow node that wraps ``__addTarget``.

    Args:
        *args: positional arguments stored on the node for ``__addTarget``.
        **kwargs: keyword arguments stored on the node for ``__addTarget``.
            An optional ``name`` entry is taken as the node name and is not
            forwarded; the node name defaults to ``"addTarget"``.

    Returns:
        The object returned by ``node(...)``.
    """
    node_name = kwargs.pop("name", "addTarget")
    return node(function=__addTarget, args=args, kwargs=kwargs, name=node_name)
def __addTarget(df_data, df_labels, *args, key_data="dataframe", key_labels="dataframe", **kwargs):
    """Inner-join a label frame onto a data frame.

    Args:
        df_data: mapping holding the data frame under ``key_data``.
        df_labels: mapping holding the label frame under ``key_labels``.
        *args: accepted and ignored.
        key_data: key used to look up the data frame in ``df_data``.
        key_labels: key used to look up the label frame in ``df_labels``.
        **kwargs: accepted and ignored.

    Returns:
        ``{"dataframe": joined}`` where ``joined`` is the index-based inner
        join of the two frames.
    """
    data_frame = df_data[key_data]
    label_frame = df_labels[key_labels]
    joined = data_frame.join(label_frame, how="inner")
    return {"dataframe": joined}
def ExpTrainTest(*args, **kwargs):
    """Expand a train/test experiment into one workflow node per estimator.

    This is a node expander: a single call returns a list of nodes, so the
    caller must accept a list of nodes.

    Args:
        *args: positional arguments for ``__ExpTrainTest``. ``args[1]`` must
            be a dict mapping estimator names to estimator objects; each node
            receives a one-entry dict holding a single estimator.
        **kwargs: keyword arguments forwarded to ``__ExpTrainTest``. The
            entries ``name`` and ``n_folds`` are accepted but not forwarded,
            because ``__ExpTrainTest`` has no such parameters and forwarding
            them would raise ``TypeError`` when the node runs.

    Returns:
        List of nodes, one per estimator, named ``"EXP-TT: <estimator name>"``.
        A caller-supplied ``name`` does not change these node names.
    """
    kwargs.pop("name", None)
    kwargs.pop("n_folds", None)

    args = list(args)
    estimators = copy.copy(args[1])

    node_list = []
    for estimator in estimators:
        node_args = copy.copy(args)
        node_args[1] = {estimator: estimators[estimator]}
        node_list.append(
            node(
                function=__ExpTrainTest,
                args=node_args,
                kwargs=copy.copy(kwargs),
                name="EXP-TT: %s" % (estimator),
            )
        )
    return node_list
def __ExpTrainTest(df, estimators, metrics=(), random_state=11, train_size=.8, partition_index_number=0, key="dataframe", grouped=True, show=False):
    """Fit each estimator on a train split and score it on the test split.

    Args:
        df: mapping holding the experiment data frame under ``key``. The
            frame must contain a ``"target"`` column.
        estimators: dict mapping names to objects with ``fit(X, Y)`` and
            ``predict(X)``. Each estimator is deep-copied before fitting.
        metrics: sequence of callables invoked as ``metric(Y_te, y_predict)``;
            each callable's ``__name__`` becomes a report column.
        random_state: seed passed to ``train_test_split``.
        train_size: fraction of groups (or rows) used for training; the
            remainder is used for testing.
        partition_index_number: position of the index level whose values
            define the groups when ``grouped`` is true.
        key: key used to look up the data frame in ``df``.
        grouped: if true, split the unique values of the partition index
            level so that every group lands entirely in train or in test;
            otherwise split individual rows.
        show: if true, print a progress line per estimator.

    Returns:
        ``{"report": DataFrame, "fit_estimators": dict}`` where the report has
        one row per estimator and one column per metric, and
        ``fit_estimators`` maps estimator names to the fitted copies.
    """
    df = df[key]
    test_size = 1 - train_size

    if grouped:
        partition_index = df.index.names[partition_index_number]
        level_values = df.index.get_level_values(partition_index)
        all_ids = np.unique(np.array(level_values))
        tr_ids, te_ids = train_test_split(
            all_ids, train_size=train_size, test_size=test_size, random_state=random_state
        )
        df_tr = df[level_values.isin(list(tr_ids))]
        df_te = df[level_values.isin(list(te_ids))]
    else:
        df_tr, df_te = train_test_split(
            df, train_size=train_size, test_size=test_size, random_state=random_state
        )

    X_tr, Y_tr = df_to_sk(df_tr)
    X_te, Y_te = df_to_sk(df_te)

    metric_names = [str(metric.__name__) for metric in metrics]
    report = pd.DataFrame(columns=metric_names, index=list(estimators.keys()), dtype=float)

    fit_estimators = {}
    for name in estimators:
        if show:
            print(" Fitting and testing %s" % name)

        estimator = copy.deepcopy(estimators[name])
        estimator.fit(X_tr, Y_tr)
        fit_estimators[name] = estimator

        y_predict = estimator.predict(X_te)
        for metric_name, metric in zip(metric_names, metrics):
            # Write through a single .loc call: chained assignment
            # (report[col].loc[row] = ...) does not update the frame when
            # pandas Copy-on-Write is active.
            report.loc[name, metric_name] = metric(Y_te, y_predict)

    return {"report": report, "fit_estimators": fit_estimators}
def ExpCV(*args, **kwargs):
    """Expand a cross-validation experiment into one node per (fold, estimator).

    This is a node expander: a single call returns a list of nodes, so the
    caller must accept a list of nodes.

    Args:
        *args: positional arguments for ``__ExpCV``. ``args[1]`` must be a
            dict mapping estimator names to estimator objects; each node
            receives a one-entry dict holding a single estimator.
        **kwargs: keyword arguments forwarded to ``__ExpCV``. ``n_folds``
            (default 5) sets how many folds are expanded; a ``fold`` entry is
            set on every node's keyword arguments.

    Returns:
        List of nodes ordered fold-major, named
        ``"EXP-CV(<fold number>): <estimator name>"`` with 1-based fold numbers.
    """
    fold_count = kwargs["n_folds"] if "n_folds" in kwargs else 5

    base_args = list(args)
    estimators = copy.copy(base_args[1])

    node_list = []
    for fold_idx in range(fold_count):
        for est_name in estimators:
            node_args = list(base_args)
            node_args[1] = {est_name: estimators[est_name]}

            node_kwargs = dict(kwargs)
            node_kwargs["fold"] = fold_idx

            label = "EXP-CV(%d): %s" % (fold_idx + 1, est_name)
            node_list.append(
                node(function=__ExpCV, args=node_args, kwargs=node_kwargs, name=label)
            )
    return node_list
def __ExpCV(df, estimators, metrics=(), random_state=11, partition_index_number=0, n_folds=5, fold=None, key="dataframe", grouped=True, show=False):
    """Fit and score each estimator on a single cross-validation fold.

    Args:
        df: mapping holding the experiment data frame under ``key``. The
            frame must contain a ``"target"`` column.
        estimators: dict mapping names to objects with ``fit(X, Y)`` and
            ``predict(X)``. Each estimator is deep-copied before fitting.
        metrics: sequence of callables invoked as ``metric(Y_te, y_predict)``;
            each callable's ``__name__`` becomes a report column.
        random_state: kept for interface compatibility. The ungrouped
            splitter does not shuffle, so the seed has no effect on the
            splits and is not passed on.
        partition_index_number: position of the index level whose values
            define the groups when ``grouped`` is true.
        n_folds: number of cross-validation splits.
        fold: 0-based index of the split to evaluate (required).
        key: key used to look up the data frame in ``df``.
        grouped: if true use ``GroupKFold`` on the partition index level,
            otherwise plain ``KFold`` without shuffling.
        show: if true, print a progress line per estimator.

    Returns:
        ``{"report": DataFrame, "fit_estimators": list}``. The report is
        indexed by ``(Method, Fold)`` with the 1-based fold number and has one
        column per metric. ``fit_estimators`` is a list of ``n_folds``
        independent dicts; only the entry at position ``fold`` is populated.
    """
    df = df[key]
    X, Y = df_to_sk(df)

    if grouped:
        partition_index = df.index.names[partition_index_number]
        splitter = GroupKFold(n_splits=n_folds)
        splits = splitter.split(X, Y, groups=df.index.get_level_values(partition_index))
    else:
        # random_state is only meaningful together with shuffle=True; recent
        # scikit-learn releases raise ValueError when it is set without
        # shuffling, so it is left at its default here.
        splitter = KFold(n_splits=n_folds, shuffle=False)
        splits = splitter.split(X, Y)

    # Prepare multi-level report
    metric_names = [str(metric.__name__) for metric in metrics]
    methods = list(estimators.keys())
    tuples = list(itertools.product(methods, [fold + 1]))
    index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Fold'])
    report = pd.DataFrame(columns=metric_names, index=index, dtype=float)

    # One independent dict per fold; [{}] * n_folds would make every slot an
    # alias of the same dict.
    fit_estimators = [{} for _ in range(n_folds)]

    for j, (train_index, test_index) in enumerate(splits):
        if j != fold:
            continue

        X_tr = X[train_index, :]
        X_te = X[test_index, :]
        Y_tr = Y[train_index]
        Y_te = Y[test_index]

        for name in estimators:
            if show:
                print(" Fitting and testing %s" % (name))

            estimator = copy.deepcopy(estimators[name])
            estimator.fit(X_tr, Y_tr)
            fit_estimators[j][name] = estimator

            y_predict = estimator.predict(X_te)
            for metric_name, metric in zip(metric_names, metrics):
                # Single .loc write: chained assignment does not update the
                # frame when pandas Copy-on-Write is active.
                report.loc[(name, j + 1), metric_name] = metric(Y_te, y_predict)

        # Only one split matches the requested fold.
        break

    return {"report": report, "fit_estimators": fit_estimators}
def ExpWithin(*args, **kwargs):
    """Expand a within-individual experiment into one node per (fold, estimator).

    This is a node expander: a single call returns a list of nodes, so the
    caller must accept a list of nodes.

    Args:
        *args: positional arguments for ``__ExpWithin``. ``args[1]`` must be
            a dict mapping estimator names to estimator objects; each node
            receives a one-entry dict holding a single estimator.
        **kwargs: keyword arguments forwarded to ``__ExpWithin``. ``n_folds``
            defaults to 5 and is always forwarded, because ``__ExpWithin``
            needs it to size its result list and has no usable default of its
            own. A ``fold`` entry is set on every node's keyword arguments.

    Returns:
        List of nodes ordered fold-major, named
        ``"EXP-Within(<fold number>): <estimator name>"`` with 1-based fold
        numbers.
    """
    # setdefault both reads the value and makes sure the default reaches
    # __ExpWithin through the forwarded kwargs.
    n_folds = kwargs.setdefault("n_folds", 5)

    args = list(args)
    estimators = copy.copy(args[1])

    node_list = []
    for k in range(n_folds):
        for estimator in estimators:
            node_args = copy.copy(args)
            node_args[1] = {estimator: estimators[estimator]}

            node_kwargs = copy.copy(kwargs)
            node_kwargs["fold"] = k

            name = "EXP-Within(%d): %s" % (k + 1, estimator)
            node_list.append(
                node(function=__ExpWithin, args=node_args, kwargs=node_kwargs, name=name)
            )
    return node_list
def __ExpWithin(df, estimators, metrics=(), random_state=11, partition_index_number=0, fold=None, n_folds=None, key="dataframe", train_size=.8, split="temporal", show=False):
    """Fit and score each estimator on the data of a single individual.

    The individual is chosen as the ``fold``-th distinct value (in sorted
    order) of the partition index level. Sorting keeps the fold-to-individual
    mapping identical across calls and processes, which iterating a ``set``
    does not guarantee.

    Args:
        df: mapping holding the experiment data frame under ``key``. The
            frame must contain a ``"target"`` column.
        estimators: dict mapping names to objects with ``fit(X, Y)`` and
            ``predict(X)``. Each estimator is deep-copied before fitting.
        metrics: sequence of callables invoked as ``metric(Y_te, y_predict)``;
            each callable's ``__name__`` becomes a report column.
        random_state: seed passed to ``train_test_split`` for the random split.
        partition_index_number: position of the index level that identifies
            individuals.
        fold: 0-based position of the individual to evaluate (required).
        n_folds: length of the returned ``fit_estimators`` list. When omitted
            the list is sized ``fold + 1`` so the slot for ``fold`` exists.
        key: key used to look up the data frame in ``df``.
        train_size: fraction of the individual's rows used for training.
        split: ``"temporal"`` trains on the leading rows and tests on the
            rest; ``"random"`` uses ``train_test_split``.
        show: if true, print a progress line per estimator.

    Returns:
        ``{"report": DataFrame, "fit_estimators": list}``. The report is
        indexed by ``(Method, Individual)`` with the 1-based fold number and
        has one column per metric. ``fit_estimators`` is a list of independent
        dicts; only the entry at position ``fold`` is populated.

    Raises:
        ValueError: if ``split`` is neither ``"random"`` nor ``"temporal"``.
    """
    df = df[key]

    # Get train and test sets for one individual
    partition_index = df.index.names[partition_index_number]
    ids = sorted(set(df.index.get_level_values(partition_index)))
    df = df.loc[ids[fold]]

    if split == "random":
        df_tr, df_te = train_test_split(
            df, train_size=train_size, test_size=1 - train_size, random_state=random_state
        )
    elif split == "temporal":
        n_train = int(train_size * df.shape[0])
        df_tr = df[:n_train]
        df_te = df[n_train:]
    else:
        raise ValueError("Split type % s is not defined" % split)

    X_tr, Y_tr = df_to_sk(df_tr)
    X_te, Y_te = df_to_sk(df_te)

    # Prepare multi-level report
    metric_names = [str(metric.__name__) for metric in metrics]
    methods = list(estimators.keys())
    tuples = list(itertools.product(methods, [fold + 1]))
    index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Individual'])
    report = pd.DataFrame(columns=metric_names, index=index, dtype=float)

    # One independent dict per slot; [{}] * n would alias a single dict, and
    # n_folds may legitimately be None when the function is called directly.
    n_slots = fold + 1 if n_folds is None else n_folds
    fit_estimators = [{} for _ in range(n_slots)]

    for name in estimators:
        if show:
            print(" Fitting and testing %s" % (name))

        estimator = copy.deepcopy(estimators[name])
        estimator.fit(X_tr, Y_tr)
        fit_estimators[fold][name] = estimator

        y_predict = estimator.predict(X_te)
        for metric_name, metric in zip(metric_names, metrics):
            # Single .loc write: chained assignment does not update the frame
            # when pandas Copy-on-Write is active.
            report.loc[(name, fold + 1), metric_name] = metric(Y_te, y_predict)

    return {"report": report, "fit_estimators": fit_estimators}