Skip to content

Problem with many highly correlated features #22

@woznicak

Description

@woznicak

I have data with highly correlated variables and many of them are clustering together at height 0 at algorithms. The function hierarchical_importance does not work with this example.

Below I provide reproducible example.

library(dplyr)
library(triplot)
library(DALEX)
library(gbm)

## download https://github.com/woznicak/MetaFeaturesImpact/blob/master/summary_results_surrogate_models_rank_per_algo.Rd
## This is the Rdata object with list of explainers

summary_results <- readRDS('summary_results_surrogate_models_rank_per_algo.Rd')

explainer_gbm <- summary_results$explainer_GBM_deep[[11]]

tri_var_imp <- calculate_triplot(explainer_gbm, 
                                 data = explainer_gbm$data,
                                 y = explainer_gbm$y,
                                 new_observation = explainer_gbm$data[1,],
                                 predict_function = sexplainer_gbm$predict_function)

I work around this problem and this is my fixing of function

hierarchical_importance <- function (x, data, y = NULL, predict_function = predict, type = "predict", 
          new_observation = NULL, N = 1000, loss_function = DALEX::loss_root_mean_square, 
          B = 10, fi_type = c("raw", "ratio", "difference"), clust_method = "complete", 
          cor_method = "spearman", ...) 
{
  if (all(type != "predict", is.null(y))) {
    stop("Target is needed for hierarchical_importance calculated at model \n         level")
  }
  fi_type <- match.arg(fi_type)
  x_hc <- hclust(as.dist(1 - abs(cor(data, method = cor_method))), 
                 method = clust_method)
  cutting_heights <- x_hc$height
  # aspects_list_previous <- list_variables(x_hc, 1)
  aspects_list_previous <- as.list(colnames(data))
  int_node_importance <- as.data.frame(NULL)
  for (i in c(1:(length(cutting_heights) - 1))) {
    aspects_list_current <- list_variables(x_hc, 1 - cutting_heights[i])
    t1 <- match(aspects_list_current, setdiff(aspects_list_current, 
                                              aspects_list_previous))
    for(k in na.omit(t1)){
      t2 <- which(t1 == k)
      t3 <- aspects_list_current[t2]
      group_name <- names(t3)
      if (type != "predict") {
        
        explainer <- explain(model = x, data = data, y = y,
                             predict_function = predict_function, verbose = FALSE)
        res_ai <- feature_importance(explainer = explainer,
                                     variable_groups = aspects_list_current, N = N,
                                     loss_function = loss_function, B = B, type = fi_type)
        
        
       
        class(res_ai) <- c("model_parts" ,"feature_importance_explainer", class(res_ai))
        
        
        
        res_ai <- res_ai[res_ai$permutation == "0", ]
        int_node_importance[nrow(int_node_importance) + 1, 1] <- res_ai[res_ai$variable == 
                                                                          group_name, ]$dropout_loss
      }
      else {
        res_ai <- aspect_importance(x = x, data = data, 
                                    predict_function = predict_function, new_observation = new_observation, 
                                    variable_groups = aspects_list_current, N = N)
        int_node_importance[nrow(int_node_importance) +1 , 1] <- res_ai[res_ai$variable_groups == 
                                                                          group_name, ]$importance
      }
      int_node_importance[nrow(int_node_importance), 2] <- group_name
      int_node_importance[nrow(int_node_importance), 3] <- cutting_heights[i]
    }
    
    aspects_list_previous <- aspects_list_current
  }
  if (type != "predict") {
    res <- feature_importance(explainer = explainer, variable_groups = , N = N, loss_function = loss_function, B = B)
    res <- res[res$permutation == "0", ]
    baseline_val <- res[res$variable == "aspect.group1", 
    ]$dropout_loss
    int_node_importance[length(cutting_heights), 1] <- baseline_val
  }
  else {
    int_node_importance[(nrow(int_node_importance)+1):length(cutting_heights), 1] <- NA
  }
  x_hc$height <- int_node_importance$V1
  hi <- list(x_hc, type, new_observation)
  class(hi) <- c("hierarchical_importance")
  return(hi)
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions