library(robustmatrix)
library(ggplot2)
library(dplyr)
library(ggnewscale)
library(ggrepel)
library(tidyr)
library(tibble)
library(forcats)This file documents how to use the methods introduced by double-blind (2024) and provides an overview of
the methods of the robustmatrix package. All examples
discussed in double-blind (2024) can be
reproduced using this file. This document aims to illustrate how to use
the functions of the robustmatrix package.
The mmcd function implements the FastMMCD algorithm and
computes the robust matrix-variate Matrix Minimum Covariance Determinant
(MMCD) estimators. For outlier explanation, the
matrixShapley function decomposes the squared Mahalanobis
distance into outlyingness contributions form a matrix’s rows, columns,
or cells.
The real-world data examples discussed here are the mountain
weather, darwin, and video data sets. The former
two data sets are included in the package and can be loaded using
data(weather) and data(darwin), respectively.
Related publications and data sources are listed in the help files. The
video data set is from the Perception Test Images Sequences
from Li et al. (2004) and the preprocessed
video is available at https://wis.kuleuven.be/statdatascience/robust/software.
Weather data from Austria’s highest weather station, situated in the Austrian Central Alps on the glaciated mountain “Hoher Sonnblick”, standing at an elevation of 3106 meters above sea level. An array of dimension \((p,q,n)\), comprising \(n = 136\) observations, each represented by a \(p = 5\) times \(q = 12\) dimensional matrix. Observed parameters are monthly averages of
from 1891 to 2022.
data(weather)
X <- weather[,,!apply(weather,3, function(x) any(is.na(x)))]
n<- dim(X)[3]
p<- dim(X)[1]
q<- dim(X)[2]Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD.
par_MMLE <- mmle(X, lambda = 0)
set.seed(1)
par_MMCD <- mmcd(X, alpha = 0.5 ,lambda = 0, nsamp = 5000, nthreads = 1)Computation of (robust) Mahalanbosi distances.
MD <- as.numeric(mmd(X = X, 
                     mu = par_MMLE$mu, 
                     cov_row = par_MMLE$cov_row_inv, 
                     cov_col = par_MMLE$cov_col_inv, 
                     inverted = TRUE))
MD_rob <- as.numeric(mmd(X = X, 
                         mu = par_MMCD$mu, 
                         cov_row = par_MMCD$cov_row_inv, 
                         cov_col = par_MMCD$cov_col_inv, 
                         inverted = TRUE))
out_quant <- qchisq(0.95, p*q)
outliers <- which(MD_rob > out_quant)Distance-distance of yearly outlyingness scores, using the robust procedure, we can unmask several outliers.
names(MD_rob) <- dimnames(X)[[3]]
subs <- MD_rob < out_quant
plt_dd <- ggplot(data = NULL, aes(x = sqrt(MD), y = sqrt(MD_rob))) +
  geom_point() +
  geom_hline(yintercept = sqrt(out_quant)) +
  geom_vline(xintercept = sqrt(out_quant)) +
  geom_label_repel(aes(x = sqrt(MD[!subs]), 
                       y = sqrt(MD_rob[!subs]), 
                       label = names(MD_rob[!subs])), 
                   size = 2.5, max.overlaps = 200, nudge_y = 0.5) +
  theme_classic() +
  xlim(c(min(sqrt(MD)),max(sqrt(MD))+1)) +
  labs(x = "MMD", y = "Robust MMD")
plt_ddComputation of cellwise Shapley values.
shv_cell <- array(dim = dim(X))
shv_cell[] <- matrixShapley(X, 
                            mu = par_MMCD$mu, 
                            cov_row = par_MMCD$cov_row_inv, 
                            cov_col = par_MMCD$cov_col_inv, 
                            inverted = TRUE, type = "cell")
dimnames(shv_cell) <- dimnames(X)Preparing data for plots.
long_weather <- function(x, val_to, var_levels = NULL) {
  y <- x %>%
    rownames_to_column(var = "var") %>%
    pivot_longer(cols = -var, names_to = "parameter", values_to = val_to)
  if(!is.null(var_levels)){
    y <- y %>%
      mutate(var = factor(var, levels = var_levels))
  }
  y
}
var_names <- c("T [°C]" = "T",
               "SH [h]" = "SH",
               "P [mm/m²]" = "P",
               "AP [hPa]" = "AP",
               "SP [%]" = "SP")
set <- cbind("var" = 1891:2022,
             "Temperature [°C]" = 0,
             "Preciptiation [mm/m²]" = 0,
             "Solid Preciptiation [%]" = 0,
             "Sunshine hours [h]" = 0,
             "Air pressure [hPa]" = 0)
set[!(set[,1] %in% dimnames(X)[[3]]),-1] <- 1
set <- data.frame(set)
colnames(set) <- c("var",
                   "T [°C]",
                   "P [mm/m²]",
                   "SP [%]",
                   "SH [h]",
                   "AP [hPa]")
set_long <- set %>% pivot_longer(-c(var))shv_year <- apply(shv_cell, c(1,3),sum)
X_center <- array(dim = dim(X), dimnames = dimnames(X))
X_center[] <- (apply(X,3,function(x) x - par_MMCD$mu))
X_center_sign <- array(dim = dim(X), dimnames = dimnames(X))
X_center_sign[] <- sign(X_center)
sign_mat <- apply(X_center,c(1,3),function(x) sign(mean(x)))
sign_mat[,-outliers] <- 0
shv_rel <- apply(shv_year, 2, function(x) x/sum(x))
shv_rel_outliers <- shv_rel
shv_rel_outliers[,-outliers] <- 0
shv_rel_outliers[shv_rel_outliers < 0] <- 0 # ONLY PLOT POSITIVE SHAPLEY VALUES
shv_years_long <- long_weather(data.frame(t(shv_rel_outliers)), val_to = "shv_rel")
sign_years_long <- long_weather(data.frame(t(sign_mat)), val_to = "sign")
plt_data_years <- inner_join(shv_years_long,sign_years_long) %>% 
  mutate(val = sign*abs(shv_rel), 
         sign = as.character(sign), 
         var = as.numeric(var),
         parameter = fct_recode(factor(parameter),!!! var_names))shv_long_1895 <- long_weather(data.frame(t(shv_cell[,,outliers[1]])), val_to = "shv", var_levels = dimnames(X)[[2]])
X_long_1895 <- long_weather(data.frame(t(X[,,outliers[1]])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]])
sign_long_1895 <- long_weather(data.frame(t(sign(X[,,outliers[1]] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]])
plt_data_1895 <- inner_join(shv_long_1895,sign_long_1895) %>% 
  mutate(val = sign*abs(shv), 
         sign = as.character(sign)) %>%
  inner_join(X_long_1895) %>%
  mutate(parameter = fct_recode(factor(parameter),!!! var_names))
shv_long_2022 <- long_weather(data.frame(t(shv_cell[,,n])), val_to = "shv", var_levels = dimnames(X)[[2]])
X_long_2022 <- long_weather(data.frame(t(X[,,n])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]])
sign_long_2022 <- long_weather(data.frame(t(sign(X[,,n] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]])
plt_data_2022 <- inner_join(shv_long_2022,sign_long_2022) %>% 
  mutate(val = sign*abs(shv), 
         sign = as.character(sign)) %>%
  inner_join(X_long_2022) %>%
  mutate(parameter = fct_recode(factor(parameter),!!! var_names))Yearly outlyingess contributions based on rowwise Shapley values. In the plot, non-outlying years are colored white, while years with missing data are gray. For the outliers, blue indicates “above average”, red indicates “below average”, and color intensity is proportional to the rowwise Shapley value.
ggplot(plt_data_years, aes(x = var, y = parameter, fill = val)) +
  geom_tile(color = "lightgray") +
  theme_minimal() +
  scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                       midpoint = 0, guide = "none")  +
  scale_x_continuous(breaks = seq(from = 1880, to = 2020, by = 5), expand = c(0,0)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1.2, hjust=1),
        axis.title = element_blank(), title = element_blank()) +
  geom_tile(data = set_long, aes(x = var, y = name, alpha = factor(value)), fill = "gray") +
  scale_alpha_manual(values = c(0,1)) +
  guides(alpha = FALSE, fill = FALSE)Outlyingess contributions based on cellwise Shapley values are computed for 1895 and 2022 using the same color scheme as in the previous plot.
gridExtra::grid.arrange(ggplot(plt_data_1895, aes(x = var, y = parameter, fill = val, label = round(x))) +
                          geom_tile(color = "lightgray") +
                          theme_minimal() +
                          geom_text(size = 3) +
                          theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4),
                                axis.title = element_blank())+
                          labs(title = "1895") + 
                          scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                                               midpoint = 0, guide = "none"),
                        ggplot(plt_data_2022, aes(x = var, y = parameter, fill = val, label = round(x))) +
                          geom_tile(color = "lightgray") +
                          theme_minimal() +
                          geom_text(size = 3) +
                          theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4),
                                axis.title = element_blank())+
                          labs(title = "2022") + 
                          scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                                               midpoint = 0, guide = "none"),
                        nrow = 1)The DARWIN (Diagnosis AlzheimeR WIth haNdwriting) data set comprises handwriting samples from 174 individuals. Among them, 89 have been diagnosed with Alzheimer’s disease (AD), while the remaining 85 are considered healthy subjects (H). Each participant completed 25 handwriting tasks on paper, and their pen movements were recorded using a graphic tablet. A set of 18 features was extracted from the raw handwriting data.
An array of dimension \((p,q,n)\), comprising \(n = 174\) observations, each represented by a \(p = 18\) times \(q = 25\) dimensional matrix. The observed parameters are:
data(darwin)
darwin_class <- dimnames(darwin)[[3]]
sub_H <- which(darwin_class == "H")
sub_P <- which(darwin_class == "P")The classes are stored in dimnames(darwin)[[3]]; the
subjects with class H are the healthy, and the subjects with
class P have AD.
Since the parameters Total Time (= Air Time + Paper Time) and Mean GMRT (= (GMRT on paper + GMRT in air)/2) are linearly dependent on other variables, they are excluded from the following analysis. Moreover, the variable Air Time is skewed due to several large univariate outliers in both the H and AD subjects. Across various tasks, some of the observed values of this variable are more than 25 median absolute deviations away from the median. Since the Air Time was measured using the graphic tablet, it could be that some individuals placed the pen too close to the device before the task was performed. While Shapley values based on the MMCD estimators allow us to unveil those large outlyingness contributions, they dominate and mask outlyingess contributions from other variables. Therefore, we also excluded the Air Time variable, and used the \(p = 15\) remaining variables for our analysis.
darwin_array_clean <- darwin[-c(1,9,18),,]
X_H <- darwin_array_clean[,,sub_H] 
X_P <- darwin_array_clean[,,sub_P] 
dimnames(X_H)[[3]] <- sub_H
dimnames(X_P)[[3]] <- sub_P
p <- dim(X_H)[1]
q <- dim(X_H)[2]
n <- dim(X_H)[3]Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD.
set.seed(1)
par_mmle <- robustmatrix::mmle(X_H)
par_mmcd <- robustmatrix::mmcd(X_H, nsamp = 500, alpha = 0.5, 
                               nthreads = 1, scale_consistency = "quant")Plotting the MMDs based on the parameters estimated from the H group.
plt_data <-   data.frame(patient_id = 1:dim(darwin_array_clean)[[3]], 
                         class = c(rep("AD", length(sub_P)),
                                   rep("H", length(sub_H))),
                         mmd = sqrt(mmd(darwin_array_clean, 
                                        par_mmcd$mu, par_mmcd$cov_row, 
                                        par_mmcd$cov_col)))
p_mmd <- ggplot(data = plt_data, 
                mapping = aes(x = patient_id, y = mmd, color = class)) + 
  geom_point(show.legend = FALSE) + 
  geom_hline(yintercept = sqrt(qchisq(0.975, p*q))) + 
  labs(x = "Subject ID", y = "MMD", color = element_blank()) + 
  theme_classic() +
  facet_grid(~ class, scales = "free")
p_mmd
Computing Shapley values and average proportional contributions
of the variables to the MMDs for the H and P
groups.
shv_cell_P <- array(dim = dim(X_P), dimnames = dimnames(X_P))
shv_cell_P[] <- matrixShapley(X_P, 
                              mu = par_mmcd$mu, 
                              cov_row = par_mmcd$cov_row_inv, 
                              cov_col = par_mmcd$cov_col_inv, 
                              inverted = TRUE, type = "cell")
shv_cell_H <- array(dim = dim(X_H), dimnames = dimnames(X_H))
shv_cell_H[] <- matrixShapley(X_H, 
                              mu = par_mmcd$mu, 
                              cov_row = par_mmcd$cov_row_inv, 
                              cov_col = par_mmcd$cov_col_inv, 
                              inverted = TRUE, type = "cell")
shv_row_mean_P <- data.frame("val" = apply(shv_cell_P,c(1),
                                           function(x)mean(x))) %>% 
  rownames_to_column()
shv_row_mean_H <- data.frame("val" = apply(shv_cell_H,c(1),
                                           function(x)mean(x))) %>% 
  rownames_to_column()
shv_row_mean <- rbind(cbind("class" = "AD", shv_row_mean_P),
                      cbind("class" = "H", shv_row_mean_H))
shv_row_mean_P_prop <- data.frame("val" = apply(shv_cell_P,c(1),
                                                function(x)sum(x)/sum(shv_cell_P))) %>% 
  rownames_to_column()
shv_row_mean_H_prop <- data.frame("val" = apply(shv_cell_H,c(1),
                                                function(x)sum(x)/sum(shv_cell_H))) %>% 
  rownames_to_column()
shv_row_mean_prop<- rbind(cbind("class" = "AD", shv_row_mean_P_prop),
                          cbind("class" = "H", shv_row_mean_H_prop))
fct_order <- order(shv_row_mean_P_prop$val)
shv_row_mean_prop <- shv_row_mean_prop %>%
  mutate(rowname = factor(rowname, levels = shv_row_mean_P_prop$rowname[fct_order]))Average proportional contributions for the H and P groups.
p_shv_features <- ggplot(data = shv_row_mean_prop, 
                         aes(x = val, y = rowname, fill = class)) + 
  geom_bar(stat = "identity", position=position_dodge(), color = "black") + 
  labs(x = "Proportional contribution to MMD²", 
       y = element_blank(), fill = element_blank()) + 
  scale_x_continuous(expand = c(0, 0)) + 
  theme_classic() + 
  theme(legend.position = c(0.95,0.5))
p_shv_featuresThe following code can load the video data and reproduce the results outlined in double-blind (2024). Due to the dimensionality of the data set, the following code is not executed.
# Loading the data
load(url("https://wis.kuleuven.be/stat/robust/Programs/DO/do-video-data-rdata"))
# Creating the overview plot
ind_overview <- c(1,487,491,495,500)
plts_overview <- list()
for(i in seq_along(ind_overview)){
  dim_Video <- dim(Video)
  video_grid <- expand.grid("x" = 1:dim_Video[2], "y" = 1:dim_Video[3])
  video_long <- data.frame(cbind(video_grid, 
                                 "r" = as.vector(Video[ind_overview[i],,,1]/255), 
                                 "g" = as.vector(Video[ind_overview[i],,,2]/255), 
                                 "b" = as.vector(Video[ind_overview[i],,,3]/255))) %>% mutate(rgb.val=rgb(r,g,b))
  plts_overview[[i]] <- ggplot(video_long, aes(y,x)) +
    geom_raster(aes(fill=rgb.val)) +
    scale_fill_identity() +
    theme_void()+
    coord_fixed() + 
    theme(plot.title = element_text(hjust = 0.5),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_rect(fill = "transparent" ,colour = "black"),
          panel.ontop = TRUE,
          plot.margin = margin(1, 1, 1, 1, "pt")) + 
    scale_x_continuous(expand = c(0,0)) + 
    scale_y_continuous(expand = c(0,0), trans = "reverse") + 
    labs(title = paste("Frame", ind_overview[i]))
}
video_overview <- do.call(gridExtra::grid.arrange, args = list("grobs" = plts_overview, "nrow" = 1))
# Transform to grayscale image
video_grayscale <- apply(Video, 1:3, mean)
X <- aperm(video_grayscale,c(2,3,1))
n <- dim(X)[3]
p <- dim(X)[1]
q <- dim(X)[2]
# MMLE and MMCD parameter estimation
par_MMLE <- mmle(X, lambda = 0)
set.seed(1)
par_MMCD <- mmcd(X, alpha = 0.5,lambda = 0, nsamp = 500, nthreads = 1)
# Compute squared Mahlanobis distances
MD <- mmd(X, mu = par_MMLE$mu, cov_row = par_MMLE$cov_row_inv, 
          cov_col = par_MMLE$cov_col_inv, inverted = TRUE)
MD_rob <- mmd(X, mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, 
              cov_col = par_MMCD$cov_col_inv, inverted = TRUE)
out_quant <- qchisq(0.99, p*q)
# Plot MMD of all observations
data_plt_md <- data.frame(cbind("Frame" = 1:length(MD_rob), "MD" = sqrt(MD_rob)))
ggplot(data_plt_md, aes(x = Frame, y = MD)) + 
  geom_point() + 
  labs(x = "Frame", y = "MMD") + 
  scale_x_continuous(limits = range(1,n), breaks = seq(from = 50, to = n, by = 50), expand = c(0,3)) + 
  theme_classic()
# Plot MMD of last observations
sub_ind <- (1:length(MD_rob))[-(1:475)]
frame_labels <- rep(NA,length(sub_ind))
frame_labels[sub_ind %in% c(487,491,495,500)] <- c("Frame 487: man left of tree", "Frame 491: man behind tree", "Frame 495: man right of tree", "Frame 500: man fully visible")
data_plt_md1 <- data.frame(cbind("Frame" = sub_ind, "MD" = sqrt(MD_rob[sub_ind]), "labels" = frame_labels))
ggplot(data_plt_md1, aes(x = as.numeric(Frame), y = as.numeric(MD), label = labels)) + 
  geom_point() + 
  geom_path() + 
  geom_label_repel(nudge_x = 75, nudge_y = -80, color = "darkblue") + 
  labs(x = "Frame", y = "MMD") + 
  scale_x_continuous(limits = range(sub_ind), breaks = seq(from = 480, to = 630, by = 10), expand = c(0,1)) + 
  theme_classic()
# Compute Shapley values
shv <- array(dim = dim(X))
shv[,,] <- matrixShapley(X[,,], mu = par_MMCD$mu, 
                         cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, 
                         inverted = TRUE, type = "cell")
# Function to plot Shapley values for image data
plot_shapley_image <- function(image,shapley_values, lower = -100, upper = 100, 
                               title = NULL, positive_only = FALSE, border = TRUE){
  shapley_values[shapley_values < lower] <- lower
  shapley_values[shapley_values > upper] <- upper
  if(positive_only){
    shapley_values[shapley_values < 0] <- 0
  }
  
  image_grid <- expand.grid("x" = 1:nrow(shapley_values), "y" = 1:ncol(shapley_values))
  image_data <- cbind(image_grid, "z" = as.vector(image), "shv" = as.vector(shapley_values))
  plt <- ggplot() 
  if(any(image != 0)){
    plt <- plt + 
      geom_raster(data = image_data, aes(x, y, fill = z)) + 
      scale_fill_gradient(low = "black", high = "white",guide=FALSE)
  }
  plt <- plt +
    new_scale("fill") +
    geom_raster(data = image_data, aes(x, y, fill = shv), na.rm = TRUE) +
    scale_fill_gradientn(colours= c("blue", "transparent", "red"),limits=c(lower, upper),
                         guide=FALSE, na.value = "transparent") + 
    labs(title = title) + 
    theme_void()+
    coord_fixed() + 
    theme(plot.title = element_text(hjust = 0.5))
  if(border){
    plt <- plt  +
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
            panel.background = element_rect(fill = "transparent" ,colour = "black"),
            panel.ontop = TRUE,
            plot.margin = margin(1, 1, 1, 1, "pt")) + 
      scale_y_continuous(expand = c(0,0)) + 
      scale_x_continuous(expand = c(0,0))
  }
  plt 
}
# Plot Shapley values for selected frames
lims <- 100
gridExtra::grid.arrange(
  plot_shapley_image(image = t(apply(X[,,487],2,rev)), shapley_values = (t(apply(shv[,,487],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 487", positive_only = TRUE), 
  plot_shapley_image(image = t(apply(X[,,491],2,rev)), shapley_values = (t(apply(shv[,,491],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 491", positive_only = TRUE),
  plot_shapley_image(image = t(apply(X[,,495],2,rev)), shapley_values = (t(apply(shv[,,495],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 495", positive_only = TRUE),
  plot_shapley_image(image = t(apply(X[,,500],2,rev)), shapley_values = (t(apply(shv[,,500],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 500", positive_only = TRUE),
  nrow = 1
)