Benchmark

Load packages

##update/install MLHO
devtools::install_github("clai-group/mlho")
Skipping install of 'mlho' from a github remote, the SHA1 (44e016f4) has not changed since last install.
  Use `force = TRUE` to force installation
# load MLHO, afterwards source the MSMR.lite.R file to overwrite the MSMR.lite function
# in the package with the updated one (the encounter functionality is only available in the R file)
library(mlho)

# load (and install if missing) the required dependencies
pacman::p_load(data.table, devtools, backports, Hmisc, tidyr,dplyr,ggplot2,plyr,scales,readr, httr, DT, lubridate, DALEX, tidyverse,reshape2,foreach,doParallel,caret,gbm,lubridate,praznik)
library(counterfactuals)
Warning: package 'counterfactuals' was built under R version 4.3.3
library(iml)

Prepare the data

We load several datasets from the MLHO package, including incident data and demographic information.

dbmart consists of patient ID (patient_num) and associated phenotypes (phenx). Each patient can have multiple features, including different diagnostic events or conditions.

labelDT includes patient ID (patient_num), the start date of each event (start_date), and a binary label (label) indicating the outcome of interest.

dems contains demographic information for each patient.

# Pull the example tables that ship with the MLHO package.
dbmart <- mlho::incident_dbmart
dems <- mlho::incident_dems
labelDT <- mlho::incident_labeldt

# Derive an outcome date (o_date). A single offset between 1 and 40 days is
# drawn once and added to start_date for every label == 1 row; label == 0 rows
# keep start_date as their outcome date.
offset.days <- sample(1:40, 1)
labelDT <- labelDT %>%
  mutate(o_date = case_when(
    label == 0 ~ start_date,
    label == 1 ~ start_date + offset.days
  ))
labelDT$start_date <- as.Date(labelDT$start_date)

head(dbmart)
# A tibble: 6 × 4
  patient_num                          phenx     DESCRIPTION          start_date
  <chr>                                <chr>     <chr>                <date>    
1 478a4846-0ae6-4ec8-8155-019708911526 76601001  Intramuscular injec… 2019-08-24
2 478a4846-0ae6-4ec8-8155-019708911526 76601001  Intramuscular injec… 2019-11-23
3 478a4846-0ae6-4ec8-8155-019708911526 76601001  Intramuscular injec… 2020-02-22
4 478a4846-0ae6-4ec8-8155-019708911526 261352009 Face mask (physical… 2020-03-11
5 478a4846-0ae6-4ec8-8155-019708911526 65200003  Insertion of intrau… 2020-05-01
6 478a4846-0ae6-4ec8-8155-019708911526 76601001  Intramuscular injec… 2020-05-23
# Preview the label table, which now also carries the derived o_date column.
head(labelDT)
# A tibble: 6 × 4
  patient_num                          start_date label o_date    
  <chr>                                <date>     <dbl> <date>    
1 478a4846-0ae6-4ec8-8155-019708911526 2019-08-24     1 2019-09-13
2 478a4846-0ae6-4ec8-8155-019708911526 2019-11-23     1 2019-12-13
3 478a4846-0ae6-4ec8-8155-019708911526 2020-02-22     1 2020-03-13
4 478a4846-0ae6-4ec8-8155-019708911526 2020-03-11     0 2020-03-11
5 478a4846-0ae6-4ec8-8155-019708911526 2020-05-01     0 2020-05-01
6 478a4846-0ae6-4ec8-8155-019708911526 2020-05-23     1 2020-06-12

Splitting data into training and testing sets using a 70-30 ratio

We extract the unique patient IDs (“patient_num”) from dbmart and randomly select 30% of these patients for the test set; the remaining 70% form the training set.

# Fix the RNG seed so the random train/test split (and every later step that
# draws random numbers) gives the same result on each run.
set.seed(123)

# One entry per distinct patient, as character IDs.
uniqpats <- as.character(unique(dbmart$patient_num))

# Hold out 30% of the patients (rounded to a whole number) as the test set.
test_ind <- sample(uniqpats, round(0.3 * length(uniqpats)))

Transform train data

After splitting the data into training and testing sets, the next step is to transform the data to ensure that the data aligns with the requirements of the modeling functions in the MLHO package.

# Training rows are the dbmart records of every patient not drawn into test_ind.
in.train <- !(dbmart$patient_num %in% test_ind)
dat.train <- dbmart[in.train, ]
data.table::setDT(dat.train)
# values must be in a column named value; each record is given the value 1
dat.train[, `:=`(value = 1)]
uniqpats.train <- as.character(unique(dat.train$patient_num))

The benchmark.revolving function in the MLHO package generates a baseline, atemporal representation of patient data for machine learning. Unlike the TLDR algorithm, which labels each clinical record with temporal context (e.g., “history,” “past,” and “last”), this benchmarking approach ignores the timing of events. Instead, it simply counts the total number of times each clinical feature appears across all of a patient’s encounters. For each patient, the output is a wide-format table where each column represents a unique feature, and each cell contains the total count of that feature—aggregated over time, without regard to when it occurred.

# Inputs for the atemporal benchmark representation of the training data.
MLHO.dat <- dat.train
labels <- labelDT
patients <- uniqpats.train
binarize <- TRUE
# Sample size * sparsity. Do not pick too small a value, to avoid overfitting.
sparsity <- 0.05
jmi <- TRUE
topn <- 50
# These two hold the values the call below has always passed. Previously the
# variables were set to T / F while the call overrode them with FALSE / TRUE,
# so the setup block did not describe what actually ran.
multicore <- FALSE
valuesToMerge <- TRUE
timeBufffer <- c(h = 0, p = 0, l = 0, o = -30)

# The result is a list; its `rank` and `AVR` elements are unpacked right after.
dat.train <- benchmark.revolving(MLHO.dat,
                                 labels,
                                 binarize,
                                 sparsity,
                                 jmi,
                                 topn,
                                 patients,
                                 multicore = multicore,
                                 valuesToMerge = valuesToMerge,
                                 timeBufffer)
[1] "step - 1: sparsity screening!"
[1] "step 2: JMI dimensionality reduction!"
# Unpack the benchmark.revolving() result: keep its `rank` element separately
# and carry on with its `AVR` element as the training data.
revolving.out <- dat.train
rank <- revolving.out$rank
dat.train <- revolving.out$AVR

Transform test data

We repeat the same data processing and transformation on the test set.

# Test rows are the dbmart records of the held-out patients.
dat.test <- subset(dbmart, dbmart$patient_num %in% test_ind)
uniqpats.test <- as.character(unique(dat.test$patient_num))

# Keep only the phenx codes that are still present in the training columns:
# the part of each training column name before the first "_" is taken as the
# phenx code.
dat.train.colnames <- vapply(strsplit(colnames(dat.train), "_"), `[`, 1,
                             FUN.VALUE = character(1))
dat.test <- subset(dat.test, dat.test$phenx %in% dat.train.colnames)
setDT(dat.test)
# values must be in a column named value
dat.test$value <- 1

MLHO.dat.test <- dat.test

# important to have a value and phenx column to merge
# timeBufffer is passed by name: as an unnamed argument after the named ones
# it was bound to the first formal left unmatched (the slot the training call
# fills with topn), so the time buffer silently fell back to its default here.
# NOTE(review): assumes the formal is spelled `timeBufffer`, like the
# variable - confirm against the benchmark.revolving() signature.
# NOTE(review): the training call uses binarize = TRUE while this call uses
# FALSE - confirm the asymmetry is intended.
dat.test <- benchmark.revolving(MLHO.dat = dat.test,
                                patients = uniqpats.test,
                                sparsity = NA,
                                jmi = FALSE,
                                labels = labelDT,
                                valuesToMerge = TRUE,
                                binarize = FALSE,
                                timeBufffer = timeBufffer)

# Restrict the test table to the columns that exist in the training table, so
# features dropped for the training data are dropped here as well.
train.cols <- colnames(dat.train)
dat.test <- select(dat.test, one_of(train.cols))

Update demographics and labels Data

The dems dataset, which contains demographic information, is updated to include relevant labels from labelDT. This integration involves merging both datasets by “patient_num”, then modifying the “patient_num” to include the “start_date” for a unique identifier per patient encounter.

# Join demographics to the label table on patient_num, rewrite patient_num as
# "<patient_num>_<start_date>" so each encounter has its own id, then drop the
# start_date and label columns.
dems.encounter <- merge(dems, labelDT, by = "patient_num")
dems.encounter <- mutate(dems.encounter,
                         patient_num = paste0(patient_num, "_", start_date))
dems <- select(dems.encounter, -start_date, -label)

Similarly, labelDT is updated to concatenate “patient_num” with “start_date” to create a unique identifier for each patient’s encounter, which simplifies subsequent merging and data handling processes. The “start_date” column is then removed to clean up the dataset:

# Build the per-encounter id "<patient_num>_<start_date>" in labelDT, then
# drop the start_date column, which is now part of the id.
labelDT <- mutate(labelDT, patient_num = paste0(patient_num, "_", start_date))
labelDT <- select(labelDT, -start_date)

Train model

We use the mlearn function to do the modeling, which includes training the model and testing it on the test set.

## we may want to reduce the output of this cell
# Train on dat.train and evaluate on dat.test with a gbm classifier and
# 5-fold cross-validation; SHAP and counterfactual outputs are requested and
# no model is saved.
model.test <- mlearn(dat.train,
                     dat.test,
                     classifier = "gbm",
                     cv = "cv",
                     nfold = 5,
                     dems = NULL,
                     aoi = "random phenx from dbmart",
                     note = "mlho_test_run",
                     multicore = FALSE,
                     save.model = FALSE,
                     calSHAP = TRUE,
                     counterfactual = TRUE,
                     save.model.counterfactual = FALSE)
[1] "the modeling!"
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9754             nan     0.1000    0.0193
     2        0.9446             nan     0.1000    0.0145
     3        0.9231             nan     0.1000    0.0116
     4        0.9051             nan     0.1000    0.0093
     5        0.8894             nan     0.1000    0.0073
     6        0.8751             nan     0.1000    0.0055
     7        0.8664             nan     0.1000    0.0046
     8        0.8587             nan     0.1000    0.0039
     9        0.8524             nan     0.1000    0.0032
    10        0.8486             nan     0.1000    0.0016
    20        0.8216             nan     0.1000    0.0007
    40        0.7979             nan     0.1000    0.0002
    60        0.7872             nan     0.1000    0.0001
    80        0.7809             nan     0.1000   -0.0003
   100        0.7785             nan     0.1000   -0.0002
   120        0.7768             nan     0.1000   -0.0001
   140        0.7757             nan     0.1000   -0.0001
   150        0.7755             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9720             nan     0.1000    0.0208
     2        0.9395             nan     0.1000    0.0164
     3        0.9122             nan     0.1000    0.0129
     4        0.8906             nan     0.1000    0.0107
     5        0.8723             nan     0.1000    0.0086
     6        0.8584             nan     0.1000    0.0072
     7        0.8459             nan     0.1000    0.0058
     8        0.8348             nan     0.1000    0.0053
     9        0.8254             nan     0.1000    0.0041
    10        0.8190             nan     0.1000    0.0026
    20        0.7777             nan     0.1000    0.0005
    40        0.7538             nan     0.1000   -0.0001
    60        0.7452             nan     0.1000   -0.0000
    80        0.7385             nan     0.1000   -0.0002
   100        0.7345             nan     0.1000   -0.0003
   120        0.7303             nan     0.1000   -0.0003
   140        0.7271             nan     0.1000   -0.0005
   150        0.7248             nan     0.1000    0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9654             nan     0.1000    0.0232
     2        0.9296             nan     0.1000    0.0171
     3        0.9005             nan     0.1000    0.0134
     4        0.8805             nan     0.1000    0.0097
     5        0.8630             nan     0.1000    0.0089
     6        0.8477             nan     0.1000    0.0077
     7        0.8345             nan     0.1000    0.0065
     8        0.8237             nan     0.1000    0.0049
     9        0.8142             nan     0.1000    0.0042
    10        0.8060             nan     0.1000    0.0037
    20        0.7677             nan     0.1000    0.0003
    40        0.7367             nan     0.1000    0.0003
    60        0.7295             nan     0.1000   -0.0003
    80        0.7211             nan     0.1000   -0.0004
   100        0.7169             nan     0.1000   -0.0002
   120        0.7134             nan     0.1000   -0.0002
   140        0.7088             nan     0.1000   -0.0002
   150        0.7076             nan     0.1000   -0.0004

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9800             nan     0.1000    0.0178
     2        0.9506             nan     0.1000    0.0136
     3        0.9295             nan     0.1000    0.0103
     4        0.9123             nan     0.1000    0.0084
     5        0.8978             nan     0.1000    0.0066
     6        0.8871             nan     0.1000    0.0054
     7        0.8783             nan     0.1000    0.0043
     8        0.8714             nan     0.1000    0.0035
     9        0.8661             nan     0.1000    0.0023
    10        0.8612             nan     0.1000    0.0027
    20        0.8326             nan     0.1000    0.0007
    40        0.8105             nan     0.1000    0.0005
    60        0.8010             nan     0.1000   -0.0001
    80        0.7947             nan     0.1000   -0.0004
   100        0.7910             nan     0.1000   -0.0002
   120        0.7894             nan     0.1000   -0.0002
   140        0.7878             nan     0.1000   -0.0000
   150        0.7873             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9749             nan     0.1000    0.0201
     2        0.9456             nan     0.1000    0.0156
     3        0.9224             nan     0.1000    0.0126
     4        0.9007             nan     0.1000    0.0102
     5        0.8830             nan     0.1000    0.0079
     6        0.8697             nan     0.1000    0.0065
     7        0.8591             nan     0.1000    0.0058
     8        0.8506             nan     0.1000    0.0045
     9        0.8426             nan     0.1000    0.0041
    10        0.8344             nan     0.1000    0.0037
    20        0.7932             nan     0.1000    0.0016
    40        0.7700             nan     0.1000   -0.0003
    60        0.7598             nan     0.1000   -0.0002
    80        0.7540             nan     0.1000    0.0003
   100        0.7489             nan     0.1000    0.0001
   120        0.7439             nan     0.1000   -0.0001
   140        0.7403             nan     0.1000   -0.0003
   150        0.7387             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9709             nan     0.1000    0.0217
     2        0.9405             nan     0.1000    0.0158
     3        0.9135             nan     0.1000    0.0131
     4        0.8923             nan     0.1000    0.0107
     5        0.8743             nan     0.1000    0.0085
     6        0.8612             nan     0.1000    0.0065
     7        0.8469             nan     0.1000    0.0072
     8        0.8346             nan     0.1000    0.0059
     9        0.8254             nan     0.1000    0.0043
    10        0.8182             nan     0.1000    0.0039
    20        0.7749             nan     0.1000    0.0012
    40        0.7525             nan     0.1000   -0.0002
    60        0.7437             nan     0.1000   -0.0000
    80        0.7371             nan     0.1000   -0.0003
   100        0.7322             nan     0.1000   -0.0003
   120        0.7282             nan     0.1000   -0.0003
   140        0.7233             nan     0.1000   -0.0005
   150        0.7213             nan     0.1000   -0.0004

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9796             nan     0.1000    0.0174
     2        0.9512             nan     0.1000    0.0139
     3        0.9282             nan     0.1000    0.0103
     4        0.9129             nan     0.1000    0.0083
     5        0.8991             nan     0.1000    0.0069
     6        0.8881             nan     0.1000    0.0057
     7        0.8796             nan     0.1000    0.0046
     8        0.8723             nan     0.1000    0.0038
     9        0.8664             nan     0.1000    0.0031
    10        0.8617             nan     0.1000    0.0026
    20        0.8357             nan     0.1000    0.0008
    40        0.8119             nan     0.1000    0.0001
    60        0.8005             nan     0.1000    0.0001
    80        0.7921             nan     0.1000   -0.0001
   100        0.7887             nan     0.1000   -0.0001
   120        0.7863             nan     0.1000   -0.0001
   140        0.7853             nan     0.1000   -0.0002
   150        0.7848             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9733             nan     0.1000    0.0204
     2        0.9422             nan     0.1000    0.0152
     3        0.9177             nan     0.1000    0.0127
     4        0.8973             nan     0.1000    0.0101
     5        0.8815             nan     0.1000    0.0080
     6        0.8668             nan     0.1000    0.0069
     7        0.8548             nan     0.1000    0.0055
     8        0.8441             nan     0.1000    0.0051
     9        0.8354             nan     0.1000    0.0042
    10        0.8286             nan     0.1000    0.0032
    20        0.7902             nan     0.1000    0.0010
    40        0.7647             nan     0.1000    0.0000
    60        0.7552             nan     0.1000   -0.0002
    80        0.7483             nan     0.1000   -0.0001
   100        0.7437             nan     0.1000   -0.0001
   120        0.7387             nan     0.1000   -0.0000
   140        0.7339             nan     0.1000   -0.0001
   150        0.7320             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9705             nan     0.1000    0.0220
     2        0.9355             nan     0.1000    0.0166
     3        0.9088             nan     0.1000    0.0127
     4        0.8858             nan     0.1000    0.0106
     5        0.8690             nan     0.1000    0.0088
     6        0.8538             nan     0.1000    0.0076
     7        0.8409             nan     0.1000    0.0065
     8        0.8300             nan     0.1000    0.0053
     9        0.8202             nan     0.1000    0.0045
    10        0.8138             nan     0.1000    0.0030
    20        0.7705             nan     0.1000    0.0008
    40        0.7464             nan     0.1000   -0.0001
    60        0.7364             nan     0.1000   -0.0001
    80        0.7285             nan     0.1000   -0.0006
   100        0.7219             nan     0.1000   -0.0003
   120        0.7173             nan     0.1000   -0.0001
   140        0.7130             nan     0.1000   -0.0003
   150        0.7115             nan     0.1000   -0.0005

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9742             nan     0.1000    0.0197
     2        0.9434             nan     0.1000    0.0151
     3        0.9198             nan     0.1000    0.0118
     4        0.9021             nan     0.1000    0.0093
     5        0.8867             nan     0.1000    0.0076
     6        0.8740             nan     0.1000    0.0062
     7        0.8636             nan     0.1000    0.0049
     8        0.8564             nan     0.1000    0.0040
     9        0.8486             nan     0.1000    0.0032
    10        0.8445             nan     0.1000    0.0019
    20        0.8182             nan     0.1000    0.0008
    40        0.7936             nan     0.1000    0.0004
    60        0.7818             nan     0.1000    0.0000
    80        0.7760             nan     0.1000   -0.0000
   100        0.7731             nan     0.1000    0.0001
   120        0.7719             nan     0.1000   -0.0001
   140        0.7703             nan     0.1000   -0.0005
   150        0.7698             nan     0.1000   -0.0000

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9708             nan     0.1000    0.0224
     2        0.9379             nan     0.1000    0.0166
     3        0.9109             nan     0.1000    0.0130
     4        0.8865             nan     0.1000    0.0112
     5        0.8698             nan     0.1000    0.0082
     6        0.8545             nan     0.1000    0.0078
     7        0.8436             nan     0.1000    0.0053
     8        0.8330             nan     0.1000    0.0051
     9        0.8235             nan     0.1000    0.0042
    10        0.8153             nan     0.1000    0.0038
    20        0.7775             nan     0.1000    0.0006
    40        0.7508             nan     0.1000   -0.0001
    60        0.7425             nan     0.1000   -0.0001
    80        0.7372             nan     0.1000   -0.0002
   100        0.7316             nan     0.1000   -0.0001
   120        0.7269             nan     0.1000   -0.0001
   140        0.7233             nan     0.1000   -0.0003
   150        0.7223             nan     0.1000   -0.0003

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9659             nan     0.1000    0.0227
     2        0.9288             nan     0.1000    0.0172
     3        0.9012             nan     0.1000    0.0136
     4        0.8795             nan     0.1000    0.0108
     5        0.8608             nan     0.1000    0.0091
     6        0.8450             nan     0.1000    0.0073
     7        0.8336             nan     0.1000    0.0055
     8        0.8225             nan     0.1000    0.0051
     9        0.8138             nan     0.1000    0.0037
    10        0.8061             nan     0.1000    0.0034
    20        0.7621             nan     0.1000    0.0003
    40        0.7372             nan     0.1000    0.0001
    60        0.7253             nan     0.1000   -0.0003
    80        0.7192             nan     0.1000   -0.0002
   100        0.7151             nan     0.1000   -0.0003
   120        0.7121             nan     0.1000   -0.0004
   140        0.7085             nan     0.1000   -0.0004
   150        0.7070             nan     0.1000   -0.0005

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9778             nan     0.1000    0.0179
     2        0.9491             nan     0.1000    0.0138
     3        0.9285             nan     0.1000    0.0109
     4        0.9116             nan     0.1000    0.0088
     5        0.8978             nan     0.1000    0.0072
     6        0.8853             nan     0.1000    0.0057
     7        0.8754             nan     0.1000    0.0046
     8        0.8678             nan     0.1000    0.0037
     9        0.8625             nan     0.1000    0.0022
    10        0.8576             nan     0.1000    0.0025
    20        0.8314             nan     0.1000    0.0005
    40        0.8080             nan     0.1000    0.0001
    60        0.7944             nan     0.1000    0.0001
    80        0.7880             nan     0.1000    0.0001
   100        0.7834             nan     0.1000   -0.0001
   120        0.7817             nan     0.1000   -0.0004
   140        0.7806             nan     0.1000   -0.0002
   150        0.7799             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9732             nan     0.1000    0.0201
     2        0.9393             nan     0.1000    0.0158
     3        0.9147             nan     0.1000    0.0124
     4        0.8941             nan     0.1000    0.0093
     5        0.8778             nan     0.1000    0.0085
     6        0.8642             nan     0.1000    0.0063
     7        0.8522             nan     0.1000    0.0058
     8        0.8412             nan     0.1000    0.0049
     9        0.8321             nan     0.1000    0.0040
    10        0.8248             nan     0.1000    0.0030
    20        0.7908             nan     0.1000    0.0003
    40        0.7625             nan     0.1000   -0.0001
    60        0.7514             nan     0.1000    0.0001
    80        0.7439             nan     0.1000   -0.0002
   100        0.7396             nan     0.1000   -0.0003
   120        0.7350             nan     0.1000   -0.0003
   140        0.7328             nan     0.1000   -0.0003
   150        0.7315             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9692             nan     0.1000    0.0223
     2        0.9377             nan     0.1000    0.0166
     3        0.9103             nan     0.1000    0.0138
     4        0.8879             nan     0.1000    0.0110
     5        0.8720             nan     0.1000    0.0083
     6        0.8565             nan     0.1000    0.0075
     7        0.8439             nan     0.1000    0.0061
     8        0.8330             nan     0.1000    0.0049
     9        0.8236             nan     0.1000    0.0046
    10        0.8144             nan     0.1000    0.0046
    20        0.7687             nan     0.1000    0.0006
    40        0.7437             nan     0.1000    0.0000
    60        0.7329             nan     0.1000   -0.0004
    80        0.7273             nan     0.1000   -0.0002
   100        0.7207             nan     0.1000   -0.0000
   120        0.7172             nan     0.1000   -0.0001
   140        0.7143             nan     0.1000   -0.0002
   150        0.7124             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9724             nan     0.1000    0.0203
     2        0.9391             nan     0.1000    0.0162
     3        0.9123             nan     0.1000    0.0122
     4        0.8937             nan     0.1000    0.0095
     5        0.8753             nan     0.1000    0.0088
     6        0.8611             nan     0.1000    0.0071
     7        0.8518             nan     0.1000    0.0052
     8        0.8419             nan     0.1000    0.0049
     9        0.8329             nan     0.1000    0.0044
    10        0.8251             nan     0.1000    0.0038
    20        0.7824             nan     0.1000    0.0017
    40        0.7586             nan     0.1000    0.0002
    60        0.7492             nan     0.1000    0.0002
    80        0.7440             nan     0.1000   -0.0001
   100        0.7401             nan     0.1000    0.0000
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9797             nan     0.1000    0.0173
     2        0.9518             nan     0.1000    0.0134
     3        0.9292             nan     0.1000    0.0104
     4        0.9125             nan     0.1000    0.0083
     5        0.9000             nan     0.1000    0.0065
     6        0.8903             nan     0.1000    0.0053
     7        0.8813             nan     0.1000    0.0043
     8        0.8751             nan     0.1000    0.0036
     9        0.8683             nan     0.1000    0.0030
    10        0.8632             nan     0.1000    0.0022
    20        0.8363             nan     0.1000    0.0007
    40        0.8121             nan     0.1000   -0.0000
    60        0.7995             nan     0.1000   -0.0001
    80        0.7937             nan     0.1000    0.0001
   100        0.7896             nan     0.1000   -0.0004
   120        0.7876             nan     0.1000   -0.0000
   140        0.7869             nan     0.1000   -0.0004
   150        0.7869             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9733             nan     0.1000    0.0195
     2        0.9441             nan     0.1000    0.0150
     3        0.9190             nan     0.1000    0.0127
     4        0.9001             nan     0.1000    0.0096
     5        0.8830             nan     0.1000    0.0080
     6        0.8698             nan     0.1000    0.0068
     7        0.8593             nan     0.1000    0.0053
     8        0.8486             nan     0.1000    0.0041
     9        0.8410             nan     0.1000    0.0038
    10        0.8342             nan     0.1000    0.0032
    20        0.7912             nan     0.1000    0.0016
    40        0.7658             nan     0.1000   -0.0002
    60        0.7560             nan     0.1000    0.0006
    80        0.7502             nan     0.1000   -0.0000
   100        0.7448             nan     0.1000   -0.0001
   120        0.7418             nan     0.1000   -0.0002
   140        0.7379             nan     0.1000   -0.0006
   150        0.7363             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9724             nan     0.1000    0.0216
     2        0.9384             nan     0.1000    0.0164
     3        0.9147             nan     0.1000    0.0124
     4        0.8908             nan     0.1000    0.0106
     5        0.8720             nan     0.1000    0.0089
     6        0.8586             nan     0.1000    0.0071
     7        0.8461             nan     0.1000    0.0060
     8        0.8352             nan     0.1000    0.0054
     9        0.8260             nan     0.1000    0.0040
    10        0.8187             nan     0.1000    0.0031
    20        0.7747             nan     0.1000    0.0015
    40        0.7499             nan     0.1000   -0.0001
    60        0.7414             nan     0.1000   -0.0003
    80        0.7341             nan     0.1000   -0.0002
   100        0.7296             nan     0.1000   -0.0001
   120        0.7262             nan     0.1000   -0.0001
   140        0.7221             nan     0.1000   -0.0003
   150        0.7200             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9761             nan     0.1000    0.0181
     2        0.9494             nan     0.1000    0.0138
     3        0.9257             nan     0.1000    0.0106
     4        0.9081             nan     0.1000    0.0085
     5        0.8948             nan     0.1000    0.0067
     6        0.8840             nan     0.1000    0.0056
     7        0.8752             nan     0.1000    0.0045
     8        0.8666             nan     0.1000    0.0035
     9        0.8609             nan     0.1000    0.0029
    10        0.8567             nan     0.1000    0.0016
    20        0.8311             nan     0.1000    0.0003
    40        0.8058             nan     0.1000    0.0004
    60        0.7948             nan     0.1000    0.0000
    80        0.7879             nan     0.1000    0.0001
   100        0.7840             nan     0.1000   -0.0001
   120        0.7820             nan     0.1000   -0.0002
   140        0.7799             nan     0.1000   -0.0001
   150        0.7795             nan     0.1000   -0.0003

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9753             nan     0.1000    0.0198
     2        0.9443             nan     0.1000    0.0160
     3        0.9209             nan     0.1000    0.0125
     4        0.8991             nan     0.1000    0.0101
     5        0.8812             nan     0.1000    0.0086
     6        0.8666             nan     0.1000    0.0069
     7        0.8559             nan     0.1000    0.0055
     8        0.8466             nan     0.1000    0.0042
     9        0.8384             nan     0.1000    0.0043
    10        0.8319             nan     0.1000    0.0035
    20        0.7890             nan     0.1000    0.0008
    40        0.7649             nan     0.1000    0.0006
    60        0.7555             nan     0.1000   -0.0001
    80        0.7480             nan     0.1000   -0.0002
   100        0.7432             nan     0.1000   -0.0003
   120        0.7384             nan     0.1000   -0.0001
   140        0.7342             nan     0.1000   -0.0000
   150        0.7327             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9701             nan     0.1000    0.0221
     2        0.9352             nan     0.1000    0.0169
     3        0.9105             nan     0.1000    0.0125
     4        0.8906             nan     0.1000    0.0098
     5        0.8719             nan     0.1000    0.0087
     6        0.8572             nan     0.1000    0.0070
     7        0.8463             nan     0.1000    0.0049
     8        0.8353             nan     0.1000    0.0056
     9        0.8262             nan     0.1000    0.0042
    10        0.8175             nan     0.1000    0.0040
    20        0.7740             nan     0.1000    0.0000
    40        0.7507             nan     0.1000    0.0005
    60        0.7383             nan     0.1000    0.0001
    80        0.7303             nan     0.1000   -0.0003
   100        0.7251             nan     0.1000   -0.0001
   120        0.7211             nan     0.1000   -0.0004
   140        0.7175             nan     0.1000   -0.0001
   150        0.7153             nan     0.1000   -0.0000

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9787             nan     0.1000    0.0192
     2        0.9484             nan     0.1000    0.0151
     3        0.9243             nan     0.1000    0.0121
     4        0.9039             nan     0.1000    0.0095
     5        0.8885             nan     0.1000    0.0075
     6        0.8769             nan     0.1000    0.0061
     7        0.8668             nan     0.1000    0.0050
     8        0.8594             nan     0.1000    0.0041
     9        0.8540             nan     0.1000    0.0024
    10        0.8477             nan     0.1000    0.0031
    20        0.8162             nan     0.1000    0.0007
    40        0.7941             nan     0.1000    0.0003
    60        0.7820             nan     0.1000   -0.0000
    80        0.7754             nan     0.1000    0.0000
   100        0.7719             nan     0.1000   -0.0002
   120        0.7700             nan     0.1000   -0.0001
   140        0.7687             nan     0.1000   -0.0001
   150        0.7683             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9699             nan     0.1000    0.0212
     2        0.9357             nan     0.1000    0.0167
     3        0.9099             nan     0.1000    0.0125
     4        0.8886             nan     0.1000    0.0111
     5        0.8710             nan     0.1000    0.0087
     6        0.8564             nan     0.1000    0.0074
     7        0.8442             nan     0.1000    0.0060
     8        0.8337             nan     0.1000    0.0053
     9        0.8255             nan     0.1000    0.0042
    10        0.8175             nan     0.1000    0.0039
    20        0.7729             nan     0.1000    0.0011
    40        0.7491             nan     0.1000    0.0000
    60        0.7384             nan     0.1000    0.0002
    80        0.7318             nan     0.1000   -0.0000
   100        0.7283             nan     0.1000   -0.0005
   120        0.7252             nan     0.1000   -0.0003
   140        0.7206             nan     0.1000   -0.0001
   150        0.7194             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9667             nan     0.1000    0.0233
     2        0.9282             nan     0.1000    0.0181
     3        0.8992             nan     0.1000    0.0135
     4        0.8784             nan     0.1000    0.0109
     5        0.8602             nan     0.1000    0.0088
     6        0.8426             nan     0.1000    0.0078
     7        0.8293             nan     0.1000    0.0063
     8        0.8196             nan     0.1000    0.0051
     9        0.8090             nan     0.1000    0.0039
    10        0.8015             nan     0.1000    0.0034
    20        0.7535             nan     0.1000    0.0010
    40        0.7302             nan     0.1000   -0.0004
    60        0.7212             nan     0.1000   -0.0004
    80        0.7147             nan     0.1000   -0.0003
   100        0.7088             nan     0.1000   -0.0001
   120        0.7060             nan     0.1000   -0.0004
   140        0.7013             nan     0.1000   -0.0002
   150        0.6989             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9755             nan     0.1000    0.0181
     2        0.9453             nan     0.1000    0.0134
     3        0.9248             nan     0.1000    0.0105
     4        0.9060             nan     0.1000    0.0083
     5        0.8931             nan     0.1000    0.0066
     6        0.8822             nan     0.1000    0.0053
     7        0.8737             nan     0.1000    0.0044
     8        0.8663             nan     0.1000    0.0036
     9        0.8602             nan     0.1000    0.0030
    10        0.8548             nan     0.1000    0.0023
    20        0.8299             nan     0.1000    0.0009
    40        0.8080             nan     0.1000    0.0001
    60        0.7977             nan     0.1000   -0.0000
    80        0.7910             nan     0.1000   -0.0001
   100        0.7885             nan     0.1000   -0.0001
   120        0.7872             nan     0.1000   -0.0001
   140        0.7859             nan     0.1000   -0.0002
   150        0.7855             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9701             nan     0.1000    0.0201
     2        0.9386             nan     0.1000    0.0165
     3        0.9118             nan     0.1000    0.0128
     4        0.8923             nan     0.1000    0.0102
     5        0.8752             nan     0.1000    0.0082
     6        0.8614             nan     0.1000    0.0066
     7        0.8493             nan     0.1000    0.0061
     8        0.8401             nan     0.1000    0.0048
     9        0.8314             nan     0.1000    0.0039
    10        0.8236             nan     0.1000    0.0031
    20        0.7825             nan     0.1000    0.0011
    40        0.7618             nan     0.1000    0.0002
    60        0.7485             nan     0.1000   -0.0001
    80        0.7406             nan     0.1000    0.0002
   100        0.7370             nan     0.1000   -0.0002
   120        0.7342             nan     0.1000   -0.0002
   140        0.7303             nan     0.1000   -0.0002
   150        0.7288             nan     0.1000   -0.0001

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9685             nan     0.1000    0.0228
     2        0.9352             nan     0.1000    0.0171
     3        0.9080             nan     0.1000    0.0140
     4        0.8865             nan     0.1000    0.0112
     5        0.8692             nan     0.1000    0.0090
     6        0.8522             nan     0.1000    0.0075
     7        0.8392             nan     0.1000    0.0064
     8        0.8277             nan     0.1000    0.0054
     9        0.8196             nan     0.1000    0.0034
    10        0.8095             nan     0.1000    0.0046
    20        0.7656             nan     0.1000    0.0011
    40        0.7424             nan     0.1000    0.0004
    60        0.7325             nan     0.1000    0.0001
    80        0.7263             nan     0.1000   -0.0002
   100        0.7214             nan     0.1000   -0.0003
   120        0.7173             nan     0.1000   -0.0001
   140        0.7142             nan     0.1000   -0.0003
   150        0.7123             nan     0.1000   -0.0005

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9743             nan     0.1000    0.0194
     2        0.9456             nan     0.1000    0.0143
     3        0.9228             nan     0.1000    0.0115
     4        0.9025             nan     0.1000    0.0092
     5        0.8873             nan     0.1000    0.0072
     6        0.8756             nan     0.1000    0.0058
     7        0.8666             nan     0.1000    0.0047
     8        0.8592             nan     0.1000    0.0039
     9        0.8528             nan     0.1000    0.0033
    10        0.8472             nan     0.1000    0.0026
    20        0.8226             nan     0.1000    0.0003
    40        0.7990             nan     0.1000    0.0003
    60        0.7870             nan     0.1000    0.0000
    80        0.7812             nan     0.1000   -0.0003
   100        0.7789             nan     0.1000   -0.0001
   120        0.7768             nan     0.1000   -0.0001
   140        0.7761             nan     0.1000   -0.0001
   150        0.7757             nan     0.1000   -0.0000

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9681             nan     0.1000    0.0214
     2        0.9378             nan     0.1000    0.0162
     3        0.9115             nan     0.1000    0.0129
     4        0.8904             nan     0.1000    0.0100
     5        0.8715             nan     0.1000    0.0082
     6        0.8573             nan     0.1000    0.0063
     7        0.8447             nan     0.1000    0.0053
     8        0.8363             nan     0.1000    0.0037
     9        0.8273             nan     0.1000    0.0043
    10        0.8193             nan     0.1000    0.0032
    20        0.7841             nan     0.1000    0.0004
    40        0.7589             nan     0.1000   -0.0001
    60        0.7508             nan     0.1000   -0.0001
    80        0.7441             nan     0.1000   -0.0003
   100        0.7392             nan     0.1000   -0.0001
   120        0.7345             nan     0.1000   -0.0001
   140        0.7313             nan     0.1000   -0.0002
   150        0.7289             nan     0.1000   -0.0002

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9671             nan     0.1000    0.0223
     2        0.9325             nan     0.1000    0.0173
     3        0.9053             nan     0.1000    0.0130
     4        0.8848             nan     0.1000    0.0109
     5        0.8687             nan     0.1000    0.0086
     6        0.8540             nan     0.1000    0.0076
     7        0.8405             nan     0.1000    0.0069
     8        0.8291             nan     0.1000    0.0052
     9        0.8194             nan     0.1000    0.0048
    10        0.8108             nan     0.1000    0.0037
    20        0.7683             nan     0.1000    0.0004
    40        0.7431             nan     0.1000   -0.0002
    60        0.7340             nan     0.1000   -0.0003
    80        0.7269             nan     0.1000   -0.0001
   100        0.7222             nan     0.1000   -0.0003
   120        0.7183             nan     0.1000   -0.0003
   140        0.7149             nan     0.1000   -0.0003
   150        0.7143             nan     0.1000   -0.0000

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.9697             nan     0.1000    0.0224
     2        0.9357             nan     0.1000    0.0163
     3        0.9085             nan     0.1000    0.0135
     4        0.8865             nan     0.1000    0.0114
     5        0.8686             nan     0.1000    0.0091
     6        0.8529             nan     0.1000    0.0073
     7        0.8410             nan     0.1000    0.0062
     8        0.8296             nan     0.1000    0.0054
     9        0.8207             nan     0.1000    0.0043
    10        0.8131             nan     0.1000    0.0033
    20        0.7686             nan     0.1000    0.0009
    40        0.7420             nan     0.1000    0.0001
    60        0.7348             nan     0.1000    0.0001
    80        0.7284             nan     0.1000   -0.0001
   100        0.7227             nan     0.1000   -0.0002
Loading required package: pROC
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'
The following objects are masked from 'package:stats':

    cov, smooth, var
Loading required package: PRROC
Warning: package 'PRROC' was built under R version 4.3.3
Loading required package: rlang
Warning: package 'rlang' was built under R version 4.3.3

Attaching package: 'rlang'
The following objects are masked from 'package:purrr':

    %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice
The following object is masked from 'package:backports':

    %||%
The following object is masked from 'package:data.table':

    :=
Loading required package: ModelMetrics

Attaching package: 'ModelMetrics'
The following object is masked from 'package:pROC':

    auc
The following objects are masked from 'package:caret':

    confusionMatrix, precision, recall, sensitivity, specificity
The following object is masked from 'package:base':

    kappa
Setting levels: control = N, case = Y
Setting direction: controls < cases
Preparation of a new explainer is initiated
  -> model label       :  gbm 
  -> data              :  4963  rows  40  cols 
  -> target variable   :  0  values 
  -> target variable   :  length of 'y' is different than number of rows in 'data' (  WARNING  ) 
  -> predict function  :  yhat.train  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package caret , ver. 7.0.1 , task classification (  default  ) 
  -> predicted values  :  numerical, min =  0.02556174 , mean =  0.2033584 , max =  0.7417174  
  -> residual function :  difference between y and yhat (  default  )
Warning in min(residuals): no non-missing arguments to min; returning Inf
Warning in max(residuals): no non-missing arguments to max; returning -Inf
  -> residuals         :  numerical, min =  Inf , mean =  NaN , max =  -Inf  
  A new explainer has been created!  

Visualize results

Here we create a plot of the feature importance scores for each of the top predictors identified by MLHO.

To do so, let’s map the concept codes to their “English” translation. That’s why we kept the DESCRIPTION column in dbmart.

# Ranked predictors stored on the MLHO model result.
features <- model.test$features

# Feature names look like "<concept code>_<suffix>"; drop everything from the
# first underscore onward so the bare concept code can be used as a join key.
features$features.new <- sub("_.*", "", features$features)

# Concept dictionary: keep the first row seen for each phenx code together
# with its description.
dbmart.concepts <- dbmart[
  !duplicated(paste0(dbmart$phenx)),
  c("phenx", "DESCRIPTION")
]

# Attach the description to every feature via the bare concept code.
mlho.features <- data.frame(
  merge(
    x = features,
    y = dbmart.concepts,
    by.x = "features.new",
    by.y = "phenx"
  )
)

# Interactive table: feature, description and importance score.
mlho.features %>%
  dplyr::select(features, DESCRIPTION, `Feature importance` = Overall) %>%
  datatable(options = list(pageLength = 5), filter = "bottom")

Now we visualize the feature importance scores.

# Lollipop chart of feature importance: one horizontal segment per feature
# running from 0 to its `Overall` score, capped with a red point. Features are
# ordered by score and the axes are flipped so descriptions read horizontally.
# The outer parentheses print the plot in addition to assigning it.
(plot <- ggplot(mlho.features) +
  geom_segment(
    aes(
      y = 0,
      x = reorder(DESCRIPTION, Overall),
      yend = Overall,
      xend = DESCRIPTION
    ),
    # `linewidth` replaces the `size` aesthetic for lines, which was
    # deprecated in ggplot2 3.4.0.
    linewidth = 0.5,
    alpha = 0.5
  ) +
  geom_point(
    aes(x = reorder(DESCRIPTION, Overall), y = Overall),
    alpha = 0.5,
    size = 2,
    color = "red"
  ) +
  theme_minimal() +
  coord_flip() +
  labs(y = "Feature importance", x = ""))

SHAP value and visualization

When setting calSHAP=TRUE, SHAP values are also calculated to explain the output of models as shown below.

# SHAP results stored on the MLHO model object.
shap_value <- model.test$shap

# Preview the first rows as a plain data frame.
shap_value %>%
  as.data.frame() %>%
  head()
                               variable  contribution     variable_name
1000126_1                 1000126_1 = 0 -0.0156173734         1000126_1
124171000119105_1 124171000119105_1 = 0  0.0005751655 124171000119105_1
15777000_1               15777000_1 = 0 -0.0096777396        15777000_1
162864005_1             162864005_1 = 0 -0.0102141640       162864005_1
18262-6_1                 18262-6_1 = 0 -0.0014447155         18262-6_1
1870230_1                 1870230_1 = 0  0.0007779556         1870230_1
                  variable_value sign label B
1000126_1                      0   -1   gbm 0
124171000119105_1              0    1   gbm 0
15777000_1                     0   -1   gbm 0
162864005_1                    0   -1   gbm 0
18262-6_1                      0   -1   gbm 0
1870230_1                      0    1   gbm 0
# Plot the SHAP object with its registered plot method.
plot(shap_value)

# Lookup table from model feature name to description, with the feature column
# renamed to "phenx". `select` is namespace-qualified because several attached
# packages can mask it.
dbmart.concepts.new <- dplyr::select(
  mlho.features,
  phenx = features,
  DESCRIPTION
)

# Waterfall view of the SHAP contributions, labelled with the lookup table.
# NOTE(review): `top_n` and `num` are passed through unchanged; confirm their
# exact meaning against the mshapviz definition.
mshapviz(
  shap_value,
  dbmart.concepts.new,
  plot_type = "waterfall",
  top_n = 6,
  num = 1
)
Selecting by abs_S
Selecting by abs_S

# Overview of the SHAP results using the same feature-description lookup.
# NOTE(review): top_n = 5 presumably limits the display to five features
# (the console output reports selection by `mean_S`); confirm against the
# mshapviz_all definition.
mshapviz_all(shap_value, dbmart.concepts.new, top_n = 5)
Selecting by mean_S
Selecting by mean_S