In this vignette, we illustrate how to apply the GenAlgo package
to the problem of feature selection in an “omics-scale” data set. We
start by loading the packages that we will need.
library(GenAlgo)
library(Umpire)
library(oompaBase)
We will use the Umpire package to simulate a data set complex enough
to stress our feature selection algorithm. We begin by setting up a
time-to-event model, built on an exponential baseline.
set.seed(391629)
sm <- SurvivalModel(baseHazard=1/5, accrual=5, followUp=1)
Next, we build a “cancer model” with six subtypes.
nBlocks <- 20    # number of possible hits
cm <- CancerModel(name="cansim",
                  nPossible=nBlocks,
                  nPattern=6,
                  OUT = function(n) rnorm(n, 0, 1), 
                  SURV= function(n) rnorm(n, 0, 1),
                  survivalModel=sm)
### Include 100 blocks/pathways that are not hit by cancer
nTotalBlocks <- nBlocks + 100
Now define the hyperparameters for the models.
### block size
blockSize <- round(rnorm(nTotalBlocks, 100, 30))
### hyperparameters for the log-normal means
mu0    <- 6
sigma0 <- 1.5
### hyperparameters for the log-normal standard deviations
rate   <- 28.11
shape  <- 44.25
### within-block correlation
p <- 0.6
w <- 5
Now set up the baseline “Engine”.
rho <- rbeta(nTotalBlocks, p*w, (1-p)*w)
base <- lapply(1:nTotalBlocks,
               function(i) {
                 bs <- blockSize[i]
                 co <- matrix(rho[i], nrow=bs, ncol=bs) # within-block correlation
                 diag(co) <- 1
                 mu <- rnorm(bs, mu0, sigma0)           # per-feature mean log expression
                 sigma <- matrix(1/rgamma(bs, rate=rate, shape=shape), nrow=1) # per-feature SDs
                 covo <- co * (t(sigma) %*% sigma)      # covariance = correlation scaled by SDs
                 MVN(mu, covo)
               })
eng <- Engine(base)
For blocks that are hit by cancer, we alter the means; blocks that are not hit retain the original engine components.
altered <- alterMean(eng, normalOffset, delta=0, sigma=1)
object <- CancerEngine(cm, eng, altered)
summary(object)
## A 'CancerEngine' using the cancer model:
## --------------
## cansim , a CancerModel object constructed via the function call:
##  CancerModel(name = "cansim", nPossible = nBlocks, nPattern = 6, SURV = function(n) rnorm(n, 0, 1), OUT = function(n) rnorm(n, 0, 1), survivalModel = sm) 
## 
## Pattern prevalences:
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## 
## Survival effects:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.13455 -0.06092  0.30563  0.27837  0.73876  2.51446 
## 
## Outcome effects:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.335728 -0.590101  0.009406 -0.104634  0.143112  1.454560 
## --------------
## 
## Base expression given by:
## An Engine with 120 components.
## 
## Altered expression given by:
## An Engine with 120 components.
rm(altered, base, blockSize, cm, eng, mu0, nBlocks, nTotalBlocks,
   p, rate, rho, shape, sigma0, sm, w)
Now we can use this elaborate setup to generate the simulated data.
train <- rand(object, 198)
tdata <- train$data
pid <- paste("PID", sample(1001:9999, 198+93), sep='')
rownames(train$clinical) <- colnames(tdata) <- pid[1:198]
Of course, to make things harder, we will add noise to the simulated measurements.
noise <- NoiseModel(3, 1, 1e-16)
train$data <- log2(blur(noise, 2^(tdata)))
sum(is.na(train$data))
## [1] 0
rm(tdata)
summary(train$clinical)
##  CancerSubType   Outcome         LFU          Event        
##  Min.   :1.000   Bad :106   Min.   : 0.00   Mode :logical  
##  1st Qu.:2.000   Good: 92   1st Qu.: 3.00   FALSE:88       
##  Median :4.000              Median :19.00   TRUE :110      
##  Mean   :3.662              Mean   :22.32                  
##  3rd Qu.:5.000              3rd Qu.:36.75                  
##  Max.   :6.000              Max.   :67.00
summary(train$data[, 1:3])
##     PID7842          PID6153          PID2085      
##  Min.   : 1.410   Min.   : 1.337   Min.   : 1.724  
##  1st Qu.: 5.021   1st Qu.: 5.008   1st Qu.: 5.071  
##  Median : 6.084   Median : 6.048   Median : 6.117  
##  Mean   : 6.127   Mean   : 6.099   Mean   : 6.163  
##  3rd Qu.: 7.172   3rd Qu.: 7.122   3rd Qu.: 7.192  
##  Max.   :11.705   Max.   :12.687   Max.   :12.170
Now we can also simulate a validation data set.
valid <- rand(object, 93)
vdata <- valid$data
vdata <- log2(blur(noise, 2^(vdata))) # add noise
sum(is.na(vdata))
## [1] 0
vdata[is.na(vdata)] <- 0.26347 # impute any missing values left by the noise model (none here)
valid$data <- vdata
colnames(valid$data) <- rownames(valid$clinical) <- pid[199:291]
rm(vdata, noise, object, pid)
summary(valid$clinical)
##  CancerSubType   Outcome        LFU          Event        
##  Min.   :1.000   Bad :54   Min.   : 0.00   Mode :logical  
##  1st Qu.:2.000   Good:39   1st Qu.: 3.00   FALSE:28       
##  Median :3.000             Median :15.00   TRUE :65       
##  Mean   :3.172             Mean   :22.32                  
##  3rd Qu.:5.000             3rd Qu.:35.00                  
##  Max.   :6.000             Max.   :71.00
summary(valid$data[, 1:3])
##     PID6584          PID5256          PID2944      
##  Min.   : 1.287   Min.   : 1.317   Min.   : 1.760  
##  1st Qu.: 4.957   1st Qu.: 4.997   1st Qu.: 5.022  
##  Median : 5.995   Median : 6.030   Median : 6.064  
##  Mean   : 6.067   Mean   : 6.083   Mean   : 6.121  
##  3rd Qu.: 7.109   3rd Qu.: 7.116   3rd Qu.: 7.151  
##  Max.   :12.205   Max.   :12.223   Max.   :12.341
Now we can start using the GenAlgo package. The key step is to define
sensible functions that can measure the “fitness” of a solution and
introduce “mutations”. When these functions are called, they are passed a
context argument that can be used to access extra information about
how to proceed. In this case, that context will be the train object,
which includes the clinical information about the samples.
Now we can define the fitness function. The idea is to compute the Mahalanobis distance between the two groups (of “Good” or “Bad” outcome samples) in the space defined by the selected features.
measureFitness <- function(arow, context) {
  predictors <- t(context$data[arow, ]) # space defined by features
  groups <- context$clinical$Outcome    # good or bad outcome
  maha(predictors, groups, method='var')
}
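As a quick sanity check (purely illustrative; the value depends on which features happen to be drawn), we can evaluate the fitness of one randomly chosen set of ten features:
testSet <- sample(nrow(train$data), 10)
measureFitness(testSet, context=train)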
The mutation function randomly chooses any other feature/row to swap out possible predictors of the outcome.
mutator <- function(allele, context) {
  sample(nrow(context$data), 1)
}
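For example, calling the mutator on any allele (note that this particular mutator ignores the current allele value) simply proposes a fresh random feature index:
mutator(17, train) # returns a random integer between 1 and nrow(train$data)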
We need to decide how many features to include in a potential predictor (here we use ten). We also need to decide how big a population of feature-sets (here we use 200) should be used in each generation of the genetic algorithm.
set.seed(821831)
n.individuals <- 200
n.features <- 10
y <- matrix(0, n.individuals, n.features)
for (i in 1:n.individuals) {
  y[i,] <- sample(1:nrow(train$data), n.features)
}
Having chosen the starting population, we can run the first step of the genetic algorithm.
my.ga <- GenAlg(y, measureFitness, mutator, context=train) # initialize
summary(my.ga)
## An object representing generation 1 in a genetic algorithm.
## Population size: 200 
## Mutation probability: 0.001 
## Crossover probability: 0.5 
## Fitness distribution:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02137 0.14909 0.22210 0.24071 0.31384 0.58730
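The same quantities are also available programmatically from slots of the GenAlg object; for example:
my.ga@best.fit      # maximum fitness in generation 1
mean(my.ga@fitness) # mean fitness in generation 1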
To be able to evaluate things later, we save the starting generation.
recurse <- my.ga
pop0 <- sort(table(as.vector(my.ga@data)))
Realistically, we probably want to run a couple of hundred or even a couple of thousand iterations of the algorithm. But, in the interests of completing this vignette in a reasonable amount of time, we are only going to iterate through 20 generations.
NGEN <- 20
diversity <- meanfit <- fitter <- rep(NA, NGEN)
for (i in 1:NGEN) {
  recurse <- newGeneration(recurse)
  fitter[i] <- recurse@best.fit
  meanfit[i] <- mean(recurse@fitness)
  diversity[i] <- popDiversity(recurse)
}
Plot max and mean fitness by generation. This figure shows that both the mean and the maximum fitness are increasing.
plot(fitter, type='l', ylim=c(0, 1.5), xlab="Generation", ylab="Fitness")
abline(h=max(fitter), col='gray', lty=2)
lines(fitter)
lines(meanfit, col='gray')
points(meanfit, pch=16, col=jetColors(NGEN))
legend("bottomleft", c("Maximum", "Mean"), col=c("black", "gray"), lwd=2)
Plot the diversity of the population, to see that it is decreasing.
plot(diversity, col='gray', type='l', ylim=c(0,10), xlab='', ylab='', yaxt='n')
points(diversity, pch=16, col=jetColors(NGEN))
See which predictors get selected most frequently in the latest generation.
sort(table(as.vector(recurse@data)))
## 
##  1137  2839  3318  5599  9337  2852  3439  4248  5851  7642  7720  9039 
##     1     1     1     1     1     2     2     2     2     2     2     2 
##  9771 11427    49   918  1599  2390  5344  6130  6333  8322  8725  8807 
##     2     2     3     3     3     3     3     3     3     3     3     3 
##  9150  9512 10186 10884 11139  1958  4632  4991  5094  6601  9728 11091 
##     3     3     3     3     3     4     4     4     4     4     4     4 
## 12304    93   456  3801  3857  6386  6657  7680  8741 10825 11512  4100 
##     4     5     5     5     5     5     5     5     5     5     5     6 
##  9044 12161  2845  6022  9923  7846  5152  5473  9574    81 10220 10965 
##     6     6     7     7     7     8     9     9     9    11    11    11 
##  2335  3055  9565 10435  1176  6440  8749   449 10765  1534  6959  7306 
##    12    12    12    12    14    15    16    17    17    18    18    19 
## 11164  7000  8651  3838  8973  9524  5086  8832 11464  8683  1451  4976 
##    20    21    21    22    23    24    27    27    28    29    32    34 
##  4953  6568  8926  7348  1394  9874 10247   171  6117  8600  1855  1935 
##    38    44    49    61    62    65    72    84    88   101   120   152 
##   936  6252 
##   158   159
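Finally, we can extract the fittest feature set found so far. Here we assume the standard slots of the GenAlg object (best.fit, which we already used above when tracking progress, and best.individual, which holds the corresponding feature indices); the exact indices will vary from run to run.
recurse@best.fit                   # fitness of the best individual
as.vector(recurse@best.individual) # row indices of its ten features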