#######################################################################
# Note that this file can be run directly in R.
#######################################################################

#
# EXAMPLE SESSION FOR INFERRING GENETIC NETWORKS
# 


# load GeneTS library
library(GeneTS)

#######################################################################

# GET DATA:

# As an example we select 42 genes from the Caulobacter data set
# load the Caulobacter example data set
data(caulobacter)

# test every gene for periodicity and keep only the genes flagged as
# significant under FDR control at level Q = 0.05
pval.caulobacter <- fisher.g.test(caulobacter)
fdr.conservative <- fdr.control(pval.caulobacter, Q = 0.05)
data.matrix <- caulobacter[, fdr.conservative$significant]

# remove some unknown ORFs
# The mask is sized from the data rather than hard-coded: a logical index
# that is shorter than the number of columns would be silently recycled
# and drop additional, unintended columns.
keep <- rep(TRUE, ncol(data.matrix))
keep[7] <- FALSE   #06901
keep[22] <- FALSE  #04476
keep[27] <- FALSE  #02688
data.matrix <- data.matrix[, keep]


#######################################################################

# THE DATA:

# the normalized data need to be ready in time series format, i.e. in
# a matrix where each *column* corresponds to a gene, and where the
# *rows* correspond to the individual measurements (time points).

# our example data is stored in "data.matrix"
# print the example data held in "data.matrix"
data.matrix

# its dimensions: 11 time points (rows) by 42 genes (columns)
dim(data.matrix)

# every gene (column) is one node of the network
num.nodes <- ncol(data.matrix)

# node.labels
# One display label per gene, in the column order of the data.
# NOTE(review): "#04700" occurs twice (positions 10 and 34) - confirm
# that this is intended.
node.labels <- c(
  "CheA", "CheR", "CheD", "ABC transporter", "hfaA",              #  1 -  5
  "#06446", "#02759", "peptidase (M23/M37)", "#03144", "#04700",  #  6 - 10
  "fljO 1", "fljK", "fljN", "#4480", "flbT",                      # 11 - 15
  "LexA", "fljM 1", "fljO 2", "#08039", "#04977",                 # 16 - 20
  "#02998 (5-repeat)", "#02058 (S-transferase)", "fljM 2",        # 21 - 23
  "#02730", "divK", "orfA", "#03649", "DnaA",                     # 24 - 28
  "bacA", "#01232 (regulator)", "fljL", "#05886 (GGDEF)",         # 29 - 32
  "McpH", "#04700", "#01720", "neuB", "#02930",                   # 33 - 37
  "#03170", "cheW", "#01459 (receptor)", "CtrA", "fliJ"           # 38 - 42
)


#######################################################################

# INFER GRAPHICAL GAUSSIAN MODEL:

### Estimate partial correlation matrix

# the three estimators for partial correlation
# mentioned in Schaefer and Strimmer (2003)

# alternative estimators, left disabled:
#   pcor.pi1 <- ggm.estimate.pcor(data.matrix, method = "observed.pcor")
#   pcor.pi3 <- ggm.estimate.pcor(data.matrix, method = "bagged.pcor")

# pi2: the partial bagged correlation, requested with R = 10000
pcor.pi2 <- ggm.estimate.pcor(
  data.matrix,
  method = "partial.bagged.cor",
  R = 10000
)

# pi2 is the estimate carried forward, since N << G for this data set
inferred.pcor <- pcor.pi2

# note: if you have a better estimate of the partial correlations
# (e.g. from another estimator, or from using some external information)
# you may plug it in here as well - before testing the edges


### Test edges in the network:

# p-values, q-values and posterior probabilities for each edge 
# p-values, q-values and posterior probabilities for each edge
test.results <- ggm.test.edges(inferred.pcor)

# show best 20 edges
# (assumes ggm.test.edges returns the edges ordered best-first - TODO confirm)
test.results[1:20, ]

# how many are significant for Q=0.05 ?
num.significant <- sum(test.results$qval <= 0.05)

# seq_len() rather than 1:num.significant: with zero significant edges,
# 1:0 evaluates to c(1, 0) and would wrongly list the first edge.
test.results[seq_len(num.significant), ]

# parameters of the mixture distribution used to compute p-values etc.
cor.fit.mixture(sm2vec(inferred.pcor))


#######################################################################

# PLOT GRAPHICAL GAUSSIAN MODEL:

# Note: this requires the "graph" and "Rgraphviz" packages from www.bioconductor.org 

# generate graph object with all significant edges
# generate graph object with all significant edges
# seq_len() rather than 1:num.significant: with zero significant edges,
# 1:0 evaluates to c(1, 0) and would wrongly add the first edge to the graph.
gr <- ggm.make.graph(test.results[seq_len(num.significant), ], num.nodes)
gr

# print vector of edge weights
show.edge.weights(gr)

# plot network (in a new 12 x 9 graphics window, without edge labels)
X11(width = 12, height = 9)
ggm.plot.graph(gr, node.labels, show.edge.labels = FALSE)

# same plot in a second window, with partial correlations as edge labels
X11(width = 12, height = 9)
ggm.plot.graph(gr, node.labels)


#######################################################################

# GENERATE RANDOM GRAPHICAL GAUSSIAN MODEL:

# generate random network with 20 nodes and 10 percent edges (=19 edges)
# simulated partial correlation matrix: 20 nodes, edge fraction 0.1
true.pcor2 <- ggm.simulate.pcor(20, 0.1)

# turn it into an edge list and keep the first 19 rows
test.results2 <- ggm.test.edges(
  true.pcor2,
  eta0 = 0.9,
  kappa = 1000
)[1:19, ]
test.results2

# build the graph over 20 nodes, show it, and plot it in an 8 x 8 window
gr2 <- ggm.make.graph(test.results2, 20)
gr2
X11(width = 8, height = 8)
ggm.plot.graph(gr2)


#######################################################################

# SIMULATE RANDOM GRAPHICAL GAUSSIAN MODEL, GENERATE DATA, 
# AND RE-ESTIMATE PARTIAL CORRELATIONS:

# generate random network with 40 nodes and 5 percent edges
# "true" model: random network with 40 nodes and edge fraction 0.05
sim.pcor <- ggm.simulate.pcor(40, 0.05)

# draw a data set of 40 observations from that model
m.sim <- ggm.simulate.data(40, sim.pcor)

# first attempt: plain partial correlations of the simulated data
estimated.pcor <- partial.cor(m.sim)

# squared-error distance between the true and the estimated model
sum((sim.pcor - estimated.pcor)^2)

# second attempt: the "bagged.pcor" estimator, scored the same way
estimated.pcor.2 <- ggm.estimate.pcor(m.sim, method = "bagged.pcor")
sum((sim.pcor - estimated.pcor.2)^2)
