Rcode: Variable Selection


Load required libraries

library(MASS)
library(igraph)
library(corrplot)
library(ggplot2)

Generate a 10×10 precision matrix from an AR(1) model and draw the corresponding graph based on the adjacency matrix.

p=10               ## number of variables
C = diag(rep(1,p)) ## true covariance matrix
for(i in 1:p){
C[i,i-1]=C[i-1,i]=0.5
}
Sigma = solve(C)

## Adjacency matrix for the graph
A = matrix(as.numeric(abs(C) > 0), p, p) 

net=graph.adjacency(C, mode="undirected", weighted=TRUE, diag=FALSE)
V(net)       ## nodes in the graph
E(net)       ## edges in the graph
plot(net)

Generate n samples from N(0, Sigma), and compute and display the sample covariance and precision matrices. Here the so-called sample precision matrix is just the inverse of the sample covariance, the MLE of the precision matrix.

n= 30     ## num of observations
Y = as.data.frame(mvrnorm(n,rep(0,p),Sigma))
S = cov(Y)           ## sample covariance matrix

## plot the true and sample covariance matrices
par(mfrow=c(1,2))
corrplot(Sigma, method = "ellipse", is.corr=FALSE)
corrplot(S, method = "ellipse", is.corr=FALSE)

## plot the true and sample precision matrices 
Theta = solve(S)
par(mfrow=c(1,2)) 
corrplot(C, method = "ellipse", is.corr=FALSE)
corrplot(Theta, method = "ellipse", is.corr=FALSE)

Regress each column of Y on the other columns using variable selection procedures (such as, stepwise BIC) to estimate the zero-pattern of the precision matrix. The variable selection result is stored in this p-by-p binary indicator matrix Z (you can ignore its diagonal entries).

Z = diag(p) ## initiate the indicator matrix
names(Y)    ## Y: data matrix with the i-th column named Vi

for(i in 1:p){
  ## automatically generate the modelnames in each iteration
  uppermodel=paste("V", i, `~', sep='')
  uppermodel = paste(uppermodel, paste(names(Y)[-i], 
               collapse = "+", sep= ""))
  myfit  = lm(uppermodel, data=Y)
  BIC.output = step(myfit,  k=log(n), trace=0)

  ## Based on the BIC output, set the corresponding entries in Z 
  ## to be 1 (i.e., being selected)

  var.id = which(names(Y) %in% attr(BIC.output$terms, "term.labels"))
  if (length(var.id) > 0) Z[i, var.id] = 1
}

The precision matrix is symmetric, so is its zero-pattern. However, the Z matrix returned by this column-by-column regression procedure may not be symmetric. Next we force Z to be symmetric and then threshold its entries (could be > 0 or > 0.5).

Z = (Z + t(Z))/2  
Z = matrix(as.numeric(Z > 0.5), p,p) 

Graph the true graph and the estimated one.

par(mfrow=c(1,3))
net=graph.adjacency(A, mode="undirected", weighted=TRUE, diag=FALSE)
plot(net)
net=graph.adjacency(Theta, mode="undirected", weighted=TRUE, diag=FALSE)
plot(net)
net=graph.adjacency(Z, mode="undirected", weighted=TRUE, diag=FALSE)
plot(net)