# ---------------------------------------------------------------------------- # RDcompareLasso # # ---------------------------------------------------------------------------- # Revised on Nov 29, 2024 # EXPLANATION # RDcompareLasso function calculates RD using RDcompare function after eliminating non-important factors by performing Lasso. Library 'grpreg' is required for the calculation. RDcompareLasso function is especially suitable for cases where RDcompare function takes too much time for the calculation when the number of variables is larger than about 12. # INPUT PARAMETERS # A saturated model, which includes whole set of explanatory variables, should be specified in the formula. # OPTIONS # max.variable: Maximum number of variables that is used in RDcompare. The default number is 10. The calculation time increases if the number is too large. Compulsory factors are not counted in max.variable. # df.error: Minimum degree of freedom that is used in estimating variance for normal errors. This option should used if the number of parameters is larger than the number of observation. The minimum degree of freedom should be 8 in such cases. # noint: We should use noint=TRUE instead of using the form "-1 +", if we want to omit the intercept. The default setting is noint=FALSE. # max.ranking: Number of models shown in the ranking in the output. The default number is 50. # ShowModel: Number of models of which the parameters are shown in the output. The default number is 5. # Progress: Calculation progress is displayed if Progress=TRUE. The default setting is Progress=TRUE. # Compulsory: The factors that should be included compulsorily. If we specify Compulsory="Trap", for example, the factor "Trap" is always included in the model. All compulsory terms should be included also in the set of parameters selected by Lasso. The sequence of interaction should be alphabetical order in using RDcompareLasso. For example, please use "Age:Markers" instead of "Markers:Age". # --- Start of RDcompareLasso function --- RDcompareLasso <- function(formula, data=NULL, family="gaussian",link=idendity, max.variable=10, df.error=0, Progress=TRUE, Compulsory=NULL, weights=NULL, subset, ShowModel=5, max.ranking=50, noint=FALSE, na.action, singular.ok = TRUE, method = "qr", Chk.interaction=TRUE){ # -- Initialization -- start.time <- proc.time() library(grpreg) mf <- match.call() deparse.mf <- deparse(mf,width.cutoff=500) deparse.mf2 <- strsplit(deparse.mf,"~")[[1]][2] mf.arg <- unlist(strsplit(gsub(" +","",deparse.mf2),","), recursive = TRUE) mf.arg.n <- length(mf.arg) mf.arg[mf.arg.n] <- substring(mf.arg[mf.arg.n],1,nchar(mf.arg[mf.arg.n])-1) Lasso.X <- model.matrix(formula,data)[,-1] Lasso.group <- attr(model.matrix(formula,data),"assign")[-1] Lasso.Y <- model.response(model.frame(formula, data)) RDLassoData <<- data.frame(Lasso.Y,Lasso.X,data) mf.arg.others <- "data=RDLassoData" add.weights <- FALSE for(k in 2:mf.arg.n){ if(charmatch("data",mf.arg[k],nomatch = 0)>0) next if(charmatch("family",mf.arg[k],nomatch = 0)>0) next if(charmatch("max.variable",mf.arg[k],nomatch = 0)>0) next if(charmatch("df.error",mf.arg[k],nomatch = 0)>0) next if(charmatch("ShowModel",mf.arg[k],nomatch = 0)>0) next if(charmatch("max.ranking",mf.arg[k],nomatch = 0)>0) next if(charmatch("Compulsory",mf.arg[k],nomatch = 0)>0) next if(charmatch("Progress",mf.arg[k],nomatch = 0)>0) next if(charmatch("weights=",mf.arg[k],nomatch = 0)>0) add.weights <- TRUE if(charmatch("offset=",mf.arg[k],nomatch = 0)>0) add.weights <- TRUE mf.arg.others <- paste(mf.arg.others,mf.arg[k],sep=",") } mf.arg.others <- paste(mf.arg.others,")",sep="") # -- Calculation by grpreg -- Xstd <- scale(Lasso.X) grid <- 10^seq(10,-2,length=1000) lasso.weights <- NULL if(add.weights==TRUE){ char.weights <- as.character(eval(substitute(alist(weights)))) lasso.weights <- eval(parse(text=paste("data$",char.weights,sep=""))) } resStd <- grpreg(X=Xstd, y=Lasso.Y, group=Lasso.group, family=family, weights=lasso.weights, lambda=grid) coef_grpreg <- coef(resStd) freq_included <- apply(coef_grpreg,1,function(x) sum(abs(x)>0)) freq_included2 <- rbind(frequency=freq_included[-1],Lasso.group) freq_expvar <- aggregate(frequency~Lasso.group,data=t(freq_included2),mean) rownames(freq_expvar) <- labels(terms(formula,data=data)) order_expvar <- noquote(rownames(freq_expvar)[order(freq_expvar$frequency,decreasing=TRUE)]) model_selected <- paste(order_expvar[1:min(length(order_expvar),max.variable)],collapse = "+") # -- Estimation of error variance -- order_expvar_vector <- order_expvar[1:min(length(order_expvar),(nrow(data)-(df.error+(noint==FALSE))))] varmodel <- paste(order_expvar_vector,collapse = "+") mf.char <- paste("lm(Lasso.Y ~ ",varmodel,sep="") if (noint==TRUE) mf.char <- paste(mf.char,"-1 ",sep="") if (length(mf.arg.others)>0) mf.char <- paste(mf.char,mf.arg.others,sep=",") else mf.char <- paste(mf.char,")",sep="") ErrorVariance.lm <- eval(parse(text=mf.char)) phi <- sum(residuals(ErrorVariance.lm)^2)/df.residual(ErrorVariance.lm) # -- Parameters are passed to RDcompare function -- mf.arg.others <- "data=RDLassoData" phiC <- TRUE for(k in 2:mf.arg.n){ if(charmatch("data",mf.arg[k],nomatch = 0)>0) next if(charmatch("family",mf.arg[k],nomatch = 0)>0) {if(charmatch("gaussian",mf.arg[k],nomatch = 0)==0) phiC <- FALSE} if(charmatch("family",mf.arg[k],nomatch = 0)>0) {if(charmatch("quasi",mf.arg[k],nomatch = 0)>0) phiC <- TRUE} if(charmatch("max.variable",mf.arg[k],nomatch = 0)>0) next if(charmatch("df.error",mf.arg[k],nomatch = 0)>0) next mf.arg.others <- paste(mf.arg.others,mf.arg[k],sep=",") } if(phiC==TRUE) mf.arg.others <- paste(mf.arg.others,",phi=",phi,sep="") mf.arg.others <- paste(mf.arg.others,")",sep="") mf.char <- paste("RDcompare(Lasso.Y ~ ",model_selected,sep="") if (length(mf.arg.others)>0) mf.char <- paste(mf.char,mf.arg.others,sep=",") else paste(mf.char,")") cat(mf.char,"\n") eval(parse(text=mf.char)) cat("\n","elapsed time","\n") total.time <- proc.time()-start.time print(total.time) } # --- End of RDcompareLasso function --- # ---------------------------------------------------------------------------- # RDcompare # # ---------------------------------------------------------------------------- # Revised on July 22, 2020 # EXPLANATION # This function calculates RD for generalized linear models. # All combinations of explanatory variables are compared if the combinations belong to the hierarchical family of models. # The models are listed in the descending order of RD. # Original reference for RD: # Kohji Yamamura (2016) # Estimation of the predictive ability of ecological models # Communications in Statistics - Simulation and Computation, 45:2122-2144 # DOI: 10.1080/03610918.2014.889161 # INPUT PARAMETERS # Please specify the saturated model or the most complex model as the formula. # OPTIONS # max.ranking: Number of models shown in the ranking in the output. The default number of 50. # ShowModel: Number of models of which the parameters are shown in the output. The default number is 5. # Compulsory: The factors that should be included compulsorily. If we specify Compulsory="Trap", for example, the factor "Trap" is always included in the model. Notice that the compulsory factors should also be included in the model formula. # noint: We should use noint=TRUE instead of using the form "-1 +", if we want to omit the intercept. The default setting is noint=FALSE. # phi: Dispersion parameter. We should use this option if we already have a reliable estimate of dispersion parameter. # Progress: If Progress=TRUE, the progress of calculation is shown on the console window. Default setting is FALSE. # --- Start of RDcompare function --- RDcompare <- function(formula,family,data=NULL,weights,subset,phi,ShowModel=5, max.ranking=50, noint=FALSE, na.action, start = NULL, etastart, mustart, offset, control = list(...), method = "glm.fit", singular.ok = TRUE, contrasts = NULL, Compulsory=NULL, Progress=TRUE, Chk.interaction=TRUE){ Ranking <- max.ranking ShowModel <- ceiling(ShowModel) Log <- FALSE mff <- terms.formula(formula,data=data) org.model <- gsub("[[:blank:]]","",deparse(formula,width.cutoff=500)) depvar <- strsplit(org.model,"~")[[1]][1] fact.name <- attr(mff,"term.labels") fact.number.full <- length(fact.name) if(!is.null(Compulsory)){ fact.name <- fact.name[-which(fact.name %in% unlist(strsplit(Compulsory,"\\+")))] Msg <- "* All compulsory terms should be included also in the model.\n * Please enlarge the option max.variable if we are using LASSO or PLS. \n * Also, please note that the name of a compulsory interaction should be arranged with the same sequence with the main effect.\n * For example, if we used \"Month*Trap\" or \"Month+Trap+Trap:Month\" as the model formula,\n * we should use \"Month:Trap\" but should not use \"Trap:Month\" as the interaction term." if(length(unlist(strsplit(Compulsory,"\\+"))) > fact.number.full-length(fact.name) | length(fact.name)==0){stop(Msg)} } fact.number <- length(fact.name) model.number <- 2^fact.number result <- 0 mf <- match.call() deparse.mf <- deparse(mf,width.cutoff=500) deparse.mf2 <- strsplit(deparse.mf,"~")[[1]][2] mf.arg <- unlist(strsplit(gsub(" +","",deparse.mf2),","), recursive = TRUE) mf.arg.n <- length(mf.arg) mf.arg[mf.arg.n] <- substring(mf.arg[mf.arg.n],1,nchar(mf.arg[mf.arg.n])-1) mf.arg.others <- "" for(k in 2:mf.arg.n){ if(charmatch("link",mf.arg[k],nomatch = 0)>0) next if(charmatch("phi",mf.arg[k],nomatch = 0)>0) next if(charmatch("ShowModel",mf.arg[k],nomatch = 0)>0) next if(charmatch("max.ranking",mf.arg[k],nomatch = 0)>0) next if(charmatch("noint",mf.arg[k],nomatch = 0)>0) next if(charmatch("Compulsory",mf.arg[k],nomatch = 0)>0) next if(charmatch("Progress",mf.arg[k],nomatch = 0)>0) next if(charmatch("Chk.interaction",mf.arg[k],nomatch = 0)>0) next mf.arg.others <- paste(mf.arg.others,mf.arg[k],sep=",") } mf.arg.others <- paste(mf.arg.others,")",sep="") # Calculation for true model. indepvar <- paste(fact.name,collapse="+") if(!is.null(Compulsory)){ if(indepvar==""){org.model <- paste(depvar,"~",Compulsory,sep="")} else {org.model <- paste(depvar,"~",Compulsory,"+",indepvar,sep="")} }else{ org.model <- paste(depvar,"~",indepvar,sep="")} mf.char <- paste("glm(",org.model,sep="") if (noint==TRUE) mf.char <- paste(mf.char,"-1 ",sep="") mf.char <- paste(mf.char,mf.arg.others,sep="") if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} true.model <- eval(parse(text=mf.char)) # Calculation for null model. model <- paste(depvar," ~ 1 ",sep="") mf.char <- paste("glm(",model,sep="") mf.char <- paste(mf.char,mf.arg.others,sep="") if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} null.model <- eval(parse(text=mf.char)) # Creating all combinations of fact.name RD <- c() RSD <- c() k <- c() Model <- c() if(Progress==TRUE){ cat("0---20---40---60---80---100\n"); proglen1 <- 0 } for(i in 1:model.number){ if(Progress==TRUE){ proglen2 <- floor(100*i/model.number/4) if(proglen2 > proglen1){ for(progleni in 1:(proglen2-proglen1)){cat("*"); flush.console()} proglen1 <- proglen2} } if (Log==TRUE) cat("model.number=",i,"\n") flag <- 1 if (noint==FALSE) model <- paste(depvar," ~ 1 ",sep="") else model <- paste(depvar," ~ -1 ",sep="") if(!is.null(Compulsory)){ model <- paste(model,Compulsory,sep=" + ") model <- paste(gsub("\\+"," \\+ ",model)," ",sep="") } for(j in 1:fact.number){ if (((i-0.5)%/%(2^(fact.number-j)))%%2==1) { if(Chk.interaction==TRUE){ # Splitting the interaction into the elements split <- strsplit(fact.name[j],":") splitn <- length(split[[1]]) # Generating all partial combinations of elements splitm <- 2^splitn for(m in 1:splitm){ inter <- "" for(n in 1:splitn){ if (((m-0.5)%/%(2^(splitn-n)))%%2==1) inter <- ifelse(inter=="",inter <- split[[1]][n],paste(inter,":",split[[1]][n],sep=""))} # The flag is kept at 1 only if all partial combinations in the interaction are found in the previous part of the model. if ((inter!=fact.name[j]) && (inter!="")) { flag <- flag*ifelse(length(grep(paste(" ",inter," ",sep=""),model))>0,1,0) if (Log==TRUE) cat("flag=",flag,", try adding",fact.name[j],", cheking subfactor",inter,"\n") } } } model <- paste(model,"+ ",fact.name[j]," ",sep="") } } if (strsplit(model,"~")[[1]][2]==" -1 ") model <-paste(depvar," ~ 1 ",sep="") if (Log==TRUE) cat("model: ",model,"\n\n") # Calculation is performed for the candidate model if the flag remains 1 if (flag==1) { # Calculation for test model. mf.char <- paste("glm(",model,sep="") mf.char <- paste(mf.char,mf.arg.others,sep="") if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} test.model <- eval(parse(text=mf.char)) # Calculating RD and RSD RDcal <- RDcompare.cal(test.model,true.model,null.model,phi) RD <- c(RD,RDcal[1]) RSD <- c(RSD,RDcal[2]) k <- c(k,RDcal[3]) Model <- c(Model,model) } } if(Progress==TRUE) cat("\n") result <- data.frame(RD,RSD,k,Model) result <- result[order(result$RD,decreasing=TRUE),] rownames(result) <- c() cat("\n# RD ranking for the hierarchical family of models #\n") print(head(result,Ranking)) output <- list(Ranking=result) # Parameters of the best model. family <- true.model$family$family df <- true.model$df.residual if(missing(phi)==0) phi <- phi else phi <- if((family=="binomial")||(family=="poisson")) 1 else sum((true.model$weights*true.model$residuals^2)/df) if(ShowModel>0){ cat("\n# phi = ",phi," #\n"); cat("\n# Parameters of the best",ShowModel,"model #\n"); for(m in 1:min(ShowModel,length(RD))){ mf.char <- paste("glm(",result[m,4],sep="") mf.char <- paste(mf.char,mf.arg.others,sep="") if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} best.model <- summary.glm(eval(parse(text=mf.char)),dispersion=phi) cat("# Model",m,"#\n") print(coefficients(best.model)[,c(1,2)]) tmp <- list(coefficients(best.model)[,c(1,2)]) names(tmp) <- paste("Model",m,sep="_") output <- c(output,tmp) } } final <- output } # --- End of RDcompare function --- # ---------------------------------------------------------------------------- # RDcompareNB # # ---------------------------------------------------------------------------- RDcompareNB <- function(formula, data=NULL, weights, subset, ShowModel=5, max.ranking=50, noint=FALSE, na.action, start = NULL, etastart, mustart,offset, control = list(...), method = "glm.fit", contrasts = NULL, Compulsory=NULL){ library(MASS) Ranking <- max.ranking ShowModel <- ceiling(ShowModel) log <- FALSE mff <- terms.formula(formula,data=data) org.model <- gsub("[[:blank:]]","",deparse(formula,width.cutoff=500)) depvar <- strsplit(org.model,"~")[[1]][1] fact.name <- attr(mff,"term.labels") fact.number.full <- length(fact.name) if(!is.null(Compulsory)){ fact.name <- fact.name[-which(fact.name %in% unlist(strsplit(Compulsory,"\\+")))] Msg <- "* All compulsory terms should be included also in the model.\n * Please enlarge the option max.variable if we are using LASSO or PLS. \n * Also, please note that the name of a compulsory interaction should be arranged with the same sequence with the main effect.\n * For example, if we used \"Month*Trap\" or \"Month+Trap+Trap:Month\" as the model formula,\n * we should use \"Month:Trap\" but should not use \"Trap:Month\" as the interaction term." if(length(unlist(strsplit(Compulsory,"\\+"))) > fact.number.full-length(fact.name) | length(fact.name)==0){stop(Msg)} } fact.number <- length(fact.name) model.number <- 2^fact.number result <- 0 mf <- match.call() deparse.mf <- deparse(mf,width.cutoff=500) deparse.mf2 <- strsplit(deparse.mf,"~")[[1]][2] mf.arg <- unlist(strsplit(gsub(" +","",deparse.mf2),","), recursive = TRUE) mf.arg.n <- length(mf.arg) mf.arg[mf.arg.n] <- substring(mf.arg[mf.arg.n],1,nchar(mf.arg[mf.arg.n])-1) mf.arg.others <- "" for(k in 2:mf.arg.n){ if(charmatch("phi",mf.arg[k],nomatch = 0)>0) next if(charmatch("ShowModel",mf.arg[k],nomatch = 0)>0) next if(charmatch("max.ranking",mf.arg[k],nomatch = 0)>0) next if(charmatch("noint",mf.arg[k],nomatch = 0)>0) next if(charmatch("Compulsory",mf.arg[k],nomatch = 0)>0) next mf.arg.others <- paste(mf.arg.others,mf.arg[k],sep=",") } mf.arg.others <- paste(mf.arg.others,")",sep="") # Calculation for true model. indepvar <- strsplit(org.model,"~")[[1]][2] if(!is.null(Compulsory)){indepvar <- paste(Compulsory,indepvar,sep="+")} org.modelNB <- paste(depvar,"~",indepvar,sep="") mf.char <- paste("glm.nb(",org.modelNB,sep="") if (noint==TRUE) mf.char <- paste(mf.char,"-1 ",sep="") mf.char <- paste(mf.char,mf.arg.others,sep="") if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} true.model <- eval(parse(text=mf.char)) theta <- true.model$theta cat("# Negative binomial model: theta=",theta,"#\n") mf.char <- paste("RDcompare(",org.model,sep="") fam <- paste("family=negative.binomial(theta=",deparse(theta),")",sep="") mf.char <- paste(mf.char,fam,sep=",") mf.char <- paste(mf.char,",phi=1",sep="") for(k in 2:mf.arg.n){ mf.char <- paste(mf.char,mf.arg[k],sep=",") } if(nchar(gsub("[^\\(]","", mf.char)) > nchar(gsub("[^\\)]","", mf.char))) {mf.char <- paste(mf.char,")",sep="")} eval(parse(text=mf.char)) } # --- End of RDcompareNB function --- # ---------------------------------------------------------------------------- # RDcompare.cal # # ---------------------------------------------------------------------------- # The following function calculates RD for a specified model. RDcompare.cal <- function(test.model,true.model,null.model,phi){ family <- true.model$family$family df <- true.model$df.residual deviance <- test.model$deviance t.deviance <- true.model$deviance if(missing(null.model)) n.deviance <- true.model$null.deviance else n.deviance <- null.model$deviance k <- ncol(model.matrix(test.model)) if(missing(phi)==0) phi <- phi else phi <- if((family=="binomial")||(family=="poisson")) 1 else sum((true.model$weights*true.model$residuals^2)/df) RD <- 1-(deviance+2*phi*k)/(n.deviance+2*phi) RSD <- 1-(deviance-t.deviance+2*phi*k)/(n.deviance-t.deviance+2*phi) c(RD=RD,RSD=RSD,k=k) } # --- End of RDcompare.cal function ---