LeeStrazicichUnitRootTestParallelization.R

#Additional option for the calculation of the critical value is available, but not used at the moment
ur.ls.bootstrap <- function(y, model = c("crash", "break"), breaks = 1, lags = NULL, method = c("GTOS","Fixed"), pn = 0.1, critval = c("bootstrap","theoretical"), print.results = c("print", "silent")){
  #Starttime
  starttime <- Sys.time()
  
  #Check sanity of the function call
  if (any(is.na(y))) 
    stop("\nNAs in y.\n")
  y <- as.vector(y)
  
  if(pn >= 1 || pn <= 0){
    stop("\n pn has to be between 0 and 1.")
  }
  if(method == "Fixed" && is.null(lags) == TRUE){
    stop("\n If fixed lag length should be estimated, the number 
         \n of lags to be included should be defined explicitely.")
  }
  
  #Add lagmatrix function
  lagmatrix <- function(x,max.lag){
    embed(c(rep(NA,max.lag),x),max.lag+1)
  }
  #Add diffmatrix function
  diffmatrix <- function(x,max.diff,max.lag){
    #Add if condition to make it possible to differentiate between matrix and vector                  
    if(is.vector(x) == TRUE ){
      embed(c(rep(NA,max.lag),diff(x,max.lag,max.diff)),max.diff)
    }
    
    else if(is.matrix(x) == TRUE){
      rbind(matrix(rep(NA,max.lag), ncol = ncol(x)), matrix(diff(x,max.lag,max.diff), ncol = ncol(x)))
    }
    #if matrix the result should be 0, if only a vector it should be 1
    else if(as.integer(is.null(ncol(x))) == 0 ){
      rbind(matrix(rep(NA,max.lag), ncol = ncol(x)), matrix(diff(x,max.lag,max.diff), ncol = ncol(x)))
      
    }
  }
  
  #Number of observations
  n <- length(y)
  model <- match.arg(model)
  lags <- as.integer(lags)
  method <- match.arg(method)
  breaks <- as.integer(breaks)
  #Percentage to eliminate endpoints in the lag calculation
  pn <- pn
  #Critical Values for the one break test
  model.one.crash.cval <- matrix(c(-4.239, -3.566, -3.211)
                                 , nrow = 1, ncol = 3, byrow = TRUE)
  
  colnames(model.one.crash.cval) <- c("1%","5%","10%")
  model.one.break.cval <- matrix(c(.1 , -5.11, -4.5, -4.21,
                                   .2 , -5.07, -4.47, -4.20,
                                   .3 , -5.15, -4.45, -4.18,
                                   .4 , -5.05, -4.50, -4.18,
                                   .5 , -5.11, -4.51, -4.17), nrow = 5, ncol = 4, byrow = TRUE)
  colnames(model.one.break.cval) <- c("lambda","1%","5%","10%")
  #   All critical values were derived in samples of size T = 100. Critical values
  #   in Model C (intercept and trend break) depend (somewhat) on the location of the
  #   break (λ = T_B /T) and are symmetric around λ and (1-λ). Model C critical values
  #   at additional break locations can be interpolated.
  #
  #   Critical values for the endogenous two break test  
  #   Model A - "crash" model  
  #   invariant to the location of the crash
  model.two.crash.cval <- matrix(c("LM_tau", -4.545, -3.842, -3.504,
                                   "LM_rho", -35.726, -26.894, -22.892), nrow = 2, ncol = 4, byrow = TRUE ) 
  
  colnames(model.two.crash.cval) <-  c("Statistic","1%","5%","10%")
  
  
  # Model C (i) - "break" model, breaks in the data generating process
  # Model C (i) - "break" model invariant to the location of the crash
  
  model.two.break.dgp.cval <- matrix(c("LM_tau", -5.823, -5.286, -4.989,
                                       "LM_rho", -52.550, -45.531, -41.663), nrow = 2, ncol = 4, byrow = TRUE ) 
  
  
  # Model C (ii) - "break" model, breaks are not considered in the data generating process
  # Model C (ii) - "break" model depends on the location of the crash
  ## highest level of list is the location of the second breakpoint - so the share inside 
  ## the matrix refers to the first breakpoint
  model.two.break.tau.cval <-  matrix(c( -6.16, -5.59, -5.27, -6.41, -5.74, -5.32, -6.33, -5.71, -5.33,
                                         NA    ,   NA,  NA  , -6.45, -5.67, -5.31, -6.42, -5.65, -5.32,
                                         NA    ,   NA,  NA  , NA   , NA   , NA   , -6.32, -5.73, -5.32)
                                      , nrow = 3, ncol = 9, byrow = TRUE )
  
  rownames(model.two.break.tau.cval) <- c("Break 1 - 0.2", "Break 1 - 0.4", "Break 1 - 0.6")
  colnames(model.two.break.tau.cval) <- c("Break 2 - 0.4 - 1%", "Break 2 - 0.4 - 5%", "Break 2 - 0.4 - 10%",
                                          "Break 2 - 0.6 - 1%", "Break 2 - 0.6 - 5%", "Break 2 - 0.6 - 10%",
                                          "Break 2 - 0.8 - 1%", "Break 2 - 0.8 - 5%", "Break 2 - 0.8 - 10%")
  
  
  model.two.break.rho.cval <-  matrix(c( -55.4 , -47.9, -44.0, -58.6, -49.9, -44.4, -57.6, -49.6, -44.6,
                                         NA    ,    NA,  NA  ,-59.3, -49.0, -44.3, -58.8, -48.7, -44.5,
                                         NA    ,    NA,  NA  , NA  , NA   , NA    ,-57.4, -49.8, -44.4)
                                      , nrow = 3, ncol = 9, byrow = TRUE )
  
  
  rownames(model.two.break.rho.cval) <- c("Break 1 - 0.2", "Break 1 - 0.4", "Break 1 - 0.6")
  colnames(model.two.break.rho.cval) <- c("Break 2 - 0.4 - 1%", "Break 2 - 0.4 - 5%", "Break 2 - 0.4 - 10%",
                                          "Break 2 - 0.6 - 1%", "Break 2 - 0.6 - 5%", "Break 2 - 0.6 - 10%",
                                          "Break 2 - 0.8 - 1%", "Break 2 - 0.8 - 5%", "Break 2 - 0.8 - 10%")
  #Number of observations to eliminate in relation to the sample length
  pnnobs <- round(pn*n)
  
  
  #Define the start values
  startl <- 0
  myBreakStart <- startl + 1 + pnnobs
  myBreakEnd <- n - pnnobs
  
  #Calculate Dy
  y.diff <- diffmatrix(y, max.diff = 1, max.lag = 1)
  
  #Calculation
  #trend for 1:n like in ur.sp 
  trend <- 1:n
  
  #Define minimum gap between the two possible break dates.
  #the gap is 2 in the crash case and 3 periods in the case of a break model
  gap <- 2 + as.numeric(model == "break")
  
  
  myBreaks <- matrix(NA, nrow = n - 2 * pnnobs, ncol =  breaks)
  if(breaks == 1){
    myBreaks [,1] <- myBreakStart:myBreakEnd
  } else if (breaks == 2){
    myBreaks[, 1:breaks] <- cbind(myBreakStart:myBreakEnd,(myBreakStart:myBreakEnd)+gap)
  }
  #Define the variables to hold the minimum t-stat
  tstat <- NA
  mint <- 1000
  tstat.result.matrix <- matrix(NA, nrow = n, ncol = n )
  
  #Create lists to store the results
  #Lists for the one break case
  result.reg.coef <- list()
  result.reg <- list()
  result.reg.resid <- list()
  result.matrix <- list()
  
  #Function to analyze the optimal lags to remove autocorrelation from the residuals
  #Lag selection with general to specific procedure based on Ng,Perron (1995)
  #ToDo make it possible, that zero lags are included
  myLagSelection <- function(y.diff, S.tilde, datmat, pmax, Dummy.diff){
    n <- length(y.diff)
    
    #               General to specific approach to determine the lag which removes autocorrelation from the residuals
    #               Ng, Perron 1995
    qlag <- pmax
    while (qlag >= 0) {
      
      #               Define optimal lags to include to remove autocorrelation from the residuals
      #               select p.value of the highest lag order and check if significant
      #test.coef <- coef(summary(lm(y.diff ~ 0 + lagmatrix(S.tilde,1)[,-1] + datmat[,-1][, 1:(qlag + 1)]  + Dummy.diff)))
      
      #lm.fit implementation
      test.reg.data <- na.omit(cbind(y.diff,lagmatrix(S.tilde,1)[,-1], datmat[,-1][, 1:(qlag + 1)], Dummy.diff))
      
      test.reg.lm. <-(lm.fit(x = test.reg.data[,-1], y = test.reg.data[, 1]))
      
      df.lm.fit <- length(test.reg.data[,1]) - test.reg.lm.$qr$rank
      sigma2 <- sum((test.reg.data[,1] - test.reg.lm.$fitted.values)^2)/df.lm.fit
      varbeta <- sigma2 * chol2inv(qr.R(test.reg.lm.$qr), size = ncol(test.reg.data) -2)
      SE <- sqrt(diag(varbeta))
      tstat <- na.omit(coef(test.reg.lm.))/SE
      pvalue <- 2* pt(abs(tstat), df = df.lm.fit, lower.tail =  FALSE)
      
      #                print(test.coef)
      #                print(paste("lm result:",qlag,test.coef[qlag + 1 , 4]))
      #                print(paste("lm.fit:",qlag,pvalue[qlag+1]))
      #               print(c("Number of qlag",qlag)) 
      if(pvalue[qlag+1] <= 0.1){
        slag <- qlag
        #                  print("break")
        break
      }
      qlag <- qlag - 1
      slag <- qlag
      
    }
    #            print(slag)
    return(slag)
  }
  
  # Function to calculate the test statistic and make the code shorter, because the function can be used in both cases for 
  # the one break as well as the two break case
  myLSfunc <- function(Dt, DTt, y.diff, est.function = c("estimation","results")){
    
    Dt.diff <- diffmatrix(Dt, max.diff = 1, max.lag = 1)
    
    DTt.diff <- diffmatrix(DTt, max.diff = 1, max.lag = 1)
    
    S.tilde <- 0
    S.tilde <- c(0, cumsum(lm.fit(x = na.omit(cbind(Dt.diff[,])), y=na.omit(y.diff))$residuals))
    S.tilde.diff <-  diffmatrix(S.tilde,max.diff = 1, max.lag = 1)
    #       Define optimal lags to include to remove autocorrelation
    #       max lag length pmax to include is based on Schwert (1989) 
    pmax <- min(round((12*(n/100)^0.25)),lags)
    
    #      Create matrix of lagged values of S.tilde.diff from 0 to pmax
    #      and check if there is autocorrelation if all these lags are included and iterate down to 1          
    datmat <- matrix(NA,n, pmax + 2)
    datmat[ , 1] <- S.tilde.diff
    #      Add column of 0 to allow the easy inclusion of no lags into the test estimation
    datmat[ , 2] <- rep(0, n)
    
    if(pmax > 0){
      datmat[, 3:(pmax + 2) ] <- lagmatrix(S.tilde.diff, pmax)[,-1]
      colnames(datmat) <- c("S.tilde.diff", "NoLags",  paste("S.tilde.diff.l",1:pmax, sep = ""))
    } else if(lags == 0){
      colnames(datmat) <- c("S.tilde.diff", "NoLags")
    }
    
    
    if(method == "Fixed"){
      slag <- lags
    } else if(method == "GTOS"){
      
      slag <- NA
      
      if(model == "crash"){
        slag <- myLagSelection(y.diff, S.tilde, datmat, pmax, Dt.diff)
        
      } else if(model == "break"){
        slag <- myLagSelection(y.diff, S.tilde, datmat, pmax, DTt.diff)
      }
    }
    
    S.tilde <- NA
    
    
    
    if(model == "crash"){
      S.tilde <- c(0, cumsum(lm.fit(x = na.omit(cbind(Dt.diff[,])), y=na.omit(y.diff))$residuals))
      S.tilde.diff <-  diffmatrix(S.tilde,max.diff = 1, max.lag = 1)
      
      #Add lag of S.tilde.diff to control for autocorrelation in the residuals
      if(est.function == "results"){
        break.reg <- summary(lm(y.diff ~ 0 + lagmatrix(S.tilde, 1)[,-1] + datmat[,2:(slag+2)]  + Dt.diff))
      } else if (est.function == "estimation"){
        #lm.fit() implementation
        roll.reg.data <- na.omit(cbind(y.diff, lagmatrix(S.tilde,1)[,-1], datmat[,2:(slag+2)], Dt.diff))
        
        roll.reg.lm. <- lm.fit(x = roll.reg.data[,-1], y = roll.reg.data[, 1])
        
        
        df.lm.fit <- length(roll.reg.data[, 1]) - roll.reg.lm.$qr$rank
        sigma2 <- sum((roll.reg.data[, 1] - roll.reg.lm.$fitted.values)^2)/df.lm.fit
        varbeta <- sigma2*chol2inv(qr.R(roll.reg.lm.$qr), size = ncol(roll.reg.data) - 2)
        SE <- sqrt(diag(varbeta))
        tstat.lm.fit <- na.omit(coef(roll.reg.lm.))/SE
        pvalue <- 2 * pt(abs(tstat.lm.fit),df = df.lm.fit, lower.tail =  FALSE)
        coef.roll.reg.lm <- cbind(na.omit(coef(roll.reg.lm.)), SE, tstat.lm.fit, pvalue)
        
        tstat.lm.fit <- tstat.lm.fit[1] 
        
        # tstat.lm <- coef(break.reg)[1,3]
        return(coef.roll.reg.lm)
      }
      
      # print(paste("lm.fit", tstat.lm.fit[1]))
      # print(paste("lm", tstat.lm))
      #print(roll.reg)
      if(est.function == "estimation"){
        return(coef.roll.reg.lm)
      } else if(est.function == "results"){
        return(break.reg)
      }
      
    } else if(model =="break"){
      S.tilde <- c(0, cumsum(lm.fit(x = na.omit(cbind(DTt.diff[,])), y=na.omit(y.diff))$residuals))
      S.tilde.diff <-  diff(S.tilde)
      
      
      #Add lag of S.tilde.diff to control for autocorrelation in the residuals
      if(est.function == "results"){
        break.reg <- summary(lm(y.diff ~ 0 + lagmatrix(S.tilde,1)[,-1] + datmat[,2:(slag+2)] + DTt.diff))
      } else if (est.function == "estimation"){
        #lm.fit() implementation
        roll.reg.data <- na.omit(cbind(y.diff, lagmatrix(S.tilde,1)[,-1], datmat[,2:(slag+2)], DTt.diff))
        
        roll.reg.lm. <- lm.fit(x = roll.reg.data[,-1], y = roll.reg.data[, 1])
        
        df.lm.fit <- length(roll.reg.data[, 1]) - roll.reg.lm.$qr$rank
        sigma2 <- sum((roll.reg.data[, 1] - roll.reg.lm.$fitted.values)^2)/df.lm.fit
        varbeta <- sigma2*chol2inv(qr.R(roll.reg.lm.$qr), size = ncol(roll.reg.data) -2)
        SE <- sqrt(diag(varbeta))
        tstat.lm.fit <- na.omit(coef(roll.reg.lm.))/SE
        pvalue <- 2 * pt(abs(tstat.lm.fit),df = df.lm.fit, lower.tail =  FALSE)
        coef.roll.reg.lm <- cbind(na.omit(coef(roll.reg.lm.)), SE, tstat.lm.fit, pvalue)
        
        tstat.lm.fit <- tstat.lm.fit[1] 
        return(coef.roll.reg.lm)
        # tstat.lm <- coef(break.reg)[1,3]
      }
      #Return Value
      #print(roll.reg)
      #print(paste("lm.fit", tstat.lm.fit))
      #print(paste("lm", tstat.lm))
      #print("break")
      if(est.function == "estimation"){
        return(coef.roll.reg.lm)
      } else if(est.function == "results"){
        return(break.reg)
      }
      
    }
    
    #   print(roll.reg)
    if(est.function == "estimation"){
      return(coef.roll.reg.lm)
    } else if(est.function == "results"){
      return(break.reg)
    }
    
  }
  
  
  # Start of the actual function call
  
  if(breaks == 1)
  {
    # Function to calculate the rolling t-stat
    # One Break Case
    # foreach implementation returns a list of the tstat and the corresponding break dates
    # 
    tstat.result.matrix <-  foreach(myBreak1 = myBreaks[,1], .combine = 'rbind') %dopar%{
      #Dummies for one break case
      Dt1 <-  as.matrix(cbind(trend, trend >= (myBreak1 + 1)))
      
      #       Dummy with break in intercept and in trend
      DTt1 <- as.matrix(cbind(Dt1, c(rep(0, myBreak1), 1:(n - myBreak1))))
      colnames(Dt1) <- c("Trend","D")
      colnames(DTt1) <- c("Trend","D","DTt")
      #print(paste("Break1: ",myBreak1, sep = ""))
      
      #Combine all Dummies into one big matrix to make it easier to include in the regressions
      
      Dt <- cbind(Dt1)
      DTt <- cbind(DTt1)
      
      result.reg[[myBreak1]] <- myLSfunc(Dt, DTt, y.diff, est.function = c("estimation"))
      
      
      # return the t.statistic for the considered break dates 
      tstat <- result.reg[[myBreak1]][1,3]
      tstat.result <- list(tstat,myBreak1)
      
    }#End of first for loop
  } else if(breaks == 2) {
    
    
    ## Two Break Case
    # Nested foreach loop to implement the parallelization for the two break case
    # Returns a complete list with tstat and the corresponding break dates
    # These returned matrix is then searched for the minimum tstat
    tstat.result.matrix <- foreach(myBreak1 = myBreaks[,1], .combine = 'rbind', .errorhandling = 'remove') %:%
      foreach(myBreak2 = myBreaks[which(myBreaks[,2] < myBreakEnd & myBreaks[,2] >= myBreak1 + gap),2]
              ,.combine = 'rbind', .errorhandling = 'remove') %dopar%{
                #Dummies for one break case
                Dt1 <-  as.matrix(cbind(trend, trend >= (myBreak1 + 1)))
                
                #       Dummy with break in intercept and in trend
                DTt1 <- as.matrix(cbind(Dt1, c(rep(0, myBreak1), 1:(n - myBreak1))))
                colnames(Dt1) <- c("Trend","D")
                colnames(DTt1) <- c("Trend","D","DTt")
                #print(paste("Break1: ",myBreak1, sep = ""))
                
                #Dummies for two break case
                Dt2 <-  as.matrix(trend >= (myBreak2 + 1))
                DTt2 <- as.matrix(cbind(Dt2, c(rep(0, myBreak2), 1:(n - myBreak2))))
                colnames(Dt2) <- c("D2")
                colnames(DTt2) <- c("D2","DTt2")
                #print(paste("Break2: ",myBreak2, sep = ""))
                
                #Combine all Dummies into one big matrix to make it easier to include in the regressions
                
                Dt <- cbind(Dt1, Dt2)
                DTt <- cbind(DTt1, DTt2)
                
                result.reg <- myLSfunc(Dt, DTt, y.diff, est.function = c("estimation"))
                
                
                # return the t.statistic for the considered break dates 
                tstat <- result.reg[1,3]
                tstat.result <- list(tstat,myBreak1,myBreak2) 
                
                
              }#End of nested foreach loops
    
  } else if(breaks > 2){
    
    print("Currently more than two possible structural breaks are not implemented.")
  }
  #Find index of minimum in the result matrix
  #Determine the breakpoints according to the minimum
  if(breaks== 2){
    mybestbreak1 <- as.integer(tstat.result.matrix[which.min(tstat.result.matrix),][2])
    mybestbreak2 <- as.integer(tstat.result.matrix[which.min(tstat.result.matrix),][3])
  } else if(breaks == 1){
    mybestbreak1 <- as.integer(tstat.result.matrix[which.min(tstat.result.matrix),][2])
  }
  #Find minimum tstat
  mint <- tstat.result.matrix[which.min(tstat.result.matrix),][1]
  #Estimate regression results, based on the determined breaks and the selected lag 
  # to obtain all necessary information
  Dt1 <-  as.matrix(cbind(trend, trend >= (mybestbreak1 + 1)))
  
  #       Dummy with break in intercept and in trend
  DTt1 <- as.matrix(cbind(Dt1, c(rep(0, mybestbreak1), 1:(n - mybestbreak1))))
  colnames(Dt1) <- c("Trend","D")
  colnames(DTt1) <- c("Trend","D","DTt")
  #print(paste("Break1: ",myBreak1, sep = ""))
  
  if(breaks == 2){
    #Dummies for two break case
    Dt2 <-  as.matrix(trend >= (mybestbreak2 + 1))
    DTt2 <- as.matrix(cbind(Dt2, c(rep(0, mybestbreak2), 1:(n - mybestbreak2))))
    colnames(Dt2) <- c("D2")
    colnames(DTt2) <- c("D2","DTt2")
    #print(paste("Break2: ",myBreak2, sep = ""))
    
    #Combine all Dummies into one big matrix to make it easier to include in the regressions
    
    Dt <- cbind(Dt1, Dt2)
    DTt <- cbind(DTt1, DTt2)
  } else if (breaks == 1){
    Dt <- Dt1
    DTt <- DTt1
    
  }
  
  
  break.reg <- myLSfunc(Dt, DTt, y.diff, est.function = c("results")) 
  
  endtime <- Sys.time()
  myruntime <- difftime(endtime,starttime, units = "mins")
  if(print.results == "print"){
  print(mint)
  print(paste("First possible structural break at position:", mybestbreak1))
  print(paste("The location of the first break - lambda_1:", round(mybestbreak1/n, digits = 1),", with the number of total observations:", n))  
  if(breaks == 2){
    print(paste("Second possible structural break at position:", mybestbreak2))
    print(paste("The location of the second break - lambda_2:", round(mybestbreak2/n, digits = 1),", with the number of total observations:", n))  
    
    
    # Output Critical Values    
    cat("Critical values:\n")
    print(model.two.break.tau.cval)
    
  }else if(breaks == 1){
    if(model == "crash"){
      cat("Critical values - Crash model:\n")
      print(model.one.crash.cval)
    } else if(model == "break"){
      cat("Critical values - Break model:\n")
      print(model.one.break.cval)
    }
    
  }
  
  if(method == "Fixed"){
    print(paste("Number of lags used:",lags))
  } else if(method == "GTOS"){
    print(paste("Number of lags determined by general-to-specific lag selection:" 
                ,as.integer(substring(unlist(attr(delete.response(terms(break.reg)), "dataClasses")[3]),9))-1))
  } 
  cat("Runtime:\n")
  print(myruntime)
  }
  # Create complete list with all the information and not only print it
  if(breaks == 2){
    results.return <- list(mint, mybestbreak1, mybestbreak2, myruntime)
    names(results.return) <- c("t-stat", "First break", "Second break", "Runtime")
  } else if(breaks == 1){
    results.return <- list(mint, mybestbreak1, myruntime)
    names(results.return) <- c("t-stat", "First break", "Runtime")
  }
  return(list(results.return, break.reg))
  }#End of ur.ls function