DNN_GPSCVS.Rmd

---
title: "DNN"
date: "Nov 14, 2018"
output: html_document
---

```{r setup, include=FALSE}
# Show chunk source code in the knitted output unless a chunk overrides it.
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown
```{r, echo=FALSE}

library(tidyverse)
library(data.table)
library(reshape2)
library(tictoc)
library(h2o)
library(smotefamily)
library(FNN) # to run k nearest for SMOTE FAMILT
library(gtools) # for combinations
library(caret)
library(sp)
library(rgdal)
library(rgeos)
library(foreach)
library(doParallel)




```



# PITNEY BOWES

```{r, Batch in the Pitney Bowes shapefile, echo=FALSE}

# Read the Pitney Bowes firm point layer from the project folder.
pb_shp <- readOGR(
  "c:/projects/CVS_GPS/DNN",
  "FirmSynthesisPoints_QTAssigned_totemp_LCC"
)

# Print whether the geometry is valid; the result is displayed, not stored.
rgeos::gIsValid(pb_shp)

# Attribute table of the firm points (joined to on firm_id further down).
pb_shp_df <- slot(pb_shp, "data")

```

# CVS and Data Munging

```{r, Batch in the CVS}
# Print up to 8 significant digits so coordinates stay readable in output.
options(digits = 8)

# CSD identifier lookup. The idea is that if a record is within the GGH or
# Trans area the buffer and variance will be lower. The first column of the
# file is dropped; the remaining columns are joined on `csduid` and supply
# the `GGH` flag.
csd_equiv <- read_csv("c:/projects/CVS_GPS/DNN/CSD_GGH.txt") %>%
  .[, -1]

# CVS tour records sent by Bryce, left-joined to the CSD lookup. Records whose
# CSD is missing from the lookup get GGH = 0.
cvs <- read_csv("c:/projects/CVS_GPS/DNN/cvs_tours_5.csv.gz") %>%
  merge(., csd_equiv, by = "csduid", all.x = TRUE)
cvs$GGH[is.na(cvs$GGH)] <- 0

# xtabs(~cfaf_group + cvs_comm_group, data = cvs)
# xtabs(~cvs_comm_group + sctg2, data = cvs)

# One CFAF group per tour ID (the minimum cfaf_group value within the tour).
cfaf_tour <- cvs %>%
  group_by(ID) %>%
  summarise(cfaf = min(cfaf_group))

# Point coordinates of every CVS record, with a running row number as ID.
# seq_len() is used instead of 1:nrow() so an empty table yields an empty
# sequence rather than c(1, 0).
xy <- cvs %>%
  subset(., select = c(latitude, longitude)) %>%
  transform(ID = seq_len(nrow(.)))
colnames(xy) <- c("Y", "X", "ID")
coordinates(xy) <- c("X", "Y")

# Raw coordinate matrix (kept for interactive inspection).
xydf <- xy@coords

# The CVS coordinates are declared as EPSG:4326 (longitude/latitude).
proj4string(xy) <- CRS("+init=epsg:4326")

# Re-project the CVS points into the projection of the Pitney Bowes layer and
# keep only the projected X/Y columns.
pb_proj <- pb_shp@proj4string
res <- as.data.frame(spTransform(xy, pb_proj)) %>%
  subset(., select = -c(ID))

# Attach the projected coordinates to the CVS table and build the spatial
# points data frame. seq_id is the row number used to address one buffer.
cvs <- cbind(cvs, res)
cvs_spdf <- SpatialPointsDataFrame(coords = res, data = cvs, proj4string = pb_proj)
cvs_spdf@data$seq_id <- seq_len(nrow(cvs_spdf@data))

# xtabs(~sctg2+cvs_comm_group, data=cvs)

```



```{r}

cvs_df <- cvs_spdf@data

# Buffer parameters, expressed in the map units of the Pitney Bowes
# projection (presumably metres -- TODO confirm).
buff_size_urban <- 250             # initial buffer width where GGH > 0
buff_size_rural <- 500             # initial buffer width where GGH == 0
variance <- -2                     # decay coefficient applied to scaled distance
increment <- 250                   # buffers grow in steps of increment / 2
max_buff_area <- pi * (4000^2)     # stop growing a buffer beyond this area

# Create all buffers at once, separately for urban and rural records.
cvs_spdf_urban <- cvs_spdf[cvs_spdf@data$GGH > 0, ]
bufferedPoints_urb <- gBuffer(cvs_spdf_urban, width = buff_size_urban, byid = TRUE)

cvs_spdf_rural <- cvs_spdf[cvs_spdf@data$GGH == 0, ]
bufferedPoints_rur <- gBuffer(cvs_spdf_rural, width = buff_size_rural, byid = TRUE)

# One parent buffer object holding both sets.
bufferedPoints <- rbind(bufferedPoints_urb, bufferedPoints_rur, makeUniqueIDs = TRUE)

tic("Build Gaussian Distances")

# Parallel backend: leave one core free, but never ask for fewer than one
# worker (single-core machines, or detectCores() returning NA).
cores <- detectCores()
cl <- makeCluster(max(1, cores[1] - 1, na.rm = TRUE))
registerDoParallel(cl)

# Reset the progress log; master output is diverted to it while the loop runs.
writeLines(c(""), "log.txt")
sink("log.txt")

# Every buffer is processed. The previous revision iterated over `1:1` only
# (a debugging leftover), which produced a single record. To debug, restrict
# this vector, e.g. buffer_ids <- 31214.
buffer_ids <- bufferedPoints@data$seq_id

# Each worker returns the one-row data frame of its CVS record, extended with
# one column per NAICS code when firms were found inside the buffer.
datalist <- tryCatch(
  foreach(i = buffer_ids, .packages = c("sp", "dplyr", "reshape2", "rgeos")) %dopar% {

    cat(paste("\n", "Starting iteration", i, "\n"), file = "log.txt", append = TRUE)

    # Buffer polygon around this CVS record and its attribute row.
    first <- bufferedPoints[bufferedPoints@data$seq_id == i, ]
    firstdf <- first@data

    # All Pitney Bowes firms inside the buffer.
    pb_in_poly <- pb_shp[first, ]
    numrows_pdf <- nrow(pb_in_poly@data)
    area_buff <- first@polygons[[1]]@area
    marginal_buff <- increment / 2

    # Grow the buffer until at least one firm is found or the area cap is hit.
    while (numrows_pdf < 1) {
      first <- gBuffer(first, width = marginal_buff, byid = TRUE)
      pb_in_poly <- pb_shp[first, ]

      numrows_pdf <- nrow(pb_in_poly@data)
      marginal_buff <- marginal_buff + increment / 2
      area_buff <- first@polygons[[1]]@area

      # Worker stdout is discarded, so progress is appended to the log file.
      cat(
        paste("\n", "iteration", i, "firms", numrows_pdf,
              "area", area_buff, "next width", marginal_buff, "\n"),
        file = "log.txt", append = TRUE
      )

      if (area_buff > max_buff_area) {
        break
      }
    }

    # Only if some firms were found in the buffer.
    if (numrows_pdf > 0) {

      # Firm attributes used for the weighting.
      pb_in_poly_df <- pb_in_poly@data %>%
        merge(., pb_shp_df, by = "firm_id", all.x = TRUE) %>%
        subset(., select = c(firm_id, csd_id.x, naics_code.x, size_categ.x, lat.y, lon.y, emp.x))
      colnames(pb_in_poly_df) <- c("firm_id", "csd_id", "naics_code", "size", "lat", "lon", "emp")

      # Repeat the CVS row once per firm so distances can be computed row-wise.
      first_df <- bind_rows(replicate(nrow(pb_in_poly_df), firstdf, simplify = FALSE)) %>%
        cbind(., pb_in_poly_df)

      # Distance-decayed, employment-weighted share of each firm. Coordinates
      # are scaled down by 10000 to avoid overflow.
      # NOTE(review): the weight is exp(variance * distance), an exponential
      # decay in distance rather than a squared-distance Gaussian kernel --
      # confirm this is the intended form.
      first_df$euc <- sqrt(((first_df$X - first_df$lon) / 10000)^2 +
                             ((first_df$Y - first_df$lat) / 10000)^2)
      first_df$var <- variance * first_df$euc
      first_df$gauss_dist <- exp(first_df$var)
      first_df$pct <- (first_df$emp * first_df$gauss_dist) /
        sum(first_df$emp * first_df$gauss_dist)

      # Sum the shares by NAICS code and spread them into one wide row.
      naics_pct <- first_df %>%
        group_by(naics_code) %>%
        summarise(naics_wt = sum(pct)) %>%
        transform(., id = first_df$ID[1]) %>%
        dcast(., id ~ naics_code, value.var = "naics_wt") %>%
        subset(., select = -c(id))

      firstdf <- cbind(firstdf, naics_pct)
    }

    # Explicit worker result (with or without NAICS columns).
    firstdf
  },
  finally = {
    # Always release the workers and restore console output, even on error.
    stopCluster(cl)
    sink()
  }
)

toc()

# One data frame of all buffers and their NAICS weights. NAICS codes absent
# from a buffer are filled with NA by rbindlist and then set to 0.
big_data1 <- rbindlist(datalist, fill = TRUE)
big_data1[is.na(big_data1)] <- 0


```

## Build the DNN dataset
```{r, Now build the DNN dataset}

# Collapse big_data1 to one row per tour ID: the tour attributes take the
# minimum value within the tour, each NAICS weight column is averaged over the
# tour's records, and urban_area is the maximum GGH flag of the tour.
dnn_df <- big_data1 %>%
  group_by(ID) %>%
  summarise(nw = min(nw), cvs_comm_group = min(cvs_comm_group), 
            sctg2 = min(sctg2),
            tour_dist = min(tour_dist), csduid = min(csduid), 
            n11 =  mean(`11`),   n23 =  mean(`23`),   n31 =  mean(`31-33`), 
            n44 =  mean(`44-45`),n48 =  mean(`48-49`),n52 =  mean(`52`),
            n53 =  mean(`53`),   n54 =  mean(`54`),   n56 =  mean(`56`),
            n62 =  mean(`62`),   n71 =  mean(`71`),   n72 =  mean(`72`),
            n81 =  mean(`81`),   n22 =  mean(`22`),   n41 =  mean(`41-42`),
            n51 =  mean(`51`),   n61 =  mean(`61`),   n21 =  mean(`21`),
            n55 =  mean(`55`),   n91 =  mean(`91-92`), urban_area = max(GGH))

# Get Tours that got no Pitney Bowes in the buffer. This is because the distance buffer maxed out before finding any firms. 
# These tours will be taken out before training.
# Columns 7:26 are the 20 NAICS columns (n11 ... n91) created by the summarise
# above; a row sum of 0 means no firm weight was found for the tour.
dnn_df$sum <- rowSums(dnn_df[, 7:26])
zeros <- subset(dnn_df, sum == 0)

dnn_df <- subset(dnn_df, sum != 0) %>%
  subset(., select = -c(sum))

# Save this interim step
write_csv(dnn_df, "c:/projects/CVS_GPS/DNN/dnn_df_variablebuffer.csv")


  
###########################
# The above dnn_df includes trucks that dont have a GPS on them as well. Because we want to apply this to the GPS, we are going to only keep records that have a GPS on the truck
# cvs_tracked ends up as one row per tour ID found in the tracked-vehicle file,
# with a constant flag tracked = 1.
cvs_tracked <- read_csv("c:/projects/CVS_GPS/DNN/cvs_tours_5_tracked_vehs.csv.gz") %>%
  group_by (ID) %>%
  summarise(count=n()) %>%
  transform(., tracked = 1) %>%
  subset(., select = c(ID, tracked)) 

# Keep only tours present in cvs_tracked (after the left join, untracked tours
# have tracked = NA and are dropped by the subset).
dnn_df <- merge(dnn_df, cvs_tracked, by="ID", all.x = TRUE) %>%
  subset(., tracked > 0) %>%
  subset(., select = -c(tracked))

#############################
# new tours that Bryce generated
cvs7 <- read_csv("C:/projects/cvs_gps/dnn/cvs_tours_7_all_vehs2.csv")
  
# get week flag
# One flag per tour: "T" when the minimum is_weekend value of the tour is 1,
# otherwise "F".
wk_flag <- cvs7 %>%
  group_by(ID) %>%
  summarise(week_flag = min(is_weekend)) %>%
  transform(., week_flag = ifelse(week_flag == 1, "T", "F"))

# Only keep the records that have a CFAF group
# NOTE(review): this re-reads the interim file written above and reassigns
# dnn_df, so the tracked-vehicle filter applied to dnn_df in between is not
# carried into the result -- confirm whether that is intended.
# The inner joins keep tours that have both a CFAF group and a week flag;
# drop_na() then removes rows with any missing value.
dnn_df <- read_csv("c:/projects/CVS_GPS/DNN/dnn_df_variablebuffer.csv") %>%
  merge(., cfaf_tour, by = "ID") %>% 
  merge(., wk_flag, by = "ID") %>%
  drop_na()

```



# Level 1 DNN NOW  

## Do Something about the *Semi-Known : Unknowns*  

There are around 12042 observations that had **unknown** as a commodity. Ignoring this in a tour-based setup is a challenge for it reduces the dataset by 25%.  

We can drop these from the final training, but I need to impute a label for two reasons:  

* First, augment the training dataset  
* Use it in the testing dataset and compare results against the imputation.  


```{r, Understand the Unknowns}

# Distribution of tours with a known commodity group across distance bands.
# Bands are 100 km wide up to 1500 km (bin codes 1-15); every tour longer than
# 1500 km falls into a single catch-all bin coded 16.
dist_breaks <- seq(0, 1500, by = 100)

known_tours <- subset(dnn_df, cvs_comm_group != 99)

# ifelse() drops the factor returned by cut(), leaving its integer bin code.
known_tours$km_bin <- ifelse(
  known_tours$tour_dist <= 1500,
  cut(known_tours$tour_dist, breaks = dist_breaks),
  16
)

# Tours per bin, plus the cumulative share of tours up to each bin.
bin_counts <- summarise(group_by(known_tours, km_bin), bin_cnt = n())
bin_kns <- transform(bin_counts, cumulative = cumsum(bin_cnt / sum(bin_cnt)))

ggplot(bin_kns, aes(x = km_bin, y = bin_cnt)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


```

Let's make representative records that belong to each group of distance bins. These will be used to sample a commodity group for the **UNKNOWN** commodity group. The process is as follows:  

* Calculate a representative for each commodity group by generating the means for all the continuous variables.  
* Standardize the NW and distance variables (centre each on its mean and divide by its variance) so that they are on a scale comparable with the other variables. This is important for we need to do a **Euclidean (L2)** calculation.  
* Calculate each of the **UNKNOWN** records distance to these representatives. 
* The distance acts as weights for sampling a CVS_COMMODITY_GROUP.  

```{r, Generate representative values across the segments for the Known Commodity Groups}

# Carry the distance-bin code over to the main data frame (bins 1-15 are the
# 100 km bands, 16 is everything beyond 1500 km).
dnn_df1 <- dnn_df
dnn_df1$km_bin <- ifelse(
  dnn_df1$tour_dist <= 1500,
  cut(dnn_df1$tour_dist, breaks = dist_breaks),
  16
)

# Representative record per known commodity group: the mean of every remaining
# column within the group. Identifier columns and the bin are left out.
dist_kns <- dnn_df1 %>%
  subset(., cvs_comm_group != 99, select = -c(ID, csduid, km_bin)) %>%
  group_by(cvs_comm_group) %>%
  summarise_all(mean)

# Mean and variance of the representatives' NW and tour distance.
m_nw <- mean(dist_kns$nw)
v_nw <- var(dist_kns$nw)
m_td <- mean(dist_kns$tour_dist)
v_td <- var(dist_kns$tour_dist)

# Centre NW and tour distance and divide by their variance.
# NOTE(review): dividing by the variance does not bound the values to
# [-1, 1]; confirm whether the standard deviation or a min-max scaling was
# intended.
dist_kns <- transform(
  dist_kns,
  nw = (nw - m_nw) / v_nw,
  tour_dist = (tour_dist - m_td) / v_td
)

# Number of representatives, and number of records labelled UNKNOWN (99).
rows_dist_kns <- nrow(dist_kns)
rows_dnn_unks <- nrow(dnn_df[which(dnn_df$cvs_comm_group == 99), ])

# Stack the representative table once per UNKNOWN record so that it lines up
# row-for-row with the expanded UNKNOWN table in the matrix operations.
rep_index <- rep(seq.int(1, rows_dist_kns), times = rows_dnn_unks)
dist_kns <- dist_kns[rep_index, 1:ncol(dist_kns)]

```


```{r, Now generate the CVS groups for the UNKNOWNS}

# Records whose commodity group is UNKNOWN (code 99), without the identifier,
# bin and label columns that are not part of the distance calculation.
dnn_unks <- dnn_df1 %>%
  subset(., cvs_comm_group == 99) %>%
  subset(., select = -c(csduid, km_bin, cvs_comm_group))

# Mean and variance of NW and tour distance among the UNKNOWN records.
# NOTE(review): these overwrite the values computed from the class
# representatives, so the two tables are scaled with different parameters --
# confirm this is intended.
m_nw <- mean(dnn_unks$nw)
v_nw <- var(dnn_unks$nw)
m_td <- mean(dnn_unks$tour_dist)
v_td <- var(dnn_unks$tour_dist)

# Centre NW and tour distance and divide by their variance.
dnn_unks <- dnn_unks %>%
  transform(., nw = (nw - m_nw) / v_nw) %>%
  transform(., tour_dist = (tour_dist - m_td) / v_td)

# Repeat every UNKNOWN record once per representative and sort on ID so that
# all copies of a record sit together, matching the stacked representatives.
dnn_unks1 <- dnn_unks[rep(seq.int(1, nrow(dnn_unks)), rows_dist_kns), 1:ncol(dnn_unks)] %>%
  .[order(.$ID), ]

# Squared differences over columns 2:23 of both tables, summed per row into
# euc_dist_final (the 23rd column of euc_dist).
euc_dist <- (dnn_unks1[, c(2:23)] - dist_kns[, c(2:23)])^2
euc_dist <- transform(euc_dist, euc_dist_final = rowSums(euc_dist))

# Attach the representative's commodity group and the distance to each copy.
temp_df <- cbind(dist_kns[, 1], euc_dist[, 23])
colnames(temp_df) <- c("cvs_comm_group", "euc_dist")
dnn_unks1 <- cbind(dnn_unks1, temp_df)

# SAMPLE COMMODITY GROUP
# The seed was previously assigned but never applied, so every run drew a
# different imputation. set.seed() makes the draw reproducible.
seed <- 1234
set.seed(seed)

# One commodity group is drawn per UNKNOWN record among groups coded below 97.
# NOTE(review): the draw is weighted by the distance itself, so groups that
# are farther from the record are more likely to be picked. If the nearest
# group should be the most likely, the weight needs inverting (for example
# 1 / euc_dist) -- confirm the intent before changing.
sampled_comm <- dnn_unks1 %>%
  subset(., select = c(ID, cvs_comm_group, euc_dist)) %>%
  subset(., cvs_comm_group < 97) %>%
  group_by(ID) %>%
  sample_n(., size = 1, weight = euc_dist) %>%
  subset(., select = -c(euc_dist))
colnames(sampled_comm) <- c("ID", "syn_comm_group")

ggplot(sampled_comm, aes(x = syn_comm_group)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Sampled commodities from the Euclidean Distances for the UNKNOWN commodities")

# Attach the sampled group by ID, use it wherever the original label is 99,
# and drop records labelled 97.
dnn_df1 <- merge(dnn_df1, sampled_comm, by = "ID", all.x = TRUE) %>%
  transform(., cvs_comm_group = ifelse(cvs_comm_group == 99, syn_comm_group, cvs_comm_group)) %>%
  subset(., cvs_comm_group != 97)

# The target becomes a factor; records that needed no imputation get
# syn_comm_group = 0.
dnn_df1$cvs_comm_group <- as.factor(dnn_df1$cvs_comm_group)
dnn_df1$syn_comm_group[is.na(dnn_df1$syn_comm_group)] <- 0


ggplot(dnn_df1, aes(x = cvs_comm_group)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Final commodities after including the synthesized ones")
```

# The BEST FOUR CLASS MODEL  

*This is an interim saving of the best four class model after running SMOTE. The test set does not include any SMOTE records and thus represents an original 20% sample from the processed CVS.* 

## Split and Group Data
We'll be creating a held-out test set from the processed data to evaluate our model against. Use createDataPartition() to split the data into two sets: 80% for training and 20% for testing. Since the outcome is categorical in nature, this will make sure that the distribution of outcome variable classes will be similar in both the sets. 


```{r}

# dnn_df1 <- dnn_df %>%
#   .[-c(1)] %>%
#   transform(., sctg2 = ifelse(sctg2 %in% c(-2,-3,99),99,sctg2)) %>%
#   subset(., !sctg2 %in% c(99,97))

# Drop the first four columns of dnn_df and the CFAF groups that are not
# modelled.
excluded_cfaf <- c("UNKNOWN", "NON_CARGO", "COAL")
dnn_df1 <- subset(dnn_df[-(1:4)], !(cfaf %in% excluded_cfaf))

# un <- dnn_df1 %>%
#   group_by(sctg2) %>%
#   summarise(c = n()) %>%
#   transform(., pct = (c/sum(.$c))*100)

# Record count and percentage share of every CFAF group.
cfaf_counts <- summarise(group_by(dnn_df1, cfaf), c = n())
un <- transform(cfaf_counts, pct = (c / sum(c)) * 100)

# #Spliting training set into two parts based on outcome: 75% and 25%
# dnn_df1 <- dnn_df %>%
#   .[-c(1)] %>%
#   subset(., sctg2 %in% c(5,12,35,23,42,6,39,27,26,31,33,34,32,3,41,24,7,36,43)) %>%
#   select(c(cvs_comm_group,sctg2), everything())
# 
# un <- dnn_df1 %>%
#   group_by(sctg2) %>%
#   summarise(c = n()) %>%
#   .[order(-.$c),]

# Derived columns: log tour distance, the target (a copy of cfaf) and a unit
# record weight. The target and the week flag are moved to the front.
dnn_df1 <- dnn_df1 %>%
  # transform(., weight = ifelse(new_grp == "FUELS",6,
  #                              ifelse(new_grp == "WASTE",6,
  #                                     ifelse(new_grp == "MNRLS", 6, 
  #                                            ifelse(new_grp == "AGRI",6,
  #                                                   ifelse(new_grp == "FRPAP",6,1)))))) %>%
  transform(., ln_dist = log(tour_dist), new_grp = cfaf, weight = 1) %>%
  subset(., select = -c(cfaf)) %>%
  select(c(new_grp, week_flag), everything())


##########################################################################
# Understand the distributions of NAICS for each CFAF class: column 1 is the
# target, columns 5:24 are summed within each class.
naics_cols <- c(1, 5:24)
naics_grp <- summarise_all(group_by(dnn_df1[, naics_cols], new_grp), sum)

# naics_grp_pct <- naics_grp 
# # %>%
# #   transform(., tot = rowSums(.[2:21]))
# rownames(naics_grp_pct) <- naics_grp_pct$new_grp
# naics_grp_pct <- as.matrix(naics_grp_pct[,2:ncol(naics_grp_pct)]) %>%
#   prop.table(.,1)
# naics_grp_pct <- as.data.frame(naics_grp_pct)
# rownames(naics_grp_pct) <- naics_grp$new_grp

# Long format for plotting: one row per (class, NAICS column).
naics_grp1 <- melt(naics_grp, id.vars = c("new_grp"))

rotated_x_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# NAICS profile of every class (one panel per class).
ggplot(naics_grp1, aes(x = variable, y = value)) +
  geom_bar(stat = "identity") +
  rotated_x_labels +
  facet_wrap(~new_grp, ncol = 4)

# Class profile of every NAICS column (one panel per column).
ggplot(naics_grp1, aes(x = new_grp, y = value)) +
  geom_bar(stat = "identity") +
  rotated_x_labels +
  facet_wrap(~variable, ncol = 4)

#################################################################

# Miscellaneous is virtually indistinguishable based on the two ggplots. Do a binary choice
# between it and the rest to see if that improves the fit

# transform(., new_grp = ifelse(!new_grp %in% c("MISC","BMETL","FOOD","OTHMF", "PLCHM"),"META1", "META")) %>%
# Placeholder for class weighting: both branches currently assign weight 1.
dnn_df1$weight <- ifelse(dnn_df1$new_grp %in% c("WASTE"), 1, 1)


# Get Test data before SMOTE.
# The partition index `idx` used to be expected in the workspace (the lines
# that create it were commented out), so a fresh session failed here. A saved
# index is reused when it exists; otherwise a seeded, class-stratified 20%
# partition is drawn so the split is reproducible.
if (!exists("idx")) {
  set.seed(1234)
  index <- createDataPartition(factor(dnn_df1$new_grp), p = 0.2, list = FALSE)
  idx <- as.data.frame(index)
}

# Test set: the sampled 20% of records. Column 1 (new_grp) is the target and
# is renamed to "class"; categorical predictors are converted to factors.
testSet <- dnn_df1[idx$Resample1, ]
colnames(testSet)[1] <- "class"
testSet$class <- as.factor(testSet$class)
testSet$week_flag <- as.factor(testSet$week_flag)
testSet$urban_area <- as.factor(testSet$urban_area)
testSet$csduid <- as.factor(testSet$csduid)

# Training set: every record not in the test partition. Synthetic records
# (SMOTE) are generated from this set only.
trainSet <- dnn_df1[-idx$Resample1, ]
colnames(trainSet)[1] <- "class"
trainSet$class <- as.factor(trainSet$class)
table(trainSet$class)

# RUN SMOTE
# genData = SMOTE(trainSet[,-c(1:2)],trainSet[,1], dup_size = 6, K=30)
# g <- genData$data
# table(g$class)
# genData_2 = SMOTE(g[,-c(25)],g[,25],dup_size = 3, K=30)
# g1 <- genData_2$data
# table(g1$class)
# genData_3 = SMOTE(g1[,-c(25)],g1[,25],dup_size = 3, K=30)
# g2 <- genData_3$data
# table(g2$class)
# genData_4 = SMOTE(g2[,-c(25)],g2[,25],dup_size = 1.5, K=30)
# g3 <- genData_4$data
# table(g3$class)
# genData_5 = SMOTE(g3[,-c(25)],g3[,25],dup_size = 4, K=30)
# g4 <- genData_5$data
# table(g4$class)
# genData_6 = SMOTE(g4[,-c(25)],g4[,25],dup_size = 4, K=30)
# g5 <- genData_6$data
# table(g5$class)
# genData_7 = SMOTE(g5[,-c(25)],g5[,25],dup_size = 4, K=30)
# g6 <- genData_7$data
# table(g6$class)
# genData_8 = SMOTE(g6[,-c(24)],g6[,24],dup_size = 12, K=15)
# g7 <- genData_8$data
# table(g7$class)
# genData_9 = SMOTE(g7[,-c(24)],g7[,24],dup_size = 9, K=15)
# g8 <- genData_9$data
# table(g8$class)
# genData_10 = SMOTE(g8[,-c(24)],g8[,24],dup_size = 9, K=15)
# g9 <- genData_10$data
# table(g9$class)
# genData_11 = SMOTE(g9[,-c(24)],g9[,24],dup_size = 8, K=15)
# g10 <- genData_11$data
# table(g10$class)
# genData_12 = SMOTE(g10[,-c(24)],g10[,24],dup_size = 8, K=15)
# g11 <- genData_12$data
# table(g11$class)
# genData_13 = SMOTE(g11[,-c(24)],g11[,24],dup_size = 7, K=15)
# g12 <- genData_13$data
# table(g12$class)
# genData_14 = SMOTE(g12[,-c(24)],g12[,24],dup_size = 7, K=15)
# g13 <- genData_14$data
# table(g13$class)
# genData_15 = SMOTE(g13[,-c(24)],g13[,24],dup_size = 7, K=15)
# g14 <- genData_15$data
# table(g14$class)
# genData_16 = SMOTE(g14[,-c(24)],g14[,24],dup_size = 6, K=15)
# g15 <- genData_16$data
# table(g15$class)
# genData_17 = SMOTE(g15[,-c(24)],g15[,24],dup_size = 2, K=6)
# g16 <- genData_17$data
# table(g16$class)
# genData_18 = SMOTE(g16[,-c(24)],g16[,24],dup_size = 1.5, K=6)
# g17 <- genData_18$data
# table(g17$class)
# genData_19 = SMOTE(g17[,-c(24)],g17[,24],dup_size = 3, K=6)
# g18 <- genData_19$data
# table(g18$class)
# genData_20 = SMOTE(g18[,-c(24)],g18[,24],dup_size = 3, K=6)
# g19 <- genData_20$data
# table(g19$class)


# Final training frame: target and week flag first, categorical columns
# stored as factors.
trainSet <- select(trainSet, c(class, week_flag), everything())
for (factor_col in c("class", "week_flag", "urban_area", "csduid")) {
  trainSet[[factor_col]] <- as.factor(trainSet[[factor_col]])
}


##################################################################

# Shared x-axis styling for the two class-count charts below.
vertical_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Class counts in the training data.
ggplot(trainSet, aes(x = class)) +
  geom_bar(stat = "count") +
  vertical_labels +
  ggtitle("Final commodities after including the synthesized ones")

# Class counts in the held-out test data.
ggplot(testSet, aes(x = class)) +
  geom_bar(stat = "count") +
  vertical_labels +
  ggtitle("Commodities in Test Data")


```




## Build the First Round of DNN  

```{r}
# Restart H2O so the model is trained on a fresh cluster. h2o.shutdown() raises
# an error when no cluster is connected (for example in a new R session); that
# case is reported and skipped instead of aborting the chunk.
tryCatch(
  h2o.shutdown(prompt = FALSE),
  error = function(e) {
    message("No running H2O cluster to shut down: ", conditionMessage(e))
  }
)
h2o.init(nthreads = 6, max_mem_size = "32g")

# Upload the training and test frames to the cluster.
train <- as.h2o(trainSet)
test <- as.h2o(testSet)

# Deep Learner
# model <- h2o.deeplearning(x=colnames(train[c(2,4,6:26)]), 
#                           y = "new_comm", 
#                           training_frame = train, 
#                           hidden = c(1500,500,300,100),
#                           activation = "RectifierWithDropout",
#                           weights_column = "weights",
#                           epochs=10,
#                           train_samples_per_iteration = -1,
#                           score_training_samples = 0,
#                           score_validation_samples=0,
#                           validation_frame = test)
# # CF
# h2o.confusionMatrix(model)
# 
# # Auto ML
# model1 <- h2o.automl(y="class", 
#                      training_frame = train[c(1:26)], 
#                      max_runtime_secs = 1200,
#                      nfolds = 5)
# 
# lb <- model1@leaderboard
# model_ids <- as.data.frame(model1@leaderboard$model_id)[,1]
# # Get the "All Models" Stacked Ensemble model
# se <- h2o.getModel(grep("XRT", model_ids, value = TRUE)[1])
# # variable importance
# h2o.varimp_plot(se)
# 
# print(model1@leaderboard)
# print(model1@leader)


# we have chosen the Distributed RF for further exploration

# GBM
# Gradient boosting model on columns 2:27 of the training frame with 5-fold
# stratified cross-validation. The object keeps the name model_rf because
# later chunks refer to it. The record weight is passed with its full argument
# name, weights_column, instead of relying on partial matching of `weights`.
model_rf <- h2o.gbm(
  model_id = "GBM_Attempt_CFAF_meta1",
  x = colnames(train[c(2:27)]),
  y = "class",
  training_frame = train,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  ntrees = 200,
  stopping_tolerance = 0.005,
  stopping_rounds = 0,
  sample_rate = 0.67,
  col_sample_rate = 0.6,
  max_depth = 150,
  min_rows = 3,
  nbins = 600,
  distribution = "multinomial",
  fold_assignment = "Stratified",
  weights_column = "weight",
  nbins_cats = 600
)

h2o.flow()



# lb <- model_rf@model$cross_validation_models[2]
#
# Score the test frame: the result holds the predicted class ("predict") and
# one probability column per class.
h2o_yhat_test <- h2o.predict(model_rf, test)

# Convert the H2O frame into a data frame.
df_yhat_test <- as.data.frame(h2o_yhat_test)

#################################################

# SAMPLE A CLASS INSTEAD OF SELECTING THE ONE WITH THE HIGHEST PROB
# For every test record, one class is drawn with probability proportional to
# its predicted class probability. The result list is preallocated and the
# loop uses seq_len() so an empty prediction frame is handled.
sampled_class <- vector("list", nrow(df_yhat_test))

for (i in seq_len(nrow(df_yhat_test))) {

  # Long format: one row per class with its probability in `value`.
  temp_df <- df_yhat_test[i, ] %>%
    melt(., id.vars = "predict")

  sampled_class[[i]] <- sample_n(tbl = temp_df, size = 1, weight = temp_df$value)

}

# One row per test record: the model's top class and the sampled class.
sampled_class1 <- rbindlist(sampled_class, fill = TRUE) %>%
  subset(., select = -c(value))
colnames(sampled_class1) <- c("predicted", "sampled")

testset_pred <- cbind(testSet, sampled_class1)

# Confusion matrices: actual class against sampled and against predicted class.
xtabs(~class + sampled, data = testset_pred)
xtabs(~class + predicted, data = testset_pred)

#################################################################
h2o.flow()



```


## Now classify META CLASS 100 

```{r}

# Process the dataframe: keep SCTG2 codes 24, 36 and 43, merge 24 and 43 into
# meta class 110, and give class 36 twice the record weight.
dnn_df2 <- dnn_df %>%
  .[-c(1)] %>%
  subset(., sctg2 %in% c(24, 36, 43)) %>%
  subset(., select = -c(nw)) %>%
  transform(., urban_area = ifelse(urban_area == 2, 1, urban_area)) %>%
  transform(., ln_dist = log(tour_dist)) %>%
  transform(., new_grp = ifelse(sctg2 %in% c(24, 43), 110, sctg2)) %>%
  subset(., select = -c(cvs_comm_group, csduid, sctg2)) %>%
  transform(., weight = ifelse(new_grp == 36, 2, 1)) %>%
  select(new_grp, everything())

# Record count per class.
un <- dnn_df2 %>%
  group_by(new_grp) %>%
  summarise(c = n())

# Get Test data before SMOTE.
# The label is numeric, and createDataPartition() bins numeric outcomes by
# quantile instead of treating them as classes. Passing a factor stratifies
# the 20% test sample by class.
index <- createDataPartition(as.factor(dnn_df2$new_grp), p = 0.2, list = FALSE)
testSet <- dnn_df2[index, ]
colnames(testSet)[1] <- "class"
testSet$class <- as.factor(testSet$class)

#################################################################
# generate new records using SMOTE on the Training set only
trainSet <- dnn_df2[-index, ]
colnames(trainSet)[1] <- "class"
trainSet$class <- as.factor(trainSet$class)
table(trainSet$class)


# RUN SMOTE on the predictors (all columns but the first) with the class as
# the target; the returned data carries the label in a column named "class".
genData <- SMOTE(trainSet[, -c(1)], trainSet[, 1], dup_size = 1.5, K = 15)
g <- genData$data
table(g$class)
# genData_2 = SMOTE(g[,-c(25)],g[,25],dup_size = 1.5, K=15)
# g1 <- genData_2$data
# table(g1$class)

# Replace the training set with the synthesized data, label first.
trainSet <- g %>%
  select(class, everything())
trainSet$class <- as.factor(trainSet$class)


##################################################################

ggplot(trainSet, aes(x = class)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Final commodities after including the synthesized ones")

ggplot(testSet, aes(x = class)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Commodities in Test Data")


#####################################################################

# Restart H2O. h2o.shutdown() raises an error when no cluster is connected;
# that case is reported and skipped instead of aborting the chunk.
tryCatch(
  h2o.shutdown(prompt = FALSE),
  error = function(e) {
    message("No running H2O cluster to shut down: ", conditionMessage(e))
  }
)
h2o.init(nthreads = 6)

train <- as.h2o(trainSet)
test <- as.h2o(testSet)


# GBM on columns 2:25 of the training frame with 5-fold stratified
# cross-validation. The weight column is passed with its full argument name
# instead of relying on partial matching of `weights`.
model_rf <- h2o.gbm(
  model_id = "GBM_Attempt1_meta100",
  x = colnames(train[c(2:25)]),
  y = "class",
  training_frame = train,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  ntrees = 95,
  stopping_tolerance = 0.005,
  stopping_rounds = 0,
  sample_rate = 1.0,
  max_depth = 40,
  min_rows = 5,
  nbins = 60,
  distribution = "multinomial",
  fold_assignment = "Stratified",
  weights_column = "weight"
)


h2o.flow()




```

## Now classify META CLASS 110 

```{r}

# Process the dataframe: keep SCTG2 codes 24 and 43 as the two classes and
# give class 24 twice the record weight.
dnn_df3 <- dnn_df %>%
  .[-c(1)] %>%
  subset(., sctg2 %in% c(24, 43)) %>%
  subset(., select = -c(nw)) %>%
  transform(., urban_area = ifelse(urban_area == 2, 1, urban_area)) %>%
  transform(., ln_dist = log(tour_dist)) %>%
  transform(., new_grp = sctg2) %>%
  subset(., select = -c(cvs_comm_group, csduid, sctg2)) %>%
  transform(., weight = ifelse(new_grp == 24, 2, 1)) %>%
  select(new_grp, everything())

# Record count per class.
un <- dnn_df3 %>%
  group_by(new_grp) %>%
  summarise(c = n())

# Get Test data before SMOTE.
# The label is numeric, and createDataPartition() bins numeric outcomes by
# quantile instead of treating them as classes. Passing a factor stratifies
# the 20% test sample by class.
index <- createDataPartition(as.factor(dnn_df3$new_grp), p = 0.2, list = FALSE)
testSet <- dnn_df3[index, ]
colnames(testSet)[1] <- "class"
testSet$class <- as.factor(testSet$class)

#################################################################
# generate new records using SMOTE on the Training set only
trainSet <- dnn_df3[-index, ]
colnames(trainSet)[1] <- "class"
trainSet$class <- as.factor(trainSet$class)
table(trainSet$class)


# RUN SMOTE on the predictors (all columns but the first) with the class as
# the target; the returned data carries the label in a column named "class".
genData <- SMOTE(trainSet[, -c(1)], trainSet[, 1], dup_size = 2, K = 15)
g <- genData$data
table(g$class)


# Replace the training set with the synthesized data, label first.
trainSet <- g %>%
  select(class, everything())
trainSet$class <- as.factor(trainSet$class)


##################################################################

ggplot(trainSet, aes(x = class)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Final commodities after including the synthesized ones")

ggplot(testSet, aes(x = class)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Commodities in Test Data")


#####################################################################

# Restart H2O. h2o.shutdown() raises an error when no cluster is connected;
# that case is reported and skipped instead of aborting the chunk.
tryCatch(
  h2o.shutdown(prompt = FALSE),
  error = function(e) {
    message("No running H2O cluster to shut down: ", conditionMessage(e))
  }
)
h2o.init(nthreads = 6)

train <- as.h2o(trainSet)
test <- as.h2o(testSet)


# GBM on columns 2:25 of the training frame with 5-fold stratified
# cross-validation. The weight column is passed with its full argument name
# instead of relying on partial matching of `weights`.
model_rf <- h2o.gbm(
  model_id = "GBM_Attempt1_meta110",
  x = colnames(train[c(2:25)]),
  y = "class",
  training_frame = train,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  ntrees = 95,
  stopping_tolerance = 0.005,
  stopping_rounds = 0,
  sample_rate = 1.0,
  max_depth = 40,
  min_rows = 5,
  nbins = 60,
  distribution = "multinomial",
  fold_assignment = "Stratified",
  weights_column = "weight"
)


h2o.flow()




```




## Cluster the individual SCTG2 using Cosine 

```{r}

# apply the model_rf and get the confusion matrix
# Only the first 13 rows and columns of the H2O confusion matrix are kept
# (assumes the model has 13 classes -- TODO confirm against the model that is
# in model_rf when this chunk runs).

cm <- h2o.confusionMatrix(model_rf, test) %>%
  .[1:13, c(1:13)]
# Row totals of the retained counts.
cm$tot <- rowSums(cm[,])

# compute percentages
# Every row is divided by its total; the total column is then dropped again.
cm1 <- cm/cm$tot 
cm1 <- cm1[,c(1:13)]


# Carry the row names over as a numeric `class` column placed first.
cm1$class <- rownames(cm1)
cm1 <- cm1 %>%
  select(class, everything())
cm1$class <- as.numeric(as.character(cm1$class))

# generate the combinations
# All ordered pairs (V1, V2) of distinct class codes.
char.var <- c(cm1$class)
df = as.data.frame(permutations(n=length(char.var), r=2, v=char.var))

# Now generate the clusters using confusion matrix
result <- c()

for (i in 1:nrow(df)) {
  # progress indicator every 10 pairs
  if(i %% 10==0) {
    print(i)
  }
  # make individual vectors and put them in list
  vec1 <- df[i,1]
  vec2 <- df[i,2]
  v <- c(vec1, vec2)
  v_chr <- c(as.character(vec1), as.character(vec2))
  
  # 2 x 2 slice of the confusion matrix for the two classes of this pair.
  # NOTE(review): the slice is taken from the count matrix `cm`, not from the
  # row percentages in `cm1` -- confirm which one was intended.
  slice_cm <- subset(cm, rownames(cm) %in% v) %>%
    select(., v_chr)
  
  # Cosine of the angle between the two rows of the slice.
  dot_prod <- sum(slice_cm[1,]*slice_cm[2,])
  norm_val <- sqrt(sum(slice_cm[1,]*slice_cm[1,]))*sqrt(sum(slice_cm[2,]*slice_cm[2,]))
  
  cos_theta <- dot_prod/norm_val
  result[[i]] <- cos_theta
  
}


# bind it all
# Missing similarities (for example a zero norm) are set to 0 in the column.
rdf <- as.data.frame(result)
rdf[is.na(rdf)] <- 0
# NOTE(review): `result` inside order() resolves to the global vector built in
# the loop, not to the data-frame column, so any missing values are still
# present there and are sorted last.
df <- cbind(df, rdf) %>%
  .[order(-result),]

# Pairs with a cosine similarity above 0.85, ordered by the second class.
df_cluster <- subset(df, result>0.85) %>%
  .[order(.$V2),]

# Wide matrix of similar pairs (V1 in rows, V2 in columns); pairs that are not
# present are filled with 0.
df_cluster1 <- dcast(df_cluster, V1 ~ V2)
df_cluster1[is.na(df_cluster1)] <- 0


```


### META CLASSES

As one can see from the cluster matrix (df_cluster1), that 43 is confused with most commodities. In this round we collapse the commodities that confuse with 43 in one, and keep the other 7 that don't as individuals.

```{r}

# Prepare the data: drop the ID column, fold SCTG2 codes -2, -3 and 99 into
# 99, and remove codes 99 and 97 (no empties and no unidentified commodities).
dnn_df1 <- dnn_df %>%
  .[-c(1)] %>%
  transform(., sctg2 = ifelse(sctg2 %in% c(-2, -3, 99), 99, sctg2)) %>%
  subset(., !sctg2 %in% c(99, 97)) %>%
  select(c(cvs_comm_group, sctg2), everything())

# Record count per SCTG2 code.
un <- dnn_df1 %>%
  group_by(sctg2) %>%
  summarise(c = n())

# Build the target: codes 34 and 43 form meta class 101, the eleven listed
# codes stay individual, and every other code goes to meta class 100.
dnn_df1 <- dnn_df1 %>%
  subset(., select = -c(nw)) %>%
  transform(., urban_area = ifelse(urban_area == 2, 1, urban_area)) %>%
  transform(., ln_dist = log(tour_dist)) %>%
  transform(., new_grp = ifelse(sctg2 %in% c(34, 43), 101,
                                ifelse(!sctg2 %in% c(3, 7, 24, 26, 27, 31:33, 36, 39, 41), 100, sctg2))) %>%
  subset(., select = -c(cvs_comm_group, csduid, sctg2)) %>%
  select(new_grp, everything())



# Record count per target class.
un <- dnn_df1 %>%
  group_by(new_grp) %>%
  summarise(c = n())

# CREATE TEST DATA
# The label is numeric, and createDataPartition() bins numeric outcomes by
# quantile instead of treating them as classes. Passing a factor stratifies
# the 20% test sample by class.
index <- createDataPartition(as.factor(dnn_df1$new_grp), p = 0.2, list = FALSE)
testSet <- dnn_df1[index, ]
colnames(testSet)[1] <- "class"
testSet$class <- as.factor(testSet$class)

#################################################################
# generate new records using SMOTE on the Training set only
trainSet <- dnn_df1[-index, ]
colnames(trainSet)[1] <- "class"
trainSet$class <- as.factor(trainSet$class)
table(trainSet$class)


# RUN SMOTE
# Ten successive SMOTE passes, each fed the output of the previous one, with
# the dup_size of every pass listed in order. The label column is located by
# name on every pass (it is first in trainSet and named "class" in SMOTE's
# output) instead of by a hard-coded position.
smote_dup_sizes <- c(7, 7, 6, 4.5, 4.5, 3, 3.5, 3.5, 3.5, 2.5)
smote_data <- trainSet
for (dup_size in smote_dup_sizes) {
  class_col <- which(names(smote_data) == "class")
  smote_data <- SMOTE(
    smote_data[, -class_col],
    smote_data[, class_col],
    dup_size = dup_size,
    K = 6
  )$data
  # Class balance after this pass.
  print(table(smote_data$class))
}

# Replace the training set with the synthesized data, label first.
trainSet <- smote_data %>%
  select(class, everything())
trainSet$class <- as.factor(trainSet$class)


# Restart H2O. h2o.shutdown() raises an error when no cluster is connected;
# that case is reported and skipped instead of aborting the chunk.
tryCatch(
  h2o.shutdown(prompt = FALSE),
  error = function(e) {
    message("No running H2O cluster to shut down: ", conditionMessage(e))
  }
)
h2o.init(nthreads = 3)

train <- as.h2o(trainSet)
test <- as.h2o(testSet)

# GBM with 5-fold stratified cross-validation.
# The predictors used to be columns 1:23, which listed the response "class"
# (column 1) as a predictor and left out the last column of the frame. Every
# column except the response is now used.
model_rf <- h2o.gbm(
  model_id = "GBM_Attempt1",
  x = setdiff(colnames(train), "class"),
  y = "class",
  training_frame = train,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  ntrees = 75,
  stopping_tolerance = 0.005,
  stopping_rounds = 0,
  sample_rate = 1.0,
  max_depth = 35,
  min_rows = 5,
  nbins = 50,
  distribution = "multinomial",
  fold_assignment = "Stratified"
)

h2o.flow()



```




# Old clustering using CF 

```{r}

# Overlap score for every class pair in df: take the rows of cm1 whose class
# is in the pair, keep the column-wise minimum across those rows, and sum the
# minima.
# NOTE(review): if df already carries a `result` column from an earlier chunk,
# that value is part of `frow` too and the cbind below adds a second `result`
# column -- confirm the state of df before running this chunk.
for (i in 1:nrow(df)) {
  
  frow <- as.numeric(as.vector(df[i, ]))
  df_1 <- cm1 %>%
    subset(., class %in% frow)

  result[[i]] <- sum(sapply(df_1[,c(2:ncol(df_1))],min))
  
}

# bind it all
# Attach the scores and sort the pairs from highest to lowest score.
rdf <- as.data.frame(result)
df <- cbind(df, rdf) %>%
  .[order(-result),]

# Now make groups. 
# Greedy grouping: repeatedly take the first remaining pair as a cluster and
# remove every pair that shares a class with it.
result1 <- c()
df_cp <- df
counter <- 1

# specify number of clusters
# NOTE(review): the loop always runs 22 times; once df_cp has no rows left the
# first row is empty and the stored pair is NA -- confirm 22 suits the number
# of classes.
for(j in 1:22) {
  print(j)

  # get the first row
  frow <- as.numeric(as.vector(df_cp[1,c(1:2)]))
  result1[[counter]] <- frow
  
  # Now remove all the rows that have any of the above classes 
  # because these cannot be grouped again
  df1 <- df_cp %>%
    subset(., !(V1 %in% frow | V2 %in% frow )) 
  
  # overwrite the copy so that the loop is getting information 
  # from the reduced dataframe always
  df_cp <- df1
  
  counter <- counter + 1

  
}

# One column per cluster, holding the two class codes of the selected pair.
clusters_df <- as.data.frame(result1)



```