getwd()
setwd("/home/ychen64/data")
list.files()   # List files in current folder
forbes <- read.csv("Forbes2000.csv",header=T,stringsAsFactors = FALSE,na.strings ="NA",sep=",")
head(forbes)
str(forbes)
summary(forbes)

##################
# Missing values
##################
is.na(forbes$profits)
which(is.na(forbes$profits))
table(is.na(forbes$profits))
forbes[is.na(forbes$profits),]
mean(forbes$profits,na.rm=T)
forbes <- na.omit(forbes)
dim(forbes)
#Alternative method to deal with missing values
for(i in 1:nrow(forbes)){
  if(is.na(forbes$profits[i])==TRUE){
  forbes$profits[i] <- mean(forbes$profits, na.rm = TRUE)
  }
}
dim(forbes)
##################
# Subsetting Data
##################
# Find all companies with negative profit
forbes[forbes$profits < 0,c("name","sales","profits","assets")]
# Find three companies with largest sale vol.
companies <- forbes$name
companies <- forbes[,"name"] #same as above
order_sales <- order(forbes$sales, decreasing=T)
companies[order_sales[1:3]]
head(sort(forbes$sales,decreasing=T),n=3)
# Find the business category to which most of the Bermuda island companies belong.
Bermudacomp <- subset(forbes, country == "Bermuda")
table(Bermudacomp[,"category"]) #frequency table of categories
# Create another data frame with only numeric variables
forbes2 <- data.frame(sales=forbes$sale,profits=forbes$profits,assets=forbes$assets, mvalue=forbes$marketvalue)
str(forbes2)
# Or simply use indexing
forbes3 <- forbes[,c(5:8)]
str(forbes3)
##################
# Factors
##################
# Convert characters to (unordered) factors
forbes$country<-factor(forbes$country)
str(forbes)
table(forbes$country)
forbes$country[(forbes$country=="Bahamas")|(forbes$country=="Bermuda")|(forbes$country=="Brazil")|(forbes$country=="Cayman Islands")|(forbes$country=="Chile")|(forbes$country=="Panama/ United Kingdom")|(forbes$country=="Peru")]<-"Venezuela"
forbes$country[(forbes$country=="Austria")|(forbes$country=="Belgium")|(forbes$country=="Czech Republic")|(forbes$country=="Denmark")|(forbes$country=="Finland")|(forbes$country=="France")|(forbes$country=="Germany")|(forbes$country=="Greece")|(forbes$country=="Hungary")|(forbes$country=="Ireland")|(forbes$country=="Italy")|(forbes$country=="Luxembourg")|(forbes$country=="Netherlands")|(forbes$country=="Norway")|(forbes$country=="Poland")|(forbes$country=="Portugal")|(forbes$country=="Russia")|(forbes$country=="Spain")|(forbes$country=="Sweden")|(forbes$country=="Switzerland")|(forbes$country=="Turkey")|(forbes$country=="France/ United Kingdom")|(forbes$country=="United Kingdom/ Netherlands")|(forbes$country=="Netherlands/ United Kingdom")]<-"United Kingdom"
forbes$country[(forbes$country=="China")|(forbes$country=="Hong Kong/China")|(forbes$country=="Indonesia")|(forbes$country=="Japan")|(forbes$country=="Kong/China")|(forbes$country=="Korea")|(forbes$country=="Malaysia")|(forbes$country=="Philippines")|(forbes$country=="Singapore")|(forbes$country=="South Korea")|(forbes$country=="Taiwan")]<-"Thailand"
forbes$country[(forbes$country=="Africa")|(forbes$country=="Australia")|(forbes$country=="India")|(forbes$country=="Australia/ United Kingdom")|(forbes$country=="Islands")|(forbes$country=="Israel")|(forbes$country=="Jordan")|(forbes$country=="Liberia")|(forbes$country=="Mexico")|(forbes$country=="New Zealand")|(forbes$country=="Pakistan")|(forbes$country=="South Africa")|(forbes$country=="United Kingdom/ Australia")]<-"United Kingdom/ South Africa"
forbes$country<-droplevels(forbes$country)
table(forbes$country)
levels(forbes$country)<-c("Canada","East/Southeast Asia","Europe","Other","United States","Latin America")
levels(forbes$country)

# Export the Dataset after the Data Cleaning (Optional)
write.csv(forbes,"Forbes2000_clean.csv",row.names=FALSE)

# Import clean data
forbes <- read.csv("Forbes2000_clean.csv",header=T,stringsAsFactors = T,na.strings ="NA",sep=",")


# Subset 
forbes2 <- forbes[,c(3, 5:8)]
str(forbes2)

set.seed(1) #set random seed reproducible
indx <- sample(1:1995,size=1995,replace=F)
forbes.train <- forbes2[indx[1:1600],]
forbes.test <- forbes2[indx[1601:1995],]

########################### 
# regular models(untrained) 
########################### 

#MLR 
lm <- lm(marketvalue ~ ., data = forbes.train) 
summary(lm)
contrasts(forbes.train$country)
lm.yhat <- predict(lm, newdata = forbes.test) 
lm.y <- forbes.test["marketvalue"] 
lm.rmse <- sqrt(mean((lm.y - lm.yhat)^2)) 
lm.rmse 
lm.abs = abs(lm.y - lm.yhat) 
lm.mad = (sum(lm.abs))/395 
lm.mad 

# Backward selection 
library(leaps) 
bwd <- regsubsets(marketvalue ~ ., data = forbes.train,nvmax =4,method ="backward") 
summary(bwd) 
bwd.summary=summary(bwd) 
which.min(bwd.summary$bic) 
which.min(bwd.summary$cp) 
# Same RMSE and MAD as MLR

# Single tree 
library (rpart) 
rpart <- rpart(marketvalue ~ ., data = forbes.train,control = rpart.control(xval = 10, minbucket = 50))
rpart 
jpeg('rplot1%03d.jpg')
par(mfrow=c(1,1),xpd=NA,cex=1.5) 
plot(rpart,uniform=T) 
text(rpart,use.n=T) 
dev.off()
#prune the tree 
print(rpart$cptable) 
opt <- which.min(rpart$cptable[,"xerror"]) 
cp <- rpart$cptable[opt, "CP"] 
prune <- prune(rpart,cp = cp) 
prune 
jpeg('rplot2%03d.jpg')
par(mfrow=c(1,1),xpd=NA,cex=1.5) 
plot(prune,uniform=T) 
text(prune,use.n=T) 
dev.off()
dt.yhat <- predict(prune, newdata = forbes.test) 
dt.y <- forbes.test["marketvalue"] 
dt.rmse <- sqrt(mean((dt.y - dt.yhat)^2)) 
dt.rmse 
dt.abs = abs(dt.y - dt.yhat) 
dt.mad = (sum(dt.abs))/395 
dt.mad 

# Bagging tree 
library (randomForest) 
bag <- randomForest(marketvalue ~ ., data = forbes.train, importance =TRUE) 
bag.yhat <- predict(bag, newdata = forbes.test) 
bag.y <- forbes.test["marketvalue"] 
bag.rmse <- sqrt(mean((bag.y - bag.yhat)^2)) 
bag.rmse 
bag.abs = abs(bag.y - bag.yhat) 
bag.mad = (sum(bag.abs))/395 
bag.mad 
jpeg('rf%03d.jpg')
importance(bag) 
varImpPlot(bag)
dev.off()


##################
# Parallel setting 
##################
#Following only applies on PC or one compute node of cluster. For using more compute nodes, refer to our parallel computing in R trainings
#In the R, load library(doParallel)
library(doParallel)
# Find out how many cores are available
detectCores()
# Create cluster with desired number of cores, should be based on the result of detectCores()
cl <- makeCluster(16)
# Register cluster
registerDoParallel(cl)
# Find out how many cores are being used
getDoParWorkers()

############### 
# data training 
############### 

library(caret) 

# boostraping linear regression 
lmtrain <- train(marketvalue ~ ., data = forbes.train,method = "lm",trControl = trainControl(method = "boot",number = 10000)) 
lmtrain$times 
lmtrain.yhat <- predict(lmtrain, newdata = forbes.test) 
lm.y <- forbes.test["marketvalue"] 
lm.rmse <- sqrt(mean((lm.y - lmtrain.yhat)^2)) 
lm.rmse 
lm.abs = abs(lm.y - lmtrain.yhat) 
lm.mad = (sum(lm.abs))/395 
lm.mad 

# backward selection 
bwdtrain <- train(marketvalue ~ ., data = forbes.train,method = "leapBackward",trControl = trainControl(method = "cv",number = 10)) 
bwdtrain 
bwdtrain.yhat <- predict(bwdtrain, newdata = forbes.test) 
bwd.y <- forbes.test["marketvalue"] 
bwd.rmse <- sqrt(mean((bwd.y - bwdtrain.yhat)^2)) 
bwd.rmse 
bwd.abs = abs(bwd.y - bwdtrain.yhat) 
bwd.mad = (sum(bwd.abs))/395 
bwd.mad 

# single tree
treetrain <- train(marketvalue ~ ., data = forbes.train,method = "rpart",trControl = trainControl(method = "boot",number = 1000),tuneLength=10) 
dt.yhat <- predict(treetrain, newdata = forbes.test) 
dt.y <- forbes.test["marketvalue"] 
dt.rmse <- sqrt(mean((dt.y - dt.yhat)^2)) 
dt.rmse 
dt.abs = abs(dt.y - dt.yhat) 
dt.mad = (sum(dt.abs))/395 
dt.mad 

# bagging tree 
bagtrain <- train(marketvalue ~ ., data = forbes.train,method = "rf",tuneGrid = NULL,tuneLength = 3) 
#or use 10 CV
bagtrain <- train(marketvalue ~ ., data = forbes.train,method = "rf",trControl = trainControl(method = "cv",number = 10)) 
bagtrain 
bag.yhat <- predict(bagtrain, newdata = forbes.test) 
bag.y <- forbes.test["marketvalue"] 
dt.rmse <- sqrt(mean((bag.y - bag.yhat)^2)) 
dt.rmse 
bag.abs = abs(bag.y - bag.yhat) 
bag.mad = (sum(bag.abs))/395 
bag.mad 
jpeg('rf2%03d.jpg')
importance(bag) 
varImpPlot(bag) 
dev.off()




