#Author: Dr.R.Parvathi,Professor, School of Computing Science & Engineering, VIT Chennai
# Visualization of Decision Tree
#R version 3.6.2 (2019-12-12)
#RStudio version 1.2.1335
# Deliberately no `rm(list = ls())` here: it deletes whatever the user has in
# the workspace, yet leaves loaded packages and options untouched, so it does
# not provide a clean session. Restart R if a fresh session is needed.
# Dataset can be downloaded from https://onlinecourses.science.psu.edu/stat857/sites/onlinecourses.science.psu.edu.stat857/files/german_credit.csv
# Read the data file (looked up relative to the current working directory)
mydata <- read.csv("german_credit.csv")
# Preview the first rows instead of dumping the whole table to the console
head(mydata)
names(mydata)
# Check attributes of data
str(mydata)
# Check number of rows and columns
dim(mydata)
# Make dependent variable a factor (categorical)
mydata$Creditability <- as.factor(mydata$Creditability)

# Split data into training (70%) and validation (30%).
# The seed is fixed so that the random split (and everything computed from it
# further down) can be reproduced from run to run.
set.seed(123)
n_obs <- nrow(mydata)
dt <- sort(sample.int(n_obs, floor(0.7 * n_obs)))
train <- mydata[dt, ]
# setdiff() rather than `-dt`: a negative index built from an empty `dt`
# selects no rows at all instead of all rows.
val <- mydata[setdiff(seq_len(n_obs), dt), ]
# Check number of rows in training data set
nrow(train)
# Preview the training set (first rows only)
head(train)
# Decision Tree Model
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
# Fit a classification tree for Creditability on all other columns of the
# training set. Control settings: at least 20 observations before a split is
# attempted, at least 7 per terminal node, depth capped at 10, surrogate
# handling mode 2, and 10-fold cross-validation (fills the xerror column of
# the cp table used for pruning later).
mtree <- rpart(
  Creditability ~ .,
  data = train,
  method = "class",
  control = rpart.control(
    minsplit = 20,
    minbucket = 7,
    maxdepth = 10,
    usesurrogate = 2,
    xval = 10
  )
)
mtree

# Plot tree with the default base-graphics layout
plot(mtree)
text(mtree)

# Second rendering: narrow margins and clipping switched off (xpd = NA) so
# node labels may extend past the plot region. par() returns the previous
# values of the settings it changes; they are restored afterwards so that
# later plots (lift chart, ROC curve, ...) keep their normal margins.
old_par <- par(xpd = NA, mar = rep(0.7, 4))
plot(mtree, compress = TRUE)
text(mtree, cex = 0.7, use.n = TRUE, fancy = FALSE, all = TRUE)
par(old_par)


#Beautify tree

library(rpart.plot)

# View 1: red-shaded node boxes, full factor level names (faclen = 0) and
# extra = 1 node annotations
prp(
  mtree,
  extra = 1,
  cex = 0.8,
  box.palette = "Reds",
  faclen = 0
)
# View 2 - total count at each node.
# Node-label callback for prp(): takes the default label of every node and
# appends, after a blank line, "n = <x$frame$n>" for that node.
#   x       object whose `frame$n` holds the per-node counts
#   labs    default node labels
#   digits  unused; part of the callback signature
#   varlen  unused; part of the callback signature
# Returns a character vector with one label per node.
tot_count <- function(x, labs, digits, varlen) {
  node_sizes <- x$frame$n
  count_text <- paste("n =", node_sizes)
  paste(labs, count_text, sep = " \n\n")
}
# Draw the tree again, labelling each node through tot_count()
prp(mtree, node.fun = tot_count, cex = 0.8, faclen = 0)

# Show the complexity-parameter table of the fitted tree
printcp(mtree)

# Take the CP value from the row of the cp table with the smallest
# cross-validated error (xerror)
bestcp <- local({
  cp_table <- mtree$cptable
  cp_table[which.min(cp_table[, "xerror"]), "CP"]
})

# Prune the tree using the best cp.
pruned <- prune(mtree, cp = bestcp)

# Plot pruned tree
prp(
  pruned,
  extra = 1,
  cex = 0.8,
  faclen = 0,
  box.palette = "Blues"
)
# Confusion matrix on the training data: observed class (rows) against the
# class predicted by the pruned tree (columns)
conf.matrix <- table(
  train$Creditability,
  predict(pruned, type = "class")
)
# Prefix the labels so the two margins cannot be mixed up in the printout
rownames(conf.matrix) <- paste0("Actual:", rownames(conf.matrix))
colnames(conf.matrix) <- paste0("Pred:", colnames(conf.matrix))
print(conf.matrix)
#Scoring
library(ROCR)
# Class probabilities for the validation set from the pruned tree
val1 <- predict(pruned, newdata = val, type = "prob")

# Storing model performance scores: second probability column as the score,
# observed Creditability as the label
pred_val <- prediction(predictions = val1[, 2], labels = val$Creditability)

# Calculating area under curve
perf_val <- performance(pred_val, measure = "auc")
perf_val

# Plotting lift curve (lift against rate of positive predictions)
plot(
  performance(pred_val, measure = "lift", x.measure = "rpp"),
  colorize = TRUE
)

# Calculating true positive and false positive rate
# (perf_val is reused and now holds the ROC data)
perf_val <- performance(pred_val, measure = "tpr", x.measure = "fpr")

# Plot the ROC curve
plot(perf_val, col = "green", lwd = 1.5)

# Calculating KS statistic: largest gap between the y values (tpr) and the
# x values (fpr) of the ROC performance object
ks1.tree <- max(perf_val@y.values[[1]] - perf_val@x.values[[1]])
ks1.tree


library(randomForest)
# Random forest with 200 trees on the training set, keeping variable
# importance and the proximity matrix.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
rf50 <- randomForest(
  Creditability ~ .,
  data = train,
  ntree = 200,
  importance = TRUE,
  proximity = TRUE
)
plot(rf50, main = "")
rf50

# Predicted classes for the validation set, cross-tabulated against the
# observed classes (rows = predicted, columns = observed)
Test50_rf_pred <- predict(rf50, val, type = "class")
table(Test50_rf_pred, val$Creditability)

# Variable importance: numeric table and dot chart
importance(rf50)
varImpPlot(rf50, main = "", cex = 0.8)

# CART model fitted on the full data set.
# The response is referred to by column name so that it is taken from `data`,
# just like the predictors behind `.`; writing `mydata$Creditability` in the
# formula always reads the global object, whatever is passed as `data`.
# NOTE(review): the name `latlontree` looks carried over from another example;
# it is kept in case code outside this section refers to it.
latlontree <- rpart(Creditability ~ ., data = mydata)
# Plot the tree using prp command defined in rpart.plot package
prp(latlontree)
# Refit with minbucket = 50 (forwarded to rpart.control) and draw the result
# with base graphics
latlontree <- rpart(Creditability ~ ., data = mydata, minbucket = 50)
plot(latlontree)
text(latlontree)



