Material comes from : https://github.com/StatQuest/roc_and_auc_demo/blob/master/roc_and_auc_demo.R
library(pROC) # install with install.packages("pROC")
library(randomForest) # install with install.packages("randomForest")
set.seed(420) # this will make my results match yours
num.samples <- 100
## genereate 100 values from a normal distribution with
## mean 172 and standard deviation 29, then sort them
weight <- sort(rnorm(n=num.samples, mean=172, sd=29))
## Now we will decide if a sample is obese or not.
## NOTE: This method for classifying a sample as obese or not
## was made up just for this example.
## rank(weight) returns 1 for the lightest, 2 for the second lightest, ...
## ... and it returns 100 for the heaviest.
So what we do is generate a random number between 0 and 1. Then we see if that number is less than \(rank/100\). So, for the lightest sample, \(rank = 1\). This sample will be classified “obese” if we get a random number less than \(1/100\). For the second lightest sample, \(rank = 2\), we get another random number between 0 and 1 and classify this sample “obese” if that random number is < \(2/100\). We repeat that process for all 100 samples.
obese <- ifelse(test=(runif(n=num.samples) < (rank(weight)/num.samples)),
yes=1, no=0)
## plot the data
plot(x=weight, y=obese)
## fit a logistic regression to the data...
glm.fit=glm(obese ~ weight, family=binomial)
lines(weight, glm.fit$fitted.values)
NOTE: By default, the graphs come out looking terrible The problem is that ROC graphs should be square, since the x and y axes both go from 0 to 1. However, the window in which I draw them isn’t square so extra white space is added to pad the sides.
roc(obese, glm.fit$fitted.values, plot=TRUE)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, plot = TRUE)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 0.8291
Now let’s configure R so that it prints the graph as a square.
par(pty = "s") ## pty sets the aspect ratio of the plot region. Two options:
## "s" - creates a square plotting region
## "m" - (the default) creates a maximal plotting region
roc(obese, glm.fit$fitted.values, plot=TRUE)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, plot = TRUE)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 0.8291
NOTE: By default, roc() uses specificity on the x-axis and the values
range from 1 to 0. This makes the graph look like what we would expect,
but the x-axis itself might induce a headache. To use
1-specificity (i.e. the False Positive Rate) on the
x-axis, set "legacy.axes" to TRUE
.
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, plot = TRUE, legacy.axes = TRUE)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 0.8291
If you want to rename the x and y axes…
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage")
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage")
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 82.91%
We can also change the color of the ROC line, and make it wider…
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#377eb8", lwd=4)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 82.91%
If we want to find out the optimal threshold we can store the data used to make the ROC graph in a variable…
roc.info <- roc(obese, glm.fit$fitted.values, legacy.axes=TRUE)
str(roc.info)
List of 15
$ percent : logi FALSE
$ sensitivities : num [1:101] 1 1 1 1 1 ...
$ specificities : num [1:101] 0 0.0222 0.0444 0.0667 0.0889 ...
$ thresholds : num [1:101] -Inf 0.0135 0.0325 0.0525 0.0702 ...
$ direction : chr "<"
$ cases : Named num [1:55] 0.128 0.133 0.159 0.25 0.278 ...
..- attr(*, "names")= chr [1:55] "9" "10" "12" "23" ...
$ controls : Named num [1:45] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
..- attr(*, "names")= chr [1:45] "1" "2" "3" "4" ...
$ fun.sesp :function (thresholds, controls, cases, direction)
$ auc : 'auc' num 0.829
..- attr(*, "partial.auc")= logi FALSE
..- attr(*, "percent")= logi FALSE
..- attr(*, "roc")=List of 15
.. ..$ percent : logi FALSE
.. ..$ sensitivities : num [1:101] 1 1 1 1 1 ...
.. ..$ specificities : num [1:101] 0 0.0222 0.0444 0.0667 0.0889 ...
.. ..$ thresholds : num [1:101] -Inf 0.0135 0.0325 0.0525 0.0702 ...
.. ..$ direction : chr "<"
.. ..$ cases : Named num [1:55] 0.128 0.133 0.159 0.25 0.278 ...
.. .. ..- attr(*, "names")= chr [1:55] "9" "10" "12" "23" ...
.. ..$ controls : Named num [1:45] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
.. .. ..- attr(*, "names")= chr [1:45] "1" "2" "3" "4" ...
.. ..$ fun.sesp :function (thresholds, controls, cases, direction)
.. ..$ auc : 'auc' num 0.829
.. .. ..- attr(*, "partial.auc")= logi FALSE
.. .. ..- attr(*, "percent")= logi FALSE
.. .. ..- attr(*, "roc")=List of 8
.. .. .. ..$ percent : logi FALSE
.. .. .. ..$ sensitivities: num [1:101] 1 1 1 1 1 ...
.. .. .. ..$ specificities: num [1:101] 0 0.0222 0.0444 0.0667 0.0889 ...
.. .. .. ..$ thresholds : num [1:101] -Inf 0.0135 0.0325 0.0525 0.0702 ...
.. .. .. ..$ direction : chr "<"
.. .. .. ..$ cases : Named num [1:55] 0.128 0.133 0.159 0.25 0.278 ...
.. .. .. .. ..- attr(*, "names")= chr [1:55] "9" "10" "12" "23" ...
.. .. .. ..$ controls : Named num [1:45] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
.. .. .. .. ..- attr(*, "names")= chr [1:45] "1" "2" "3" "4" ...
.. .. .. ..$ fun.sesp :function (thresholds, controls, cases, direction)
.. .. .. ..- attr(*, "class")= chr "roc"
.. ..$ call : language roc.default(response = obese, predictor = glm.fit$fitted.values, legacy.axes = TRUE)
.. ..$ original.predictor: Named num [1:100] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
.. .. ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
.. ..$ original.response : num [1:100] 0 0 0 0 0 0 0 0 1 1 ...
.. ..$ predictor : Named num [1:100] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
.. .. ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
.. ..$ response : num [1:100] 0 0 0 0 0 0 0 0 1 1 ...
.. ..$ levels : chr [1:2] "0" "1"
.. ..- attr(*, "class")= chr "roc"
$ call : language roc.default(response = obese, predictor = glm.fit$fitted.values, legacy.axes = TRUE)
$ original.predictor: Named num [1:100] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
$ original.response : num [1:100] 0 0 0 0 0 0 0 0 1 1 ...
$ predictor : Named num [1:100] 0.0129 0.0141 0.0508 0.0542 0.0862 ...
..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
$ response : num [1:100] 0 0 0 0 0 0 0 0 1 1 ...
$ levels : chr [1:2] "0" "1"
- attr(*, "class")= chr "roc"
and then extract just the information that we want from that variable.
roc.df <- data.frame(
tpp=roc.info$sensitivities*100, ## tpp = true positive percentage
fpp=(1 - roc.info$specificities)*100, ## fpp = false positive precentage
thresholds=roc.info$thresholds)
head(roc.df) ## head() will show us the values for the upper right-hand corner
## of the ROC graph, when the threshold is so low
## (negative infinity) that every single sample is called "obese".
## Thus TPP = 100% and FPP = 100%
now let’s look at the thresholds between TPP 60% and 80%… We can calculate the area under the curve…
roc.df[roc.df$tpp > 60 & roc.df$tpp < 80,]
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#377eb8", lwd=4, print.auc=TRUE)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 82.91%
or partial area under the curve.
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#377eb8", lwd=4, print.auc=TRUE, print.auc.x=45, partial.auc=c(100, 90), auc.polygon = TRUE, auc.polygon.col = "#377eb822")
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE, print.auc.x = 45, partial.auc = c(100, 90), auc.polygon = TRUE, auc.polygon.col = "#377eb822")
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Partial area under the curve (specificity 100%-90%): 4.727%
rf.model <- randomForest(factor(obese) ~ weight)
## ROC for random forest
roc(obese, rf.model$votes[,1], plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#4daf4a", lwd=4, print.auc=TRUE)
Call:
roc.default(response = obese, predictor = rf.model$votes[, 1], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#4daf4a", lwd = 4, print.auc = TRUE)
Data: rf.model$votes[, 1] in 45 controls (obese 0) > 55 cases (obese 1).
Area under the curve: 79.78%
par(pty = "s")
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#377eb8", lwd=4, print.auc=TRUE)
Call:
roc.default(response = obese, predictor = glm.fit$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
Data: glm.fit$fitted.values in 45 controls (obese 0) < 55 cases (obese 1).
Area under the curve: 82.91%
plot.roc(obese, rf.model$votes[,1], percent=TRUE, col="#4daf4a", lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=40)
legend("bottomright", legend=c("Logisitic Regression", "Random Forest"), col=c("#377eb8", "#4daf4a"), lwd=4)