Random Forests in predicting wines

Updated 2014 September 17th to reflect changes in the R packages

Source http://mkseo.pe.kr/stats/?p=220.

Using Random Forests in predicting wines derived from three different cultivars. Download the wine data set from the Machine Learning Repository.

# install if necessary
install.packages("randomForest")
library(randomForest)
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

# load the wine dataset
# 178 samples x 14 columns: a 'class' label plus 13 chemical measurements (see str() below)
# NOTE(review): 'data' shadows the base function data(); a more specific name would be safer
data <- read.table("https://davetang.org/file/wine.csv", header=TRUE, sep=",")

# check out the data
str(data)
'data.frame':	178 obs. of  14 variables:
 $ class               : Factor w/ 3 levels "class_1","class_2",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
 $ malic_acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
 $ ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
 $ alcalinity_of_ash   : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
 $ magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
 $ total_phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
 $ flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
 $ nonflavanoid_phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
 $ proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
 $ colour_intensity    : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
 $ hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
 $ od280_od315         : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
 $ proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

# table of wine classes
table(data$class)

class_1 class_2 class_3 
     59      71      48

# hold out the first ten rows of each class (rows 1-10, 60-69, 131-140) for testing
test <- data[c(1:10, 60:69, 131:140), ]
# everything else (rows 11-59, 70-130, 141-178) becomes the training set
train <- data[-c(1:10, 60:69, 131:140), ]

# run the Random Forest classifier
# importance=TRUE keeps per-variable importance measures on the fitted object;
# do.trace=100 prints the OOB error estimate every 100 trees (output below)
r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100)
ntree      OOB      1      2      3
  100:   1.35%  0.00%  3.28%  0.00%
  200:   1.35%  0.00%  3.28%  0.00%
  300:   1.35%  0.00%  3.28%  0.00%
  400:   1.35%  0.00%  3.28%  0.00%
  500:   1.35%  0.00%  3.28%  0.00%

# check out the results
r

Call:
 randomForest(formula = class ~ ., data = train, importance = TRUE,      do.trace = 100) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 1.35%
Confusion matrix:
        class_1 class_2 class_3 class.error
class_1      49       0       0  0.00000000
class_2       1      59       1  0.03278689
class_3       0       0      38  0.00000000

names(r)
 [1] "call"            "type"            "predicted"       "err.rate"        "confusion"      
 [6] "votes"           "oob.times"       "classes"         "importance"      "importanceSD"   
[11] "localImportance" "proximity"       "ntree"           "mtry"            "forest"         
[16] "y"               "test"            "inbag"           "terms"

r$call
randomForest(formula = class ~ ., data = train, importance = TRUE, 
    do.trace = 100)

r$type
[1] "classification"

head(r$predicted)
     11      12      13      14      15      16 
class_1 class_1 class_1 class_1 class_1 class_1 
Levels: class_1 class_2 class_3

# two incorrect predictions
# as seen above
table(r$predicted == train$class)

FALSE  TRUE 
    2   146

head(r$err.rate)
            OOB    class_1   class_2    class_3
[1,] 0.12500000 0.17647059 0.1481481 0.00000000
[2,] 0.11458333 0.06451613 0.1666667 0.08695652
[3,] 0.15517241 0.14285714 0.2200000 0.06451613
[4,] 0.12195122 0.12820513 0.1538462 0.06250000
[5,] 0.11029412 0.11111111 0.1090909 0.11111111
[6,] 0.09558824 0.11111111 0.1090909 0.05555556

r$confusion
        class_1 class_2 class_3 class.error
class_1      49       0       0  0.00000000
class_2       1      59       1  0.03278689
class_3       0       0      38  0.00000000

head(r$votes)
     class_1     class_2     class_3
11 0.9892473 0.010752688 0.000000000
12 0.7752809 0.151685393 0.073033708
13 0.8901734 0.086705202 0.023121387
14 0.9768786 0.011560694 0.011560694
15 0.9900990 0.004950495 0.004950495
16 0.9890110 0.010989011 0.000000000

head(r$oob.times)
[1] 186 178 173 173 202 182

r$classes
[1] "class_1" "class_2" "class_3"

head(r$importance)
                        class_1      class_2     class_3 MeanDecreaseAccuracy MeanDecreaseGini
alcohol            0.1525347592 0.0928248591 0.023430629          0.093836483       13.7934327
malic_acid         0.0021385789 0.0002331466 0.014002376          0.004613423        1.8313973
ash               -0.0002340754 0.0010472105 0.003502219          0.001340635        0.8750481
alcalinity_of_ash  0.0185050050 0.0023801169 0.017335616          0.011531540        3.0038507
magnesium          0.0410753409 0.0055964424 0.006044563          0.017276572        3.1034223
total_phenols      0.0883397880 0.0026153845 0.043181163          0.041287427        5.7167077

head(r$importanceSD)
                       class_1      class_2     class_3 MeanDecreaseAccuracy
alcohol           0.0084789054 0.0059100068 0.003047730         0.0046735969
malic_acid        0.0011525786 0.0008391581 0.002548371         0.0008575549
ash               0.0007456213 0.0005977076 0.001228513         0.0004172596
alcalinity_of_ash 0.0026827364 0.0012745702 0.002734288         0.0014002414
magnesium         0.0042727455 0.0012352037 0.001371906         0.0017315456
total_phenols     0.0072416021 0.0012143907 0.004509257         0.0031221191

r$localImportance
NULL

r$proximity
NULL

r$ntree
[1] 500

r$mtry
[1] 3

names(r$forest)
 [1] "ndbigtree"  "nodestatus" "bestvar"    "treemap"    "nodepred"   "xbestsplit" "pid"        "cutoff"    
 [9] "ncat"       "maxcat"     "nrnodes"    "ntree"      "nclass"     "xlevels"

head(r$y)
     11      12      13      14      15      16 
class_1 class_1 class_1 class_1 class_1 class_1 
Levels: class_1 class_2 class_3

r$test
NULL

r$inbag
NULL

Analysis of the Area Under the Curve (AUC)

The rattle package provides a very nice GUI to the R workspace for conducting data analysis. Within it is a GUI for random forests, and below I follow the analysis steps that rattle performs for the analysis of the area under the curve (AUC).

#install if necessary
install.packages('pROC')
library(pROC)

#there is a test dataset within the pROC package,
#containing subarachnoid hemorrhage data
#which can be loaded by running
#data(aSAH)

#calculate the area under the curve (AUC)
## Default S3 method:
#roc(response, predictor, controls, cases, ...
#for the test dataset it would be
#roc(aSAH$outcome, aSAH$s100b)

#using flavanoids as my predictor, for class 1 and 2
#only the first two levels are used, the third is ignored
roc(data$class, data$flavanoids)

#Call:
#roc.default(response = data$class, predictor = data$flavanoids)
#
#Data: data$flavanoids in 59 controls (data$class class_1) > 71 cases (data$class class_2).
#Area under the curve: 0.8855

#calculate the AUC confidence interval for flavanoids
#for the test data set it would be
#ci.auc(aSAH$outcome, aSAH$s100b)

ci.auc(data$class, data$flavanoids)
#95% CI: 0.8266-0.9444 (DeLong)

#list the importance of the variables, rounded for readability
rn <- round(importance(r), 2)

#rank the variables by mean decrease in accuracy (column 4), highest first
rn[order(-rn[, 4]), ]
                     class_1 class_2 class_3 MeanDecreaseAccuracy MeanDecreaseGini
colour_intensity       17.12   18.52   16.99                24.64            17.09
flavanoids             16.94    9.98   19.09                22.77            15.43
alcohol                18.95   16.91    7.55                21.12            14.77
proline                19.40   12.02    7.80                21.00            13.36
od280_od315            14.44    9.14   16.30                18.14            12.74
hue                    10.30    6.60   12.38                13.76             6.79
total_phenols          10.74   -0.80    8.57                11.31             4.67
magnesium               9.90    5.95    4.42                10.63             2.86
alcalinity_of_ash       7.51    1.10    5.79                 8.77             2.62
proanthocyanins         4.76    1.23    6.93                 7.83             1.88
malic_acid              2.94    0.29    6.22                 6.26             2.03
nonflavanoid_phenols    4.08   -0.72    2.80                 4.57             0.92
ash                     0.91    2.03    1.48                 2.64             0.96

#plot the OOB ROC curve
#install if necessary
install.packages('verification')
library(verification)

#how good at predicting class_2?
#binary indicator for the positive class and its OOB vote fraction
is_class_2 <- as.integer(train$class=='class_2')
votes_class_2 <- r$votes[,2]
aucc <- roc.area(is_class_2, votes_class_2)$A
roc.plot(is_class_2, votes_class_2, main="")
legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc))
title(main="OOB ROC Curve Random Forest for class_2 wine.csv")

oob_roc_curve_class_2

Prediction

# predict classes for the held-out test set
data.predict <- predict(r, test)
# confusion table of observed vs predicted classes
# (named 'confusion' rather than 't' so base::t() is not masked)
confusion <- table(observed=test[,'class'], predict=data.predict)
# show each row as proportions of the observed class
prop.table(confusion, 1)
         predict
observed  class_1 class_2 class_3
  class_1     1.0     0.0     0.0
  class_2     0.0     0.9     0.1
  class_3     0.0     0.0     1.0

Does increasing the number of trees decrease the error rate?


r_1000 <- randomForest(class ~ ., data=train, importance=TRUE, ntree=1000)
r_1000

Call:
 randomForest(formula = class ~ ., data = train, importance = TRUE,      ntree = 1000) 
               Type of random forest: classification
                     Number of trees: 1000
No. of variables tried at each split: 3

        OOB estimate of  error rate: 1.35%
Confusion matrix:
        class_1 class_2 class_3 class.error
class_1      49       0       0  0.00000000
class_2       1      59       1  0.03278689
class_3       0       0      38  0.00000000

# predict on the test set with the 1000-tree forest
predict_1000 <- predict(r_1000, test)
# use <- for assignment, consistent with the rest of the file (was '=')
t_1000 <- table(observed=test[,'class'], predict=predict_1000)
# row-wise proportions: per observed class, fraction assigned to each prediction
prop.table(t_1000, 1)
         predict
observed  class_1 class_2 class_3
  class_1     1.0     0.0     0.0
  class_2     0.0     0.9     0.1
  class_3     0.0     0.0     1.0

# plot the error rates (OOB and per-class) as the number of trees grows
plot(r_1000, type="l")

random_forest_tree

Compared to a classification tree


library(tree)
# fit a single classification tree on 11 predictors
# (note: od280_od315 and proline are not included in this formula)
# fixed: '&lt;-' HTML entity restored to '<-' and stray '>' prompt removed
tree1 <- tree(class ~ alcohol + malic_acid + ash + alcalinity_of_ash + magnesium + total_phenols + flavanoids + nonflavanoid_phenols + proanthocyanins + colour_intensity + hue, data = data)
summary(tree1)

Classification tree:
tree(formula = class ~ alcohol + malic_acid + ash + alcalinity_of_ash + 
    magnesium + total_phenols + flavanoids + nonflavanoid_phenols + 
    proanthocyanins + colour_intensity + hue, data = data)
Variables actually used in tree construction:
[1] "flavanoids"       "colour_intensity" "malic_acid"       "alcohol"          "magnesium"       
Number of terminal nodes:  7 
Residual mean deviance:  0.07393 = 12.64 / 171 
Misclassification error rate: 0.01685 = 3 / 178 
plot(tree1)   # draw the fitted tree structure
text(tree1)   # add split and leaf labels to the plot

wine_data_classification_tree

Five characteristics were used to classify the three classes of wine, despite inputting 11 predictors, with a very low misclassification error rate.

Print Friendly, PDF & Email



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.
7 comments Add yours
    1. The votes are stored in the randomForest object. If your object is called r, then you can get the votes for each record via r$votes.

  1. I would like to generate a ROC from the test data in your wine example. I believe the ROC’s from training data are too optimistic. Can you provide the correct syntax in the following commands for doing this?

    aucc <- roc.area(as.integer(train$class=='class_2'), r$votes[,2])$A
    roc.plot(as.integer(train$class=='class_2'), r$votes[,2], main="")

    Thank you for your help,
    David

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.