Predicting cancer

So far I’ve come across four machine learning methods: random forests, classification trees, hierarchical clustering and k-means clustering. Here I apply all four of these methods (plus SVMs) to predicting cancer, or more specifically to classifying malignant tumours, using the Wisconsin breast cancer dataset.

wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
cat breast-cancer-wisconsin.data | head -5
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
cat breast-cancer-wisconsin.names | tail -25 | head -16
7. Attribute Information: (class attribute has been moved to last column)

   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

Data munging

data <- read.table("breast-cancer-wisconsin.data",header=F,sep=",",stringsAsFactors=F)
head(data)
       V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
1 1000025  5  1  1  1  2  1  3  1   1   2
2 1002945  5  4  4  5  7 10  3  2   1   2
3 1015425  3  1  1  1  2  2  3  1   1   2
4 1016277  6  8  8  1  3  4  3  7   1   2
5 1017023  4  1  1  3  2  1  3  1   1   2
6 1017122  8 10 10  8  7 10  9  7   1   4
dim(data)
[1] 699  11

#how many benign (2) and how many malignant (4)?
table(data$V11)

  2   4 
458 241

names(data) <- c('id','ct','ucsize','ucshape','ma','secs','bn','bc','nn','miti','class')
head(data)
       id ct ucsize ucshape ma secs bn bc nn miti class
1 1000025  5      1       1  1    2  1  3  1    1     2
2 1002945  5      4       4  5    7 10  3  2    1     2
3 1015425  3      1       1  1    2  2  3  1    1     2
4 1016277  6      8       8  1    3  4  3  7    1     2
5 1017023  4      1       1  3    2  1  3  1    1     2
6 1017122  8     10      10  8    7 10  9  7    1     4

#clean up the data
require(stringr)
#remove any whitespace
data <- t(apply(data, 1, function(x) {str_replace(x, "\\s+", "")}))
#missing values are coded as "?"; replace any entry containing a non-digit with NA
data <- t(apply(data, 1, function(x) {str_replace(x, "\\D", NA)}))
#I'm not sure what's the best way to deal with NAs
#so I'll just remove them
data <- na.omit(data)
#lost a few data points
dim(data)
[1] 683  11
#but everything is converted into characters
head(data)
  id        ct  ucsize ucshape ma  secs bn   bc  nn  miti class
1 "1000025" "5" "1"    "1"     "1" "2"  "1"  "3" "1" "1"  "2"  
2 "1002945" "5" "4"    "4"     "5" "7"  "10" "3" "2" "1"  "2"  
3 "1015425" "3" "1"    "1"     "1" "2"  "2"  "3" "1" "1"  "2"  
4 "1016277" "6" "8"    "8"     "1" "3"  "4"  "3" "7" "1"  "2"  
5 "1017023" "4" "1"    "1"     "3" "2"  "1"  "3" "1" "1"  "2"  
6 "1017122" "8" "10"   "10"    "8" "7"  "10" "9" "7" "1"  "4"
data <- as.data.frame(data, stringsAsFactors=F)
#but they're still characters!
sapply(data,mode)
         id          ct      ucsize     ucshape          ma        secs          bn          bc          nn 
"character" "character" "character" "character" "character" "character" "character" "character" "character" 
       miti       class 
"character" "character"
#transform them back to numeric
#define a helper function
to_numeric <- function(x) as.numeric(as.character(x))
#apply it to every column while keeping the data frame structure
data <- modifyList(data, lapply(data, to_numeric))
sapply(data,mode)
       id        ct    ucsize   ucshape        ma      secs        bn        bc        nn      miti 
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
    class 
"numeric" 

Hierarchical clustering


#install if necessary
install.packages("sparcl")
library(sparcl)
hc <- hclust(dist(data[,2:10]))
ColorDendrogram(hc,y=data$class,branchlength=5)

[Figure: coloured dendrogram]

Most of the benign (green) and malignant (cyan) samples cluster together.
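
To put a number on that impression, we can cut the dendrogram into two clusters and cross-tabulate them against the known classes (a quick sketch using cutree() from base R; note that the class labels are still the original 2/4 codes at this point):

#cut the tree into two clusters and compare with the known classes
hc_cluster <- cutree(hc, k=2)
table(hc_cluster, data$class)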

K-means clustering

fit <- kmeans(data[,c(2:10)], 2)
names(fit)
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"

#k-means did a fairly good job
table(data.frame(fit$cluster,data[,11]))
           data...11.
fit.cluster   2   4
          1   9 221
          2 435  18
(435 + 221) / (435 + 18 + 9 + 221)
[1] 0.9604685
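
Note that k-means numbers its clusters arbitrarily (here cluster 1 happened to pick up the malignant samples), so a label-agnostic way to get the same accuracy is to score both possible assignments and keep the better one. A small sketch:

#cluster numbering is arbitrary, so try both label assignments
km_tab <- table(fit$cluster, data$class)
max(sum(diag(km_tab)), km_tab[1,2] + km_tab[2,1]) / sum(km_tab)
#same 0.96 as above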

Classification tree

install.packages("tree")
library(tree)
#recode the class labels as characters: 2 = benign, 4 = malignant
length(data[data$class==2,]$class)
[1] 444
data[data$class==2,]$class <- rep(x='benign',444)
length(data[data$class==4,]$class)
[1] 239
data[data$class==4,]$class <- rep(x='malignant',239)
#factor class
data$class <- factor(data$class)
tree1 <- tree(class ~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data = data)
summary(tree1)

Classification tree:
tree(formula = class ~ ct + ucsize + ucshape + ma + secs + bn + 
    bc + nn + miti, data = data)
Variables actually used in tree construction:
[1] "ucsize" "bn"     "secs"   "ct"     "nn"    
Number of terminal nodes:  9 
Residual mean deviance:  0.1603 = 108 / 674 
Misclassification error rate: 0.03221 = 22 / 683

plot(tree1)
text(tree1)

[Figure: classification tree]

Five variables, Uniformity of Cell Size, Bare Nuclei, Single Epithelial Cell Size, Clump Thickness and Normal Nucleoli, were used in the classification tree.
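
To check that nine terminal nodes isn't overfitting, the tree package provides cv.tree() for cross-validation and prune.misclass() for pruning. A sketch (the best size would be read off the plot, so best=5 below is just a placeholder):

#cross-validate to choose a tree size, then prune
set.seed(31)
cv1 <- cv.tree(tree1, FUN=prune.misclass)
plot(cv1$size, cv1$dev, type='b')
#prune to the size with the lowest deviance, e.g.
#tree_pruned <- prune.misclass(tree1, best=5)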

Random forests

Preparing the test and training data sets:

table(data$class)

   benign malignant 
      444       239
dim(data)
[1] 683  11

set.seed(123)
decide <- rbinom(n=683, size=1, prob=0.5)
table(decide)
decide
  0   1 
349 334

test <- data[as.logical(decide),]
dim(test)
[1] 334  11
#write a function to flip the indicator (0 <-> 1)
#(calling it "switch" would shadow base::switch, so use another name)
flip <- function(x) if (x==1){ return(0)} else { return(1)}
decide <- sapply(decide,flip)
train <- data[as.logical(decide),]
dim(train)
[1] 349  11

table(test$class)

   benign malignant 
      217       117 
table(train$class)

   benign malignant 
      227       122

table(data$class)

   benign malignant 
      444       239

217 + 227
[1] 444
117 + 122
[1] 239
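
A more idiomatic way to get the same 50/50 split is to sample row indices directly; a sketch (this gives a different, but equally random, partition):

#alternative: sample half of the row indices for training
set.seed(123)
train_idx <- sample(nrow(data), size=round(nrow(data)/2))
train2 <- data[train_idx,]
test2 <- data[-train_idx,]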

Now for the analysis:

#install if necessary
install.packages("randomForest")
library(randomForest)

r <- randomForest(class ~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data=train, importance=TRUE, do.trace=100)
ntree      OOB      1      2
  100:   2.87%  3.08%  2.46%
  200:   2.58%  3.08%  1.64%
  300:   2.58%  3.08%  1.64%
  400:   2.58%  3.08%  1.64%
  500:   2.58%  3.08%  1.64%

r

Call:
 randomForest(formula = class ~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data = train, importance = TRUE, do.trace = 100) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 3.15%
Confusion matrix:
          benign malignant class.error
benign       220         7  0.03083700
malignant      4       118  0.03278689

#which variables were the most important
importance(r,type=1)
        MeanDecreaseAccuracy
ct                 17.758870
ucsize             23.772927
ucshape            17.331048
ma                 13.554124
secs               13.260320
bn                 24.705194
bc                 13.291793
nn                 12.956484
miti                4.517476
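
The same importances can be visualised with varImpPlot(), which ships with randomForest:

#plot variable importance
varImpPlot(r)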

#predicting cancer
data.predict <- predict(r, test)
#use a name other than t, which would shadow the transpose function
rf_table <- table(observed=test[,'class'], predict=data.predict)
rf_table
           predict
observed    benign malignant
  benign       213         4
  malignant      7       110
prop.table(rf_table,1)
           predict
observed        benign  malignant
  benign    0.98156682 0.01843318
  malignant 0.05982906 0.94017094
#sum of the two per-class error rates
0.01843318 + 0.05982906
[1] 0.07826224
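
Note that this figure is the sum of the two per-class error rates, not the overall error. The overall test-set accuracy comes from the diagonal of the confusion matrix:

#overall accuracy on the test set
sum(diag(rf_table)) / sum(rf_table)
#(213 + 110) / 334, i.e. roughly 0.97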

Support Vector Machines

Following this guide.

#install if necessary
install.packages("e1071")
library(e1071)
#using the training set defined above in the random forests analysis
head(train)
        id ct ucsize ucshape ma secs bn bc nn miti     class
1  1000025  5      1       1  1    2  1  3  1    1    benign
3  1015425  3      1       1  1    2  2  3  1    1    benign
6  1017122  8     10      10  8    7 10  9  7    1 malignant
10 1033078  4      2       1  1    2  1  2  1    1    benign
12 1036172  2      1       1  1    2  1  2  1    1    benign
15 1044572  8      7       5 10    7  9  5  5    4 malignant
tuned <- tune.svm(class~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data = train, gamma = 10^(-6:-1), cost = 10^(-1:1))
summary(tuned)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:
 gamma cost
   0.1    1

- best performance: 0.02571429 

- Detailed performance results:
   gamma cost      error  dispersion
1  1e-06  0.1 0.35000000 0.083367340
2  1e-05  0.1 0.35000000 0.083367340
3  1e-04  0.1 0.35000000 0.083367340
4  1e-03  0.1 0.35000000 0.083367340
5  1e-02  0.1 0.04327731 0.044210254
6  1e-01  0.1 0.03151261 0.016204185
7  1e-06  1.0 0.35000000 0.083367340
8  1e-05  1.0 0.35000000 0.083367340
9  1e-04  1.0 0.35000000 0.083367340
10 1e-03  1.0 0.04327731 0.044210254
11 1e-02  1.0 0.02857143 0.013468701
12 1e-01  1.0 0.02571429 0.016218463
13 1e-06 10.0 0.35000000 0.083367340
14 1e-05 10.0 0.35000000 0.083367340
15 1e-04 10.0 0.04327731 0.044210254
16 1e-03 10.0 0.02857143 0.013468701
17 1e-02 10.0 0.03151261 0.009009424
18 1e-01 10.0 0.04033613 0.031621660

#best parameters: gamma = 1e-01 and cost = 1
svm_model <- svm(class~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data = train, kernel="radial", gamma=0.1, cost=1)
summary(svm_model)
Call:
svm(formula = class ~ ct + ucsize + ucshape + ma + secs + bn + bc + nn + miti, data = train, 
    kernel = "radial", gamma = 0.1, cost = 1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 
      gamma:  0.1 

Number of Support Vectors:  61

 ( 17 44 )


Number of Classes:  2 

Levels: 
 benign malignant
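
As an aside, tune.svm() already keeps the model refitted with the best parameters, so the explicit svm() call above is mainly for transparency; this would be equivalent:

#the tuned object stores the refitted best model
#svm_model <- tuned$best.model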

svm_prediction <- predict(svm_model, test)
table(svm_prediction, test$class)
              
svm_prediction benign malignant
     benign       211         6
     malignant      6       111

prop.table(table(svm_prediction, test$class),1)
              
svm_prediction     benign  malignant
     benign    0.97235023 0.02764977
     malignant 0.05128205 0.94871795

#sum of the two per-class error rates
0.02764977 + 0.05128205
[1] 0.07893182
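
As with the random forest, the overall test-set accuracy is the diagonal of the confusion matrix over the total:

#overall accuracy of the SVM on the test set
svm_tab <- table(svm_prediction, test$class)
sum(diag(svm_tab)) / sum(svm_tab)
#(211 + 111) / 334, i.e. roughly 0.96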

Conclusions

One way to complement these machine learning methods is singular value decomposition (SVD):

svd1 <- svd(data[,2:10])
plot(svd1$u,pch=19,col=data$class)

[Figure: plot of the first two left singular vectors]

There’s a lot more variability in the malignant (red) samples than in the benign (black) samples. The same variability in the malignant samples was observed in the variable branch lengths of the hierarchical clustering tree.

In fact, the classification tree shows that just by comparing the Uniformity of Cell Size we can distinguish most benign from malignant cancers, which is why all of the methods performed quite well.

table(data[data$ucsize<2.5,11])

   benign malignant 
      406        12

table(data[data$ucsize>2.5,11])

   benign malignant 
       38       227
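
So this one-variable rule, benign if ucsize is below 2.5, already classifies roughly 93% of the samples correctly:

#accuracy of the single-split rule
(406 + 227) / 683
#roughly 0.93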

aggregate(data$ucsize, list(class=data$class), summary)
      class x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1    benign  1.000     1.000    1.000  1.306     1.000  9.000
2 malignant  1.000     4.000    6.000  6.577    10.000 10.000