Using R to obtain basic statistics on your dataset

Updated: 2014 June 20th

Most of the data I work with are represented as tables, i.e. with rows and columns. R makes it easy to store (as data frames) and process such data to produce some basic statistics. Here are just some R functions that calculate some basic, but nevertheless useful, statistics. I will use the iris dataset that comes with R.

#what does the iris dataset look like?
#head() prints the first six rows of a data frame
head(iris)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa
#4          4.6         3.1          1.5         0.2  setosa
#5          5.0         3.6          1.4         0.2  setosa
#6          5.4         3.9          1.7         0.4  setosa

#what are the dimensions of this dataset?
#dim() returns the number of rows followed by the number of columns
dim(iris)
#[1] 150   5

So the iris dataset has 150 rows and 5 columns. Notice that the fifth column (Species) is a categorical variable; we need to remove it before performing numerical calculations. So let's create another object by subsetting the iris data:

#remove column 5 (Species) so that only the numeric measurements remain
iris_subset <- iris[, -5]

#one less column
dim(iris_subset)
#[1] 150   4

#remember which columns hold the raw measurements; summary columns are
#added to iris_subset below and must not be fed back into later statistics
measurement_cols <- names(iris_subset)

#now let's perform some simple statistics
#calculate the mean of each row
#and add an extra column called mean
iris_subset$mean <- rowMeans(iris_subset[, measurement_cols])
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean
#1          5.1         3.5          1.4         0.2 2.550
#2          4.9         3.0          1.4         0.2 2.375
#3          4.7         3.2          1.3         0.2 2.350
#4          4.6         3.1          1.5         0.2 2.350
#5          5.0         3.6          1.4         0.2 2.550
#6          5.4         3.9          1.7         0.4 2.850

#calculate the variance of each row
#and add an extra column called variance
#only the four measurement columns are used; including the mean column
#would add a fifth value to every row and shrink the sample variance
iris_subset$variance <- apply(iris_subset[, measurement_cols], 1, var)
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean variance
#1          5.1         3.5          1.4         0.2 2.550 4.750000
#2          4.9         3.0          1.4         0.2 2.375 4.149167
#3          4.7         3.2          1.3         0.2 2.350 3.990000
#4          4.6         3.1          1.5         0.2 2.350 3.656667
#5          5.0         3.6          1.4         0.2 2.550 4.650000
#6          5.4         3.9          1.7         0.4 2.850 4.976667

#check the quantiles for the mean
quantile(iris_subset$mean)
#    0%    25%    50%    75%   100% 
#2.1000 2.6750 3.5750 4.0625 5.1000

#deciles
quantile(iris_subset$mean, probs = seq(from = 0, to = 1, by = 0.1))
#    0%    10%    20%    30%    40%    50%    60%    70%    80%    90%   100% 
#2.1000 2.4250 2.5950 2.8000 3.3000 3.5750 3.8350 3.9250 4.2000 4.4575 5.1000

#check to see how many rows have a mean greater than 4
table(iris_subset$mean > 4)
#FALSE  TRUE 
#  111    39

#calculate Pearson correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance)
#[1] 0.311261

#calculate Spearman correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance, method = "spearman")
#[1] 0.2447656

#get the maximum petal length
max(iris$Petal.Length)
#[1] 6.9

#get the shortest sepal width
min(iris$Sepal.Width)
#[1] 2

#get means of the 4 measurements
colMeans(iris_subset[, measurement_cols])
#Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
#    5.843333     3.057333     3.758000     1.199333 

#get the sum of the 4 measurements in each row
head(rowSums(iris_subset[, measurement_cols]))
#[1] 10.2  9.5  9.4  9.4 10.2 11.4

#store all the rows that have a mean > 4
#the logical vector built from iris_subset selects rows of the original
#iris data frame, so the Species column is available again
iris_mean_4 <- iris[iris_subset$mean > 4, ]
head(iris_mean_4)
#    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#51           7.0         3.2          4.7         1.4 versicolor
#53           6.9         3.1          4.9         1.5 versicolor
#78           6.7         3.0          5.0         1.7 versicolor
#101          6.3         3.3          6.0         2.5  virginica
#103          7.1         3.0          5.9         2.1  virginica
#104          6.3         2.9          5.6         1.8  virginica

#is a particular species larger on average?
#count how many of the selected rows belong to each species
table(iris_mean_4$Species)
#
#    setosa versicolor  virginica 
#         0          3         36 

#output data as a comma delimited file
#row names (the original row numbers in iris) are written as the first column
write.csv(iris_mean_4, "iris_dataset_mean_4.csv")



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.