# Using R to obtain basic statistics on your dataset

Updated: 2014 June 20th

Most of the data I work with are represented as tables, i.e. with rows and columns. R makes it easy to store such data (as data frames) and to process them to produce basic statistics. Here are a few R functions that calculate some basic, but nevertheless useful, statistics. I will use the iris dataset that comes with R.

# what does the iris dataset look like?
head(iris)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa
#4          4.6         3.1          1.5         0.2  setosa
#5          5.0         3.6          1.4         0.2  setosa
#6          5.4         3.9          1.7         0.4  setosa

# what are the dimensions of this dataset?
dim(iris)
#[1] 150   5


So the iris dataset has 150 rows and 5 columns. Notice that the fifth column (Species) is a categorical variable; we need to remove it before performing numerical calculations. So let's create another object by subsetting the iris data:

# remove column 5 (Species), which is categorical
iris_subset <- iris[, -5]

# one less column
dim(iris_subset)
#[1] 150   4

# Keep the four numeric measurements as a matrix. All per-row and per-column
# statistics below are computed from this matrix, so the derived columns that
# get added to iris_subset (mean, variance) never leak into a later calculation.
measurements <- as.matrix(iris_subset)

# now let's perform some simple statistics
# calculate the mean of each row
# and add an extra column called mean
iris_subset$mean <- apply(measurements, 1, mean)
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean
#1          5.1         3.5          1.4         0.2 2.550
#2          4.9         3.0          1.4         0.2 2.375
#3          4.7         3.2          1.3         0.2 2.350
#4          4.6         3.1          1.5         0.2 2.350
#5          5.0         3.6          1.4         0.2 2.550
#6          5.4         3.9          1.7         0.4 2.850

# calculate the variance of each row
# and add an extra column called variance
# (sample variance of the four measurements only; the mean column is excluded)
iris_subset$variance <- apply(measurements, 1, var)
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean variance
#1          5.1         3.5          1.4         0.2 2.550 4.750000
#2          4.9         3.0          1.4         0.2 2.375 4.149167
#3          4.7         3.2          1.3         0.2 2.350 3.990000
#4          4.6         3.1          1.5         0.2 2.350 3.656667
#5          5.0         3.6          1.4         0.2 2.550 4.650000
#6          5.4         3.9          1.7         0.4 2.850 4.976667

# check the quantiles for the mean
quantile(iris_subset$mean)
#    0%    25%    50%    75%   100%
#2.1000 2.6750 3.5750 4.0625 5.1000

# deciles
quantile(iris_subset$mean, probs = seq(from = 0, to = 1, by = 0.1))
#    0%    10%    20%    30%    40%    50%    60%    70%    80%    90%   100%
#2.1000 2.4250 2.5950 2.8000 3.3000 3.5750 3.8350 3.9250 4.2000 4.4575 5.1000

# check to see how many rows have a mean greater than 4
table(iris_subset$mean > 4)
#FALSE  TRUE
#  111    39

# calculate Pearson correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance)
#[1] 0.311261

# calculate Spearman correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance, method = "spearman")
#[1] 0.2447656

# get the maximum petal length
max(iris$Petal.Length)
#[1] 6.9

# get the shortest sepal width
min(iris$Sepal.Width)
#[1] 2

# get means of the 4 measurements
colMeans(measurements)
#Sepal.Length  Sepal.Width Petal.Length  Petal.Width
#    5.843333     3.057333     3.758000     1.199333

# get the sum of the rows (four measurements per row)
head(rowSums(measurements))
#[1] 10.2  9.5  9.4  9.4 10.2 11.4

# store all the rows that have a mean > 4
iris_mean_4 <- iris[iris_subset$mean > 4, ]
head(iris_mean_4)
#    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#51           7.0         3.2          4.7         1.4 versicolor
#53           6.9         3.1          4.9         1.5 versicolor
#78           6.7         3.0          5.0         1.7 versicolor
#101          6.3         3.3          6.0         2.5  virginica
#103          7.1         3.0          5.9         2.1  virginica
#104          6.3         2.9          5.6         1.8  virginica

# are a particular species longer on average?
table(iris_mean_4$Species)
#
#    setosa versicolor  virginica
#         0          3         36
# export the selected rows (mean > 4) as a comma-separated file
write.csv(x = iris_mean_4, file = "iris_dataset_mean_4.csv")