# Using R to obtain basic statistics on your dataset

Updated: 2014 June 20th

Most of the data I work with is represented as tables, i.e. with rows and columns. R makes it easy to store such data (as data frames) and to process it to produce some basic statistics. Here are just a few R functions that calculate some basic, but nevertheless useful, statistics. I will use the iris dataset that comes with R.

```r
#what does the iris dataset look like?
#show the first six rows of the built-in iris data frame
head(iris)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa
#4          4.6         3.1          1.5         0.2  setosa
#5          5.0         3.6          1.4         0.2  setosa
#6          5.4         3.9          1.7         0.4  setosa

#what are the dimensions of this dataset (rows, columns)?
dim(iris)
# 150   5
```

So the iris dataset has 150 rows and 5 columns. Notice that the fifth column is a categorical variable (the species); we need to remove it before performing numerical calculations. So let’s create another object by subsetting the iris data:

```r
#remove column 5
#keep only the four numeric measurements; column 5 (Species) is a factor
iris_subset <- iris[, -5]

#one less column
dim(iris_subset)
# 150   4

#remember which columns are the original measurements, so that the derived
#columns added below (mean, variance) never leak into later statistics
measurement_cols <- names(iris_subset)

#now let's perform some simple statistics
#calculate the mean of each row
#and add an extra column called mean
iris_subset$mean <- apply(iris_subset[, measurement_cols], 1, mean)
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean
#1          5.1         3.5          1.4         0.2 2.550
#2          4.9         3.0          1.4         0.2 2.375
#3          4.7         3.2          1.3         0.2 2.350
#4          4.6         3.1          1.5         0.2 2.350
#5          5.0         3.6          1.4         0.2 2.550
#6          5.4         3.9          1.7         0.4 2.850

#calculate the variance of each row
#and add an extra column called variance
#(restricted to the measurement columns: including the freshly added mean
#column would treat it as a fifth observation and understate the variance)
iris_subset$variance <- apply(iris_subset[, measurement_cols], 1, var)
head(iris_subset)
#  Sepal.Length Sepal.Width Petal.Length Petal.Width  mean variance
#1          5.1         3.5          1.4         0.2 2.550 4.750000
#2          4.9         3.0          1.4         0.2 2.375 4.149167
#3          4.7         3.2          1.3         0.2 2.350 3.990000
#4          4.6         3.1          1.5         0.2 2.350 3.656667
#5          5.0         3.6          1.4         0.2 2.550 4.650000
#6          5.4         3.9          1.7         0.4 2.850 4.976667

#check the quantiles for the mean
quantile(iris_subset$mean)
#    0%    25%    50%    75%   100%
#2.1000 2.6750 3.5750 4.0625 5.1000

#deciles
quantile(iris_subset$mean, probs = seq(from = 0, to = 1, by = 0.1))
#    0%    10%    20%    30%    40%    50%    60%    70%    80%    90%   100%
#2.1000 2.4250 2.5950 2.8000 3.3000 3.5750 3.8350 3.9250 4.2000 4.4575 5.1000

#check to see how many rows have a mean greater than 4
table(iris_subset$mean > 4)
#FALSE  TRUE
#  111    39

#calculate Pearson correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance)
# 0.311261

#calculate Spearman correlation between variance and mean
cor(iris_subset$mean, iris_subset$variance, method = "spearman")
# 0.2447656

#get the maximum petal length
max(iris$Petal.Length)
# 6.9

#get the smallest sepal width
min(iris$Sepal.Width)
# 2

#get means of the 4 measurements (derived columns excluded)
colMeans(iris_subset[, measurement_cols])
#Sepal.Length  Sepal.Width Petal.Length  Petal.Width
#    5.843333     3.057333     3.758000     1.199333

#get the sum of the measurements in each row (first six rows shown)
head(rowSums(iris_subset[, measurement_cols]))
#   1    2    3    4    5    6
#10.2  9.5  9.4  9.4 10.2 11.4

#store all the rows that have a mean > 4
#(rows are taken from the full iris data so the Species column is kept)
iris_mean_4 <- iris[iris_subset$mean > 4, ]
head(iris_mean_4)
#    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#51           7.0         3.2          4.7         1.4 versicolor
#53           6.9         3.1          4.9         1.5 versicolor
#78           6.7         3.0          5.0         1.7 versicolor
#101          6.3         3.3          6.0         2.5  virginica
#103          7.1         3.0          5.9         2.1  virginica
#104          6.3         2.9          5.6         1.8  virginica

#is a particular species larger on average?
table(iris_mean_4$Species)
#
#    setosa versicolor  virginica
#         0          3         36

#output data as a comma delimited file
write.csv(iris_mean_4, "iris_dataset_mean_4.csv")
``` 