Creating a fake dataset using Mean Unique Units per Cycle
# purposefully NOT making this a function because I want these variables to be accessible later
library(ggplot2) # for graphics
## Warning: package 'ggplot2' was built under R version 4.3.2
library(MASS) # for maximum likelihood estimation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read in data
z <- read.csv("Allen et al - Proceedings B Supplementary Data - Raw Data 3.csv",header=TRUE,sep=",")
z$myVar <-z$Mean.Unique.Units.per.Cycle
summary(z)
## Year Mean.Unique.Units.per.Cycle Mean.Total.Units.per.Cycle
## Min. :2002 Min. :11.08 Min. :101.1
## 1st Qu.:2005 1st Qu.:16.25 1st Qu.:152.5
## Median :2008 Median :18.78 Median :170.8
## Mean :2008 Mean :20.04 Mean :172.2
## 3rd Qu.:2011 3rd Qu.:23.50 3rd Qu.:204.1
## Max. :2014 Max. :29.75 Max. :228.0
##
## Mean.Song.Cycle.Length..m. Mean.Themes Mean.Phrase.Length
## Min. : 4.253 Min. :2.780 Min. :12.96
## 1st Qu.: 6.328 1st Qu.:3.750 1st Qu.:14.60
## Median : 7.450 Median :4.310 Median :15.93
## Mean : 7.196 Mean :4.505 Mean :16.25
## 3rd Qu.: 7.713 3rd Qu.:4.920 3rd Qu.:16.31
## Max. :11.060 Max. :6.860 Max. :22.67
##
## mean.PCA.of.Themes Total.Score Song.Level.Score Theme.Level.Score
## Min. :-1.15876 Min. :-3.2997 Min. :-2.5206 Min. :-2.4552
## 1st Qu.:-0.50409 1st Qu.:-1.4860 1st Qu.:-1.6282 1st Qu.:-0.5603
## Median :-0.13506 Median :-0.4777 Median : 0.1094 Median :-0.2084
## Mean :-0.05374 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.49235 3rd Qu.: 1.4054 3rd Qu.: 1.4188 3rd Qu.: 0.4083
## Max. : 0.92238 Max. : 3.3521 Max. : 2.5897 Max. : 3.1775
##
## Song.Level.Entropy Theme.Level.Entropy Individuality myVar
## Min. :0.03744 Min. :0.7253 Min. :12.28 Min. :11.08
## 1st Qu.:0.14788 1st Qu.:0.9425 1st Qu.:25.96 1st Qu.:16.25
## Median :0.26520 Median :1.0102 Median :31.44 Median :18.78
## Mean :0.27329 Mean :1.1955 Mean :34.30 Mean :20.04
## 3rd Qu.:0.34729 3rd Qu.:1.3715 3rd Qu.:39.14 3rd Qu.:23.50
## Max. :0.61224 Max. :1.8287 Max. :60.52 Max. :29.75
## NA's :2
# Generate relevant summary statistics from data
normPars <- fitdistr(z$myVar,"normal")
print(normPars) #
## mean sd
## 20.038462 5.171887
## ( 1.434423) ( 1.014290)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 20.04 5.17
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.43 1.01
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 2.06 0 0 1.03
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 13
## $ loglik : num -39.8
## - attr(*, "class")= chr "fitdistr"
zMean <- normPars$estimate["mean"]
zMean
## mean
## 20.03846
zSD <- normPars$estimate["sd"]
zSD
## sd
## 5.171887
# Using data from Allen et al., 2018 - mean unique units per cycle
# Generate random normal data with the mean and sd from that dataset
mean <- rnorm(n=3,mean=zMean,sd=zSD)
mean
## [1] 11.07762 17.16189 23.17854
year <- c("2002","2003","2004")
sd <- runif(3)
# make that normal data into a data frame
normz <- data.frame(year,mean,sd)
print(normz)
## year mean sd
## 1 2002 11.07762 0.5320332
## 2 2003 17.16189 0.2463920
## 3 2004 23.17854 0.1907580
Conducting an ANOVA on my fake dataset
# setting up one-way ANOVA - just the first 3 years
# Global Variables
nGroup <- 3
nName <- c("2002","2003","2004") # names of groups
nSize <- c(1,1,1) # number of observations in each group
nSize
## [1] 1 1 1
nMean <- mean[1:3]
nMean
## [1] 11.07762 17.16189 23.17854
nSD <- sd
nSD
## [1] 0.5320332 0.2463920 0.1907580
# function to make a res_var data frame
getanodata <- function(n_size,nMean,nSD) {
res_var <- data.frame(matrix(nrow=3,ncol=4))
colnames(res_var) <- c("year","n","value","sd")
for (i in 1:3) {
res_var[i,] <- c(nName[i],nSize[i],nMean[i],nSD[i])
}
return(res_var)
}
res_var <- getanodata(nSize,nMean,nSD)
# res_var <- rep(res_var[,1:3],nSize) # year, n, mean, sd
res_var <- data.frame(res_var)
res_var
## year n value sd
## 1 2002 1 11.0776229336375 0.532033170573413
## 2 2003 1 17.1618945032749 0.24639202398248
## 3 2004 1 23.178544836829 0.190757958684117
year <- res_var[1,]
value <- res_var[,3]
# actual ANOVA! Doing ANOVA between years
anova <- aov(value ~ year,data=res_var)
print(anova)
## Call:
## aov(formula = value ~ year, data = res_var)
##
## Terms:
## year
## Sum of Squares 73.21692
## Deg. of Freedom 2
##
## Estimated effects may be unbalanced
z <- summary(anova)
print(z)
## Df Sum Sq Mean Sq
## year 2 73.22 36.61
# plot
ano_plot <- ggplot(anova) +
aes(year,mean) +
geom_boxplot()
print(ano_plot)
# I only have 1 mean per year so it looks weird