library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Fit a normal distribution to the data and grab the maximum-likelihood estimators.
# Read the Allen et al. raw data and copy the focal column
# (Mean.Unique.Units.per.Cycle) into a working variable.
z <- read.csv(file = "Allen et al - Proceedings B Supplementary Data - Raw Data 3.csv",
              header = TRUE, sep = ",")
z$myVar <- z[["Mean.Unique.Units.per.Cycle"]]
summary(z)
## Year Mean.Unique.Units.per.Cycle Mean.Total.Units.per.Cycle
## Min. :2002 Min. :11.08 Min. :101.1
## 1st Qu.:2005 1st Qu.:16.25 1st Qu.:152.5
## Median :2008 Median :18.78 Median :170.8
## Mean :2008 Mean :20.04 Mean :172.2
## 3rd Qu.:2011 3rd Qu.:23.50 3rd Qu.:204.1
## Max. :2014 Max. :29.75 Max. :228.0
##
## Mean.Song.Cycle.Length..m. Mean.Themes Mean.Phrase.Length
## Min. : 4.253 Min. :2.780 Min. :12.96
## 1st Qu.: 6.328 1st Qu.:3.750 1st Qu.:14.60
## Median : 7.450 Median :4.310 Median :15.93
## Mean : 7.196 Mean :4.505 Mean :16.25
## 3rd Qu.: 7.713 3rd Qu.:4.920 3rd Qu.:16.31
## Max. :11.060 Max. :6.860 Max. :22.67
##
## mean.PCA.of.Themes Total.Score Song.Level.Score Theme.Level.Score
## Min. :-1.15876 Min. :-3.2997 Min. :-2.5206 Min. :-2.4552
## 1st Qu.:-0.50409 1st Qu.:-1.4860 1st Qu.:-1.6282 1st Qu.:-0.5603
## Median :-0.13506 Median :-0.4777 Median : 0.1094 Median :-0.2084
## Mean :-0.05374 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.49235 3rd Qu.: 1.4054 3rd Qu.: 1.4188 3rd Qu.: 0.4083
## Max. : 0.92238 Max. : 3.3521 Max. : 2.5897 Max. : 3.1775
##
## Song.Level.Entropy Theme.Level.Entropy Individuality myVar
## Min. :0.03744 Min. :0.7253 Min. :12.28 Min. :11.08
## 1st Qu.:0.14788 1st Qu.:0.9425 1st Qu.:25.96 1st Qu.:16.25
## Median :0.26520 Median :1.0102 Median :31.44 Median :18.78
## Mean :0.27329 Mean :1.1955 Mean :34.30 Mean :20.04
## 3rd Qu.:0.34729 3rd Qu.:1.3715 3rd Qu.:39.14 3rd Qu.:23.50
## Max. :0.61224 Max. :1.8287 Max. :60.52 Max. :29.75
## NA's :2
# get stats (mean + sd) of mean.unique.units.per.cycle from Allen et al.
# fitdistr() returns the ML estimates with their standard errors beneath them.
normPars <- fitdistr(z$myVar,"normal")
print(normPars) # mean is 20.038462, sd is 5.171887; the parenthesized values are the standard errors of each estimate
## mean sd
## 20.038462 5.171887
## ( 1.434423) ( 1.014290)
# Inspect the structure of the fitdistr object to see how to extract the
# named estimates (normPars$estimate is a named numeric vector).
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 20.04 5.17
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.43 1.01
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 2.06 0 0 1.03
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 13
## $ loglik : num -39.8
## - attr(*, "class")= chr "fitdistr"
# Pull the named ML estimates out of the fitdistr object
# (mean = 20.038462, sd = 5.171887, as printed above).
zMean <- normPars$estimate["mean"]
zSD <- normPars$estimate["sd"]
# Using the Allen et al. (2018) estimates for mean unique units per cycle,
# simulate three yearly observations and collect them in a data frame.
counts <- rnorm(n = 3, mean = zMean, sd = zSD)
years <- c("2002", "2003", "2004")
normz <- data.frame(years = years, counts = counts)
print(normz)
## years counts
## 1 2002 30.26881
## 2 2003 16.14408
## 3 2004 16.56853
# Set up a one-way ANOVA covering just the first three years.
nGroup <- 3                          # number of groups
nName <- c("2002", "2003", "2004")   # group labels
nSize <- c(1, 1, 1)                  # observations per group
# Compute each group's mean and sd.
# 2002
year <- subset(normz, years == "2002")
print(year)
## years counts
## 1 2002 30.26881
summary2002 <- summary(year$counts)
print(summary2002)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.27 30.27 30.27 30.27 30.27 30.27
mean2002 <- summary2002["Mean"]
print(mean2002)
## Mean
## 30.26881
sd2002 <- sd(year$counts)
print(sd2002) # NA: the sd of a single observation is undefined
## [1] NA
# 2003: same mean/sd extraction as above, for the second group.
year <- normz[which(normz$years == "2003"), ]
print(year)
## years counts
## 2 2003 16.14408
summary2003 <- summary(year$counts)
print(summary2003)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.14 16.14 16.14 16.14 16.14 16.14
mean2003 <- summary2003["Mean"]
print(mean2003)
## Mean
## 16.14408
sd2003 <- sd(year$counts)
print(sd2003) # NA again: single observation
## [1] NA
# 2004: mean/sd extraction for the third and final group.
year <- normz[normz$years %in% "2004", ]
print(year)
## years counts
## 3 2004 16.56853
summary2004 <- summary(year$counts)
print(summary2004)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.57 16.57 16.57 16.57 16.57 16.57
mean2004 <- summary2004["Mean"]
print(mean2004)
## Mean
## 16.56853
sd2004 <- sd(year$counts)
print(sd2004) # NA again: single observation
## [1] NA
# I am getting NA for SD because there's really only one value, which IS the mean, so I'm just going to assign arbitrary placeholder values
sd2002 <- 3
sd2003 <- 2
sd2004 <- 1
# Adjusting the means of the different groups (uncomment to experiment):
# mean2002 <- 15.1
# mean2003 <- 15.2
# mean2004 <- 15.3
# One row per observation: ID is a simple row index over the total number of
# observations (sum of the group sizes), and resVar holds one simulated value
# per group, drawn from a normal with that group's mean and sd.
ID <- seq_len(sum(nSize)) # seq_len() is safe even if the total is 0, unlike 1:sum(nSize)
resVar <- c(rnorm(n = nSize[1], mean = mean2002, sd = sd2002),
            rnorm(n = nSize[2], mean = mean2003, sd = sd2003),
            rnorm(n = nSize[3], mean = mean2004, sd = sd2004))
# Assemble the data frame for the ANOVA: each group label repeated by its
# group size, paired with the simulated response values.
TGroup <- rep(nName, times = nSize)
ANOdata <- data.frame(ID = ID, TGroup = TGroup, resVar = resVar)
str(ANOdata)
## 'data.frame': 3 obs. of 3 variables:
## $ ID : int 1 2 3
## $ TGroup: chr "2002" "2003" "2004"
## $ resVar: num 30.4 16 16
print(ANOdata)
## ID TGroup resVar
## 1 1 2002 30.38872
## 2 2 2003 16.01772
## 3 3 2004 15.96990
# Boxplot of the simulated response for each year group.
ANOPlot <- ggplot(data = ANOdata, aes(x = TGroup, y = resVar, fill = TGroup)) +
  geom_boxplot()
print(ANOPlot)
# Adjust the means of different groups: if I raise the means of all groups to 25, the ANOVA looks more uniform but still follows the same trend. If I lower the means of all groups to 15, it still seems to follow the same trend for the first 2 groups, but the last group's (2004) resVar skyrockets (not sure why). If I make mean2002 5, mean2003 15, and mean2004 25, the resVars follow the same trend. If the means are within 0.1 of each other, there still appears to be some significant difference, but with means within 0.01 of each other there is none.
# Adjust the sample sizes of different groups: by adjusting the sample size down to 3 per group, I see more of a significant difference between the 3 groups. Even given a sample size of 1, I see significance.