Get Barracudar ready
# load packages ----
library(log4r)
## 
## Attaching package: 'log4r'
## The following object is masked from 'package:base':
## 
##     debug
library(TeachingDemos)
## Warning: package 'TeachingDemos' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pracma)
## 
## Attaching package: 'pracma'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
library(ggmosaic)
library(stringr)
# load any additional packages here...


# source function files ----

# Root folder of this homework. Every path below is built from it, so the
# location only has to be edited in one place.
homework_dir <- "~/Desktop/GitHub/ComputationalBiology/OppenheimerBio6100/Homework11"
barracudar_dir <- file.path(homework_dir, "barracudar")

# NOTE(review): the working directory is switched to the barracudar folder
# before sourcing, as in the original script; the sourced files may or may not
# rely on that -- confirm before removing this call.
setwd(barracudar_dir)

# barracudar helper scripts, sourced in the order listed
barracudar_scripts <- c(
  "DataTableTemplate.R",
  "AddFolder.R",
  "BuildFunction.R",
  "MetaDataTemplate.R",
  "CreatePaddedLabel.R",
  "InitiateSeed.R",
  "SetUpLog.R",
  "SourceBatch.R"
)

for (script in barracudar_scripts) {
  source(file.path(barracudar_dir, script))
}

# leave the session in the data folder, as the original script did
setwd(file.path(homework_dir, "OriginalData"))


Question 2
# gathering the file names that we actually want to look at in a vector called filenames

# list.files() will gather character strings of file names

# folder that holds one sub-folder per download
data_dir <- "~/Desktop/GitHub/ComputationalBiology/OppenheimerBio6100/Homework11/OriginalData"

# list.files() returns the names (character strings) of the entries in
# data_dir whose name contains "BART"
filelist <- list.files(data_dir, pattern = "BART")

# For every folder in filelist, pick the file whose name contains "countdata".
# vapply() covers however many folders were found (not a fixed 10) and always
# returns a character vector. Each folder is listed by its path, so the
# working directory is not changed.
filenames <- vapply(
  filelist,
  function(folder) {
    matches <- list.files(file.path(data_dir, folder), pattern = "countdata")
    if (length(matches) == 0) {
      stop("No 'countdata' file found in folder: ", folder, call. = FALSE)
    }
    # if a folder ever holds several matches, keep the first one explicitly
    matches[1]
  },
  character(1),
  USE.NAMES = FALSE
)

filenames
##  [1] "NEON.D01.BART.DP1.10003.001.brd_countdata.2015-06.basic.20231226T232626Z.csv"
##  [2] "NEON.D01.BART.DP1.10003.001.brd_countdata.2016-06.basic.20231227T013428Z.csv"
##  [3] "NEON.D01.BART.DP1.10003.001.brd_countdata.2017-06.basic.20231227T094709Z.csv"
##  [4] "NEON.D01.BART.DP1.10003.001.brd_countdata.2018-06.basic.20231228T172744Z.csv"
##  [5] "NEON.D01.BART.DP1.10003.001.brd_countdata.2019-06.basic.20231227T184129Z.csv"
##  [6] "NEON.D01.BART.DP1.10003.001.brd_countdata.2020-06.basic.20231227T224944Z.csv"
##  [7] "NEON.D01.BART.DP1.10003.001.brd_countdata.2020-07.basic.20231227T225020Z.csv"
##  [8] "NEON.D01.BART.DP1.10003.001.brd_countdata.2021-06.basic.20231228T010546Z.csv"
##  [9] "NEON.D01.BART.DP1.10003.001.brd_countdata.2022-06.basic.20231229T053256Z.csv"
## [10] "NEON.D01.BART.DP1.10003.001.brd_countdata.2023-06.basic.20240131T234742Z.csv"


Questions 3 & 4
# define a function that, for each count file: removes rows with empty/missing values, extracts the year from the file name, calculates the total number of individuals found (counted as rows), and calculates the number of unique species found


#' Summarise count-data csv files, one row per folder
#'
#' For each folder/file pair the csv is read, rows with missing values in the
#' first 20 columns are dropped, and the year, the number of remaining rows and
#' the number of unique values in column 12 are recorded.
#'
#' @param filelist Character vector of folder names located inside `data_dir`.
#' @param filenames Character vector, same length as `filelist`; element `i`
#'   is the csv file to read from folder `filelist[i]`.
#' @param data_dir Directory that contains the folders in `filelist`. Defaults
#'   to the OriginalData folder used throughout this homework.
#' @return A character matrix with one row per folder and the columns
#'   "File", "Year", "Total # Individuals" and "Species Richness".
getinfo <- function(filelist, filenames,
                    data_dir = "~/Desktop/GitHub/ComputationalBiology/OppenheimerBio6100/Homework11/OriginalData") {

  # every folder needs exactly one csv name to go with it
  stopifnot(length(filelist) == length(filenames))

  metadata <- matrix(NA_character_, nrow = length(filelist), ncol = 4)
  colnames(metadata) <- c("File", "Year", "Total # Individuals", "Species Richness")

  # seq_along() follows the actual number of folders (and handles zero)
  for (i in seq_along(filelist)) {

    # read the csv by its full path so the working directory is left untouched
    csvdata <- read.csv(file.path(data_dir, filelist[i], filenames[i]))

    # get rid of empty/missing cases; only the first 20 columns are checked
    # (per the original author, later columns are either all NA or never NA,
    # so including them would drop everything or change nothing)
    keep_cols <- seq_len(min(20L, ncol(csvdata)))
    dfcsv <- na.omit(csvdata[, keep_cols, drop = FALSE])

    # extract the year from the "YYYY-MM" stamp in the file name; fall back to
    # the fixed character positions 43-46 if no such stamp is present
    year_month <- regmatches(
      filenames[i],
      regexpr("[0-9]{4}-[0-9]{2}", filenames[i])
    )
    if (length(year_month) == 1) {
      year <- substr(year_month, 1, 4)
    } else {
      year <- substr(filenames[i], 43, 46)
    }

    # total individuals found
    # NOTE(review): this counts rows (records); it equals the number of
    # individuals only if every row is a single bird -- confirm against the
    # column definitions of the data set.
    total_individuals <- nrow(dfcsv)

    # number of unique species: distinct values in column 12
    # (presumably the species/taxon identifier -- verify against the csv header)
    species_richness <- length(unique(dfcsv[[12]]))

    # fill in the row for this csv; c() coerces the counts to character
    metadata[i, ] <- c(filelist[i], year, total_individuals, species_richness)
  }

  metadata
}

# run the summary for the folders and csv names gathered above; the returned
# matrix is printed because the call is made at top level
getinfo(filelist,filenames)
##       File                                                                     
##  [1,] "NEON.D01.BART.DP1.10003.001.2015-06.basic.20240127T000425Z.RELEASE-2024"
##  [2,] "NEON.D01.BART.DP1.10003.001.2016-06.basic.20240127T000425Z.RELEASE-2024"
##  [3,] "NEON.D01.BART.DP1.10003.001.2017-06.basic.20240127T000425Z.RELEASE-2024"
##  [4,] "NEON.D01.BART.DP1.10003.001.2018-06.basic.20240127T000425Z.RELEASE-2024"
##  [5,] "NEON.D01.BART.DP1.10003.001.2019-06.basic.20240127T000425Z.RELEASE-2024"
##  [6,] "NEON.D01.BART.DP1.10003.001.2020-06.basic.20240127T000425Z.RELEASE-2024"
##  [7,] "NEON.D01.BART.DP1.10003.001.2020-07.basic.20240127T000425Z.RELEASE-2024"
##  [8,] "NEON.D01.BART.DP1.10003.001.2021-06.basic.20240127T000425Z.RELEASE-2024"
##  [9,] "NEON.D01.BART.DP1.10003.001.2022-06.basic.20240127T000425Z.RELEASE-2024"
## [10,] "NEON.D01.BART.DP1.10003.001.2023-06.basic.20240131T234742Z.PROVISIONAL" 
##       Year   Total # Individuals Species Richness
##  [1,] "2015" "453"               "40"            
##  [2,] "2016" "680"               "38"            
##  [3,] "2017" "411"               "34"            
##  [4,] "2018" "512"               "36"            
##  [5,] "2019" "372"               "39"            
##  [6,] "2020" "447"               "43"            
##  [7,] "2020" "50"                "16"            
##  [8,] "2021" "869"               "45"            
##  [9,] "2022" "578"               "37"            
## [10,] "2023" "504"               "33"