Homework

I am using a bottom-trawl dataset that records the catch per unit effort (CPUE) of alewife in the three portions (north, central, south) of the main lake in Lake Champlain. I do not expect to see any significant differences in CPUE among these regions because they are all part of the main lake and are not geographically isolated from each other.

Running ANOVA and Tukey tests on my own data

#Libraries
library(ggplot2)
library(MASS)
library(dplyr)
library(truncnorm)

# Load the trawl data and inspect it ----

# Read the CSV and, in the same step, recode minor_basin as a factor whose
# levels are listed in the order North, Central, South.
ALW_CPUE_basin <- transform(
  read.csv("ALW_CPUE_basin.csv"),
  minor_basin = factor(
    minor_basin,
    levels = c("North Main Lake", "Central Main Lake", "South Main Lake")
  )
)

# Column types and a preview of the first values in each column
str(ALW_CPUE_basin)

## 'data.frame':    345 obs. of  3 variables:
##  $ sampleID   : chr  "BOT_04132018_01" "BOT_04132018_02" "BOT_04132018_03" "BOT_04132018_04" ...
##  $ minor_basin: Factor w/ 3 levels "North Main Lake",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CPUE       : num  8.528 37.5 68.903 255.989 0.667 ...

# Per-column summary: counts per basin and the distribution of CPUE
summary(ALW_CPUE_basin)

##    sampleID                    minor_basin       CPUE         
##  Length:345         North Main Lake  : 72   Min.   :   0.500  
##  Class :character   Central Main Lake:218   1st Qu.:   4.003  
##  Mode  :character   South Main Lake  : 55   Median :  20.020  
##                                             Mean   :  86.624  
##                                             3rd Qu.:  90.337  
##                                             Max.   :1404.000

# Per-basin sample size, mean CPUE, and standard deviation of CPUE
ALW_CPUE <- ALW_CPUE_basin %>%
  group_by(minor_basin) %>%
  summarise(
    count = n(),
    mean = mean(CPUE, na.rm = TRUE),
    sd = sd(CPUE, na.rm = TRUE)
  )

# Shows the sample size, mean, and standard deviation (not variance) per group
print(ALW_CPUE)

## # A tibble: 3 × 4
##   minor_basin       count  mean    sd
##   <fct>             <int> <dbl> <dbl>
## 1 North Main Lake      72 100.   200.
## 2 Central Main Lake   218  90.1  170.
## 3 South Main Lake      55  55.0  119.

# One-way ANOVA and Tukey HSD on the real data ----
# Tests whether mean CPUE differs among the three levels of minor_basin.
# NOTE(review): the summary() output earlier in this file shows CPUE is
# strongly right-skewed (median ~20, mean ~87, max 1404) -- consider checking
# the residuals or a log transform before relying on these p-values.

res.aov <- aov(CPUE~minor_basin, data = ALW_CPUE_basin)
# Print the ANOVA table (Df, sums of squares, F value, p-value)
summary(res.aov)

##              Df  Sum Sq Mean Sq F value Pr(>F)
## minor_basin   2   71360   35680   1.239  0.291
## Residuals   342 9851925   28807

# All pairwise basin comparisons with family-wise adjusted p-values
TukeyHSD(res.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = CPUE ~ minor_basin, data = ALW_CPUE_basin)
## 
## $minor_basin
##                                        diff        lwr      upr     p adj
## Central Main Lake-North Main Lake -10.27977  -64.58625 44.02672 0.8963867
## South Main Lake-North Main Lake   -45.40800 -116.95666 26.14065 0.2950901
## South Main Lake-Central Main Lake -35.12824  -95.41458 25.15810 0.3569643

# Boxplot of CPUE for each basin, with the boxes filled by basin
ANOplot <- ggplot(
  ALW_CPUE_basin,
  aes(x = minor_basin, y = CPUE, fill = minor_basin)
) +
  geom_boxplot()
print(ANOplot)

The ANOVA and Tukey tests show that there is no statistically significant difference (p > 0.05) in CPUE among the three regions of the main lake.

Making fake dataset that has the same attributes as my real one

# Simulated dataset using the same group names, sample sizes, and target
# mean/SD values as the real data.
#
# Seed the RNG so the simulated values (and every ANOVA/Tukey result computed
# from them) are identical on each run, as the later simulations in this
# script already do.
# NOTE(review): any output printed below that came from an earlier, unseeded
# run must be regenerated after this change.
set.seed(99)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize <- c(72, 218, 55)
nMean <- c(100.4, 90.1, 54.9)
nSD <- c(199.7, 169.8, 119.2)
ID <- 1:(sum(nSize))

# rtruncnorm() with a = 0 sets the lower limit to 0 because a catch cannot be
# negative.
# NOTE(review): `mean` and `sd` are the parameters of the normal distribution
# *before* truncation. The SDs here are about twice the means, so a large share
# of that distribution lies below 0; after truncation the simulated group means
# come out higher than nMean and the spreads smaller than nSD. The fake data
# therefore does not reproduce the real data's means and SDs exactly.
modelCPUE <- c(rtruncnorm(n = nSize[1], a = 0, b = Inf, mean = nMean[1], sd = nSD[1]),
               rtruncnorm(n = nSize[2], a = 0, b = Inf, mean = nMean[2], sd = nSD[2]),
               rtruncnorm(n = nSize[3], a = 0, b = Inf, mean = nMean[3], sd = nSD[3]))

# One group label per simulated observation, in the same order as the draws
TGroup <- rep(nName, nSize)
ANOdata <- data.frame(ID, TGroup, modelCPUE)

# Fix the factor level order to North, Central, South (instead of alphabetical)
ANOdata$TGroup <- factor(ANOdata$TGroup, levels = c("North Main Lake", "Central Main Lake", "South Main Lake"))

str(ANOdata)

## 'data.frame':    345 obs. of  3 variables:
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ TGroup   : Factor w/ 3 levels "North Main Lake",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ modelCPUE: num  314.3 349.8 146.9 46.2 185.3 ...

# ANOVA, boxplot, and Tukey HSD for the simulated data ----

ANOmodel <- aov(modelCPUE ~ TGroup, data = ANOdata)
summary(ANOmodel)

##              Df  Sum Sq Mean Sq F value   Pr(>F)    
## TGroup        2  267462  133731   9.671 8.21e-05 ***
## Residuals   342 4728952   13827                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TestANOPlot <- ggplot(ANOdata) +
  aes(x = TGroup, y = modelCPUE, fill = TGroup) +
  geom_boxplot()
print(TestANOPlot)

# Pairwise comparisons between the simulated groups. In the output printed
# below, all three pairwise differences are significant at the 0.05 level
# (p adj = 0.013, < 0.001, and 0.024). The simulated values come from a random
# draw, so re-check this statement whenever the simulation is re-run.
TukeyHSD(ANOmodel)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = modelCPUE ~ TGroup, data = ANOdata)
## 
## $TGroup
##                                        diff        lwr        upr     p adj
## Central Main Lake-North Main Lake -45.47622  -83.10096  -7.851469 0.0130323
## South Main Lake-North Main Lake   -92.18646 -141.75697 -42.615954 0.0000474
## South Main Lake-Central Main Lake -46.71024  -88.47797  -4.942519 0.0240024

Altering the metrics of the fake data to see how the parameters affect statistical significance

# Scenario: double the South Main Lake mean (54.9 -> 109.8) ----
set.seed(100)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize <- c(72, 218, 55)
nMean2 <- c(100.4, 90.1, 109.8)
nSD <- c(199.7, 169.8, 119.2)
ID <- seq_len(sum(nSize))

# Draw each group's values in turn (North, Central, South), truncated below
# at 0, then concatenate them into one vector.
meanDubCPUE <- unlist(Map(
  function(n, mu, sigma) {
    rtruncnorm(n = n, a = 0, b = Inf, mean = mu, sd = sigma)
  },
  nSize, nMean2, nSD
))

# One label per observation; the data frame stores it as an ordered-level factor
TGroup <- rep(nName, nSize)
ANOMeanDubdata <- data.frame(
  ID = ID,
  TGroup = factor(TGroup, levels = nName),
  meanDubCPUE = meanDubCPUE
)

# ANOVA on the doubled-mean data

ANOMeanDubmodel <- aov(meanDubCPUE ~ TGroup, data = ANOMeanDubdata)
summary(ANOMeanDubmodel)

##              Df  Sum Sq Mean Sq F value Pr(>F)  
## TGroup        2   82514   41257   2.795 0.0625 .
## Residuals   342 5047412   14759                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Boxplot of the simulated CPUE by group
ANOMeanDubPlot <- ggplot(ANOMeanDubdata) +
  aes(x = TGroup, y = meanDubCPUE, fill = TGroup) +
  geom_boxplot()
print(ANOMeanDubPlot)

# Pairwise comparisons for the doubled-mean data
TukeyHSD(ANOMeanDubmodel)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = meanDubCPUE ~ TGroup, data = ANOMeanDubdata)
## 
## $TGroup
##                                         diff       lwr       upr     p adj
## Central Main Lake-North Main Lake -37.633399 -76.50438  1.237585 0.0601281
## South Main Lake-North Main Lake   -39.548020 -90.76044 11.664400 0.1652668
## South Main Lake-Central Main Lake  -1.914621 -45.06581 41.236567 0.9940039

For the difference between the North Main Lake and South Main Lake CPUEs to become non-significant, I had to double the original South Main Lake mean (from 54.9 to 109.8).

# Scenario: shrink the South Main Lake sample size (55 -> 15) ----
set.seed(110)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize2 <- c(72, 218, 15)
nMean <- c(100.4, 90.1, 54.9)
nSD <- c(199.7, 169.8, 119.2)
ID <- seq_len(sum(nSize2))

# Draw each group's values in turn (North, Central, South), truncated below
# at 0, then concatenate them into one vector.
sizeCPUE <- unlist(Map(
  function(n, mu, sigma) {
    rtruncnorm(n = n, a = 0, b = Inf, mean = mu, sd = sigma)
  },
  nSize2, nMean, nSD
))

# One label per observation; the data frame stores it as an ordered-level factor
TGroup <- rep(nName, nSize2)
ANOSizedata <- data.frame(
  ID = ID,
  TGroup = factor(TGroup, levels = nName),
  sizeCPUE = sizeCPUE
)

# ANOVA on the altered-sample-size data

ANOSizemodel <- aov(sizeCPUE ~ TGroup, data = ANOSizedata)
summary(ANOSizemodel)

##              Df  Sum Sq Mean Sq F value  Pr(>F)   
## TGroup        2  144287   72144   5.492 0.00454 **
## Residuals   302 3967165   13136                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Boxplot of the simulated CPUE by group
ANOSizePlot <- ggplot(ANOSizedata) +
  aes(x = TGroup, y = sizeCPUE, fill = TGroup) +
  geom_boxplot()
print(ANOSizePlot)

# Pairwise comparisons for the altered-sample-size data
TukeyHSD(ANOSizemodel)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = sizeCPUE ~ TGroup, data = ANOSizedata)
## 
## $TGroup
##                                         diff        lwr         upr     p adj
## Central Main Lake-North Main Lake  -31.95289  -68.64668   4.7408994 0.1020220
## South Main Lake-North Main Lake   -103.02700 -179.64588 -26.4081134 0.0048182
## South Main Lake-Central Main Lake  -71.07411 -143.13376   0.9855457 0.0541551

For the difference between the Central Main Lake and South Main Lake CPUEs (the pair with the largest difference in sample size) to become non-significant, I had to reduce the South Main Lake sample size by 40, from 55 to 15.

Homework_07

Shelby N. Scarfo

2024-02-28

Running ANOVA and Tukey tests on my own data

Making fake dataset that has the same attributes as my real one

Altering the metrics of the fake data to see how the parameters affect statistical significance