I am using a bottom trawling dataset that records the catch per unit effort (CPUE) of alewife in the three portions (north, central, south) of the main lake in Lake Champlain. I do not expect to see any significant differences in CPUE among these regions because they are all part of the main lake and are not geographically isolated from each other.
# Load the alewife bottom-trawl CPUE table and inspect it ----
ALW_CPUE_basin <- read.csv("ALW_CPUE_basin.csv")
# Fix the basin order to North, Central, South so tables and plots list the
# regions in that order rather than in default sorted order.
ALW_CPUE_basin$minor_basin <- factor(
  ALW_CPUE_basin$minor_basin,
  levels = c("North Main Lake", "Central Main Lake", "South Main Lake")
)
str(ALW_CPUE_basin)
# Per-column summary (basin counts and CPUE quantiles); this call produces the
# second table printed below, after the str() output.
summary(ALW_CPUE_basin)
## 'data.frame': 345 obs. of 3 variables:
## $ sampleID : chr "BOT_04132018_01" "BOT_04132018_02" "BOT_04132018_03" "BOT_04132018_04" ...
## $ minor_basin: Factor w/ 3 levels "North Main Lake",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ CPUE : num 8.528 37.5 68.903 255.989 0.667 ...
## sampleID minor_basin CPUE
## Length:345 North Main Lake : 72 Min. : 0.500
## Class :character Central Main Lake:218 1st Qu.: 4.003
## Mode :character South Main Lake : 55 Median : 20.020
## Mean : 86.624
## 3rd Qu.: 90.337
## Max. :1404.000
# Per-basin sample size, mean CPUE, and standard deviation of CPUE.
ALW_CPUE <- ALW_CPUE_basin %>%
  group_by(minor_basin) %>%
  summarise(
    count = n(),
    mean = mean(CPUE, na.rm = TRUE),
    sd = sd(CPUE, na.rm = TRUE)
  )
print(ALW_CPUE) # sample sizes, means, and standard deviations (not variances) for each group
## # A tibble: 3 × 4
## minor_basin count mean sd
## <fct> <int> <dbl> <dbl>
## 1 North Main Lake 72 100. 200.
## 2 Central Main Lake 218 90.1 170.
## 3 South Main Lake 55 55.0 119.
# One-way ANOVA of observed CPUE across the three main-lake basins ----
# NOTE(review): CPUE is strongly right-skewed in the summary printed earlier
# (median ~20 vs mean ~87, max 1404), so check the residuals before relying on
# this fit.
res.aov <- aov(CPUE ~ minor_basin, data = ALW_CPUE_basin)
summary(res.aov)
# Tukey HSD pairwise comparisons between basins; this call produces the Tukey
# table printed below the ANOVA table.
TukeyHSD(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## minor_basin 2 71360 35680 1.239 0.291
## Residuals 342 9851925 28807
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = CPUE ~ minor_basin, data = ALW_CPUE_basin)
##
## $minor_basin
## diff lwr upr p adj
## Central Main Lake-North Main Lake -10.27977 -64.58625 44.02672 0.8963867
## South Main Lake-North Main Lake -45.40800 -116.95666 26.14065 0.2950901
## South Main Lake-Central Main Lake -35.12824 -95.41458 25.15810 0.3569643
# Boxplot of observed CPUE by basin, one fill colour per basin.
ANOplot <- ggplot(
  data = ALW_CPUE_basin,
  mapping = aes(x = minor_basin, y = CPUE, fill = minor_basin)
) +
  geom_boxplot()
# Print explicitly, like the other figures in this file, so the plot is also
# drawn when the script is run via source(), where auto-printing is off.
print(ANOplot)
The ANOVA and Tukey tests show that there is no statistically significant difference (p > 0.05) in CPUE among the three regions of the main lake.
# Simulate CPUE using the group sizes, means, and SDs of the observed data ----
# Fix the RNG seed, as the later scenarios do, so the simulated results are
# reproducible. The results printed below were generated before a seed was
# set; re-run to refresh them.
set.seed(90)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize <- c(72, 218, 55)
nMean <- c(100.4, 90.1, 54.9)
nSD <- c(199.7, 169.8, 119.2)
ID <- seq_len(sum(nSize))
# rtruncnorm() with a = 0 sets the lower limit to 0 because a catch cannot be
# negative.
# NOTE(review): `mean` and `sd` describe the normal distribution before
# truncation, so with SDs this large relative to the means the simulated group
# means will sit above nMean -- confirm this is intended.
modelCPUE <- c(
  rtruncnorm(n = nSize[1], a = 0, b = Inf, mean = nMean[1], sd = nSD[1]),
  rtruncnorm(n = nSize[2], a = 0, b = Inf, mean = nMean[2], sd = nSD[2]),
  rtruncnorm(n = nSize[3], a = 0, b = Inf, mean = nMean[3], sd = nSD[3])
)
TGroup <- rep(nName, nSize)
ANOdata <- data.frame(ID, TGroup, modelCPUE)
ANOdata$TGroup <- factor(
  ANOdata$TGroup,
  levels = c("North Main Lake", "Central Main Lake", "South Main Lake")
)
str(ANOdata)
# Fit the one-way ANOVA on the simulated data. The later TukeyHSD(ANOmodel)
# call needs this object, which was otherwise never created.
ANOmodel <- aov(modelCPUE ~ TGroup, data = ANOdata)
summary(ANOmodel)
## 'data.frame': 345 obs. of 3 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ TGroup : Factor w/ 3 levels "North Main Lake",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ modelCPUE: num 314.3 349.8 146.9 46.2 185.3 ...
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 2 267462 133731 9.671 8.21e-05 ***
## Residuals 342 4728952 13827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the simulated CPUE by basin.
TestANOPlot <- ggplot(
  data = ANOdata,
  aes(x = TGroup, y = modelCPUE, fill = TGroup)
) +
  geom_boxplot()
print(TestANOPlot)
# Tukey HSD pairwise comparisons for the simulated groups. In the run printed
# below, all three pairs differ (p adj < 0.05): Central-North, South-North,
# and South-Central. This conclusion depends on the random draw, so re-check it
# whenever the simulation is re-run.
TukeyHSD(ANOmodel)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = modelCPUE ~ TGroup, data = ANOdata)
##
## $TGroup
## diff lwr upr p adj
## Central Main Lake-North Main Lake -45.47622 -83.10096 -7.851469 0.0130323
## South Main Lake-North Main Lake -92.18646 -141.75697 -42.615954 0.0000474
## South Main Lake-Central Main Lake -46.71024 -88.47797 -4.942519 0.0240024
# Scenario: South Main Lake mean doubled (54.9 -> 109.8); sizes and SDs unchanged
set.seed(100)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize <- c(72, 218, 55)
nMean2 <- c(100.4, 90.1, 109.8)
nSD <- c(199.7, 169.8, 119.2)
ID <- seq_len(sum(nSize))
# Draw each basin's CPUE in turn (North, Central, South) from a normal
# truncated below at 0, then concatenate the draws in that order.
meanDubCPUE <- unlist(
  Map(
    function(size, mu, sigma) {
      rtruncnorm(n = size, a = 0, b = Inf, mean = mu, sd = sigma)
    },
    nSize, nMean2, nSD
  ),
  use.names = FALSE
)
TGroup <- rep(nName, times = nSize)
ANOMeanDubdata <- data.frame(
  ID = ID,
  TGroup = TGroup,
  meanDubCPUE = meanDubCPUE
)
ANOMeanDubdata$TGroup <- factor(ANOMeanDubdata$TGroup, levels = nName)
# One-way ANOVA on the doubled-mean simulation
ANOMeanDubmodel <- aov(meanDubCPUE ~ TGroup, data = ANOMeanDubdata)
summary(ANOMeanDubmodel)
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 2 82514 41257 2.795 0.0625 .
## Residuals 342 5047412 14759
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the doubled-South-mean simulation by basin.
ANOMeanDubPlot <- ggplot(
  data = ANOMeanDubdata,
  aes(x = TGroup, y = meanDubCPUE, fill = TGroup)
) +
  geom_boxplot()
print(ANOMeanDubPlot)
# Tukey HSD pairwise comparisons for this scenario; this call produces the
# Tukey table printed below, which previously had no call behind it.
TukeyHSD(ANOMeanDubmodel)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = meanDubCPUE ~ TGroup, data = ANOMeanDubdata)
##
## $TGroup
## diff lwr upr p adj
## Central Main Lake-North Main Lake -37.633399 -76.50438 1.237585 0.0601281
## South Main Lake-North Main Lake -39.548020 -90.76044 11.664400 0.1652668
## South Main Lake-Central Main Lake -1.914621 -45.06581 41.236567 0.9940039
In order for there to be no statistically significant difference between the North Main Lake and South Main Lake CPUEs, I had to double the original mean of the South Main Lake.
# Scenario: South Main Lake sample size cut from 55 to 15; means and SDs unchanged
set.seed(110)
nGroup <- 3
nName <- c("North Main Lake", "Central Main Lake", "South Main Lake")
nSize2 <- c(72, 218, 15)
nMean <- c(100.4, 90.1, 54.9)
nSD <- c(199.7, 169.8, 119.2)
ID <- seq_len(sum(nSize2))
# Generate the basins one after another (North, Central, South), each from a
# normal truncated below at 0, and join the three draws end to end.
sizeCPUE <- do.call(
  c,
  lapply(seq_along(nSize2), function(g) {
    rtruncnorm(n = nSize2[g], a = 0, b = Inf, mean = nMean[g], sd = nSD[g])
  })
)
TGroup <- rep(nName, times = nSize2)
ANOSizedata <- data.frame(ID = ID, TGroup = TGroup, sizeCPUE = sizeCPUE)
ANOSizedata$TGroup <- factor(ANOSizedata$TGroup, levels = nName)
# One-way ANOVA on the reduced-sample-size simulation
ANOSizemodel <- aov(sizeCPUE ~ TGroup, data = ANOSizedata)
summary(ANOSizemodel)
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 2 144287 72144 5.492 0.00454 **
## Residuals 302 3967165 13136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the reduced-South-sample-size simulation by basin.
ANOSizePlot <- ggplot(
  data = ANOSizedata,
  aes(x = TGroup, y = sizeCPUE, fill = TGroup)
) +
  geom_boxplot()
print(ANOSizePlot)
# Tukey HSD pairwise comparisons for this scenario; this call produces the
# Tukey table printed below, which previously had no call behind it.
TukeyHSD(ANOSizemodel)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = sizeCPUE ~ TGroup, data = ANOSizedata)
##
## $TGroup
## diff lwr upr p adj
## Central Main Lake-North Main Lake -31.95289 -68.64668 4.7408994 0.1020220
## South Main Lake-North Main Lake -103.02700 -179.64588 -26.4081134 0.0048182
## South Main Lake-Central Main Lake -71.07411 -143.13376 0.9855457 0.0541551
In order for there to be no statistically significant difference between the Central Main Lake and South Main Lake CPUEs (the pair with the largest sample size difference), I had to reduce the South Main Lake sample size by 40, from 55 to 15.