Class Sample Data

#Load packages
library(ggplot2) #for graphics
library(MASS) #for maximum likelihood estimation
library(dplyr)

#Read in data vector
# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1676 obs. of  2 variables:
##  $ ID   : int  1 5 8 9 11 12 17 18 19 20 ...
##  $ myVar: num  1.411 0.319 0.784 1.618 0.993 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001614 0.366634 0.753549 0.878304 1.292016 3.995389
#Plot histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=after_stat(density))) +
  geom_histogram(color="grey60",fill="paleturquoise1",linewidth=0.2) 
print(p1)

#Add empirical density curve
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

#Get maximum likelihodd parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.87830419   0.63375506 
##  (0.01548048) (0.01094635)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.878 0.634
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0155 0.0109
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.00024 0 0 0.00012
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1676
##  $ loglik  : num -1614
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##      mean 
## 0.8783042
#Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat

#Plot exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2

#Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3

#Plot gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

#Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=after_stat(density))) +
  geom_histogram(color="grey60",fill="paleturquoise1",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

Alewife length-frequency data

Alewife length-frequency data from trawling conducted from 2015-2021

#Bringing in trawling data dn filtering by alewife, removing NAs, and selecting for only age 1 fish.

fish_lengths <- read.csv("Ind_fish_sampled_detailed.csv", header = TRUE, sep = ",")

alw_lengths_1 <- fish_lengths %>%
  filter(spec_code == "ALW") %>%
  filter(!tl_mm == "NA") %>%
  filter(tl_mm <= 120)

str(alw_lengths_1)
## 'data.frame':    1961 obs. of  15 variables:
##  $ sampleID     : chr  "BOT_04272021_01" "BOT_04272021_01" "BOT_04272021_01" "BOT_04272021_01" ...
##  $ basin        : chr  "Main Lake" "Main Lake" "Main Lake" "Main Lake" ...
##  $ minor_basin  : chr  "Central Main Lake" "Central Main Lake" "Central Main Lake" "Central Main Lake" ...
##  $ gear         : chr  "BOT" "BOT" "BOT" "BOT" ...
##  $ gear_spec    : chr  "3 in 1" "3 in 1" "3 in 1" "3 in 1" ...
##  $ spec_code    : chr  "ALW" "ALW" "ALW" "ALW" ...
##  $ tl_mm        : num  95 95 80 81 65 79 80 82 100 75 ...
##  $ wt_g         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ goodsample   : chr  "" "" "" "" ...
##  $ year         : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
##  $ month        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ date_start   : chr  "4/27/2021 0:00:00" "4/27/2021 0:00:00" "4/27/2021 0:00:00" "4/27/2021 0:00:00" ...
##  $ duration_min : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ depth_start_m: num  37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 ...
##  $ depth_mean_m : num  39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 ...
summary(alw_lengths_1)
##    sampleID            basin           minor_basin            gear          
##  Length:1961        Length:1961        Length:1961        Length:1961       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   gear_spec          spec_code             tl_mm             wt_g      
##  Length:1961        Length:1961        Min.   : 28.00   Min.   :0.750  
##  Class :character   Class :character   1st Qu.: 62.00   1st Qu.:2.750  
##  Mode  :character   Mode  :character   Median : 75.00   Median :5.020  
##                                        Mean   : 74.67   Mean   :4.357  
##                                        3rd Qu.: 86.00   3rd Qu.:5.835  
##                                        Max.   :120.00   Max.   :7.750  
##                                                         NA's   :1942   
##   goodsample             year          month         date_start       
##  Length:1961        Min.   :2016   Min.   : 4.000   Length:1961       
##  Class :character   1st Qu.:2019   1st Qu.: 5.000   Class :character  
##  Mode  :character   Median :2020   Median : 9.000   Mode  :character  
##                     Mean   :2020   Mean   : 7.953                     
##                     3rd Qu.:2021   3rd Qu.:10.000                     
##                     Max.   :2021   Max.   :11.000                     
##                                                                       
##   duration_min   depth_start_m    depth_mean_m  
##  Min.   : 8.99   Min.   :22.60   Min.   :25.00  
##  1st Qu.:10.00   1st Qu.:37.00   1st Qu.:36.25  
##  Median :10.00   Median :42.80   Median :42.60  
##  Mean   :11.03   Mean   :43.44   Mean   :42.92  
##  3rd Qu.:10.00   3rd Qu.:48.10   3rd Qu.:47.45  
##  Max.   :39.99   Max.   :97.60   Max.   :92.40  
##                                  NA's   :60
#Plot histogram of data
p1 <- ggplot(data = alw_lengths_1, aes(x = tl_mm, y = after_stat(density))) + 
  geom_histogram(color = "grey60", fill = "darkseagreen1", size = 0.2)
print(p1)

#Add empirical density curve
p1 <- p1 + geom_density(linetype = "dotted", size = 0.75)
print(p1)

#Get maximum likelihood parameters for normal
normPars <- fitdistr(alw_lengths_1$tl_mm, "normal")
print(normPars)
##       mean          sd    
##   74.6746558   16.5137192 
##  ( 0.3729118) ( 0.2636884)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 74.7 16.5
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.373 0.264
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.1391 0 0 0.0695
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1961
##  $ loglik  : num -8282
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"]
##     mean 
## 74.67466
#Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0, max(alw_lengths_1$tl_mm), len = length(alw_lengths_1$tl_mm))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour = "red", n = length(alw_lengths_1$tl_mm), args = list(mean = meanML, sd = sdML))
p1 + stat

#Plot exponential probability density
expoPars <- fitdistr(alw_lengths_1$tl_mm, "exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour = "blue", n = length(alw_lengths_1$tl_mm),args = list(rate = rateML))

p1 + stat + stat2

#Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour = "darkgreen", n = length(alw_lengths_1$tl_mm), args = list(min= min(alw_lengths_1$tl_mm), max= max(alw_lengths_1$tl_mm)))

p1 + stat + stat2 + stat3

#Plot gamma probabiliuty density
gammaPars <- fitdistr(alw_lengths_1$tl_mm, "gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour = "brown", n = length(alw_lengths_1$tl_mm), args = list(shape = shapeML, rate = rateML))

p1+ stat +stat2 + stat3 + stat4

#Plot beta probability density
pSpecial <- ggplot(data = alw_lengths_1, aes(x = tl_mm/(max(tl_mm +0.1)), y = after_stat(density)))+
  geom_histogram(color = "grey60", fill = "darkseagreen1", size = 0.2)+
  xlim(c(0,1))+
  geom_density(size = 0.75, linetype = "dotted")
betaPars <- fitdistr(x = alw_lengths_1$tl_mm/max(alw_lengths_1$tl_mm +0.1), start = list(shape1 = 1, shape2 = 2), "beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour = "orchid",n = length(alw_lengths_1$tl_mm), args = list(shape1 = shape1ML, shape2 = shape2ML))

pSpecial + statSpecial

Explanation

A normal distribution fits this data set well, it is a fairly large dataset and it consists of the total length in millimeters of alewife <= age 1 from Lake Champlain. The age classes are usually represented in distinct total length bins within a species. It makes sense that the density of the mean value of the dataset would be the highest because that is that average length of an age-1 fish and the other lengths would be less common.

Simulated Data

z <- rnorm(n = 1961, mean = 74.6746558, sd = 16.5137192)
z <- data.frame(1:1961, z)
names(z) <- list("ID", "myVar")
z <- z[z$myVar >0,]
str(z)
## 'data.frame':    1961 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  86.6 63.3 78.9 91.9 81.3 ...
summary(z$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.51   63.21   74.59   74.38   85.52  139.85
normPars <- fitdistr(z$myVar, "normal")
print(normPars)
##       mean          sd    
##   74.3837557   16.4349486 
##  ( 0.3711330) ( 0.2624306)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 74.4 16.4
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.371 0.262
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.1377 0 0 0.0689
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1961
##  $ loglik  : num -8272
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"]
##     mean 
## 74.38376
meanSim <- normPars$estimate["mean"]
sdSim <- normPars$estimate["sd"]

simVal <- seq(0, max(z$myVar), len = length(z$myVar))

p2 <- ggplot(data = z, aes(x = myVar, y = after_stat(density)))+
  geom_histogram(color = "grey60", fill = "thistle2", size = 0.2)
print(p2)

p2 <-  p2 +  geom_density(linetype="dotted",size=0.75)
print(p2)

stat_sim <- stat_function(aes(x = simVal, y = ..y..), fun = dnorm, colour = "red", n = length(z$myVar), args = list(mean = meanSim, sd = sdSim))
p2 + stat_sim

#Final figures
print(p1+stat)

print(p2+stat_sim)

The original dataset is large enough that it is representative of the population. Using the same n, the actual and simulated distributions are very similar.