#Load packages
library(ggplot2) #for graphics
library(MASS) #for maximum likelihood estimation
library(dplyr)
#Read in data vector
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1676 obs. of 2 variables:
## $ ID : int 1 5 8 9 11 12 17 18 19 20 ...
## $ myVar: num 1.411 0.319 0.784 1.618 0.993 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001614 0.366634 0.753549 0.878304 1.292016 3.995389
#Plot histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=after_stat(density))) +
geom_histogram(color="grey60",fill="paleturquoise1",linewidth=0.2)
print(p1)
#Add empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
#Get maximum likelihodd parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.87830419 0.63375506
## (0.01548048) (0.01094635)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.878 0.634
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0155 0.0109
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00024 0 0 0.00012
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1676
## $ loglik : num -1614
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8783042
#Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
#Plot exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
#Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
#Plot gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
#Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=after_stat(density))) +
geom_histogram(color="grey60",fill="paleturquoise1",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
#Bringing in trawling data dn filtering by alewife, removing NAs, and selecting for only age 1 fish.
fish_lengths <- read.csv("Ind_fish_sampled_detailed.csv", header = TRUE, sep = ",")
alw_lengths_1 <- fish_lengths %>%
filter(spec_code == "ALW") %>%
filter(!tl_mm == "NA") %>%
filter(tl_mm <= 120)
str(alw_lengths_1)
## 'data.frame': 1961 obs. of 15 variables:
## $ sampleID : chr "BOT_04272021_01" "BOT_04272021_01" "BOT_04272021_01" "BOT_04272021_01" ...
## $ basin : chr "Main Lake" "Main Lake" "Main Lake" "Main Lake" ...
## $ minor_basin : chr "Central Main Lake" "Central Main Lake" "Central Main Lake" "Central Main Lake" ...
## $ gear : chr "BOT" "BOT" "BOT" "BOT" ...
## $ gear_spec : chr "3 in 1" "3 in 1" "3 in 1" "3 in 1" ...
## $ spec_code : chr "ALW" "ALW" "ALW" "ALW" ...
## $ tl_mm : num 95 95 80 81 65 79 80 82 100 75 ...
## $ wt_g : num NA NA NA NA NA NA NA NA NA NA ...
## $ goodsample : chr "" "" "" "" ...
## $ year : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
## $ month : int 4 4 4 4 4 4 4 4 4 4 ...
## $ date_start : chr "4/27/2021 0:00:00" "4/27/2021 0:00:00" "4/27/2021 0:00:00" "4/27/2021 0:00:00" ...
## $ duration_min : num 10 10 10 10 10 10 10 10 10 10 ...
## $ depth_start_m: num 37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5 ...
## $ depth_mean_m : num 39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 39.2 ...
summary(alw_lengths_1)
## sampleID basin minor_basin gear
## Length:1961 Length:1961 Length:1961 Length:1961
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## gear_spec spec_code tl_mm wt_g
## Length:1961 Length:1961 Min. : 28.00 Min. :0.750
## Class :character Class :character 1st Qu.: 62.00 1st Qu.:2.750
## Mode :character Mode :character Median : 75.00 Median :5.020
## Mean : 74.67 Mean :4.357
## 3rd Qu.: 86.00 3rd Qu.:5.835
## Max. :120.00 Max. :7.750
## NA's :1942
## goodsample year month date_start
## Length:1961 Min. :2016 Min. : 4.000 Length:1961
## Class :character 1st Qu.:2019 1st Qu.: 5.000 Class :character
## Mode :character Median :2020 Median : 9.000 Mode :character
## Mean :2020 Mean : 7.953
## 3rd Qu.:2021 3rd Qu.:10.000
## Max. :2021 Max. :11.000
##
## duration_min depth_start_m depth_mean_m
## Min. : 8.99 Min. :22.60 Min. :25.00
## 1st Qu.:10.00 1st Qu.:37.00 1st Qu.:36.25
## Median :10.00 Median :42.80 Median :42.60
## Mean :11.03 Mean :43.44 Mean :42.92
## 3rd Qu.:10.00 3rd Qu.:48.10 3rd Qu.:47.45
## Max. :39.99 Max. :97.60 Max. :92.40
## NA's :60
#Plot histogram of data
p1 <- ggplot(data = alw_lengths_1, aes(x = tl_mm, y = after_stat(density))) +
geom_histogram(color = "grey60", fill = "darkseagreen1", size = 0.2)
print(p1)
#Add empirical density curve
p1 <- p1 + geom_density(linetype = "dotted", size = 0.75)
print(p1)
#Get maximum likelihood parameters for normal
normPars <- fitdistr(alw_lengths_1$tl_mm, "normal")
print(normPars)
## mean sd
## 74.6746558 16.5137192
## ( 0.3729118) ( 0.2636884)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 74.7 16.5
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.373 0.264
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.1391 0 0 0.0695
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1961
## $ loglik : num -8282
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"]
## mean
## 74.67466
#Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0, max(alw_lengths_1$tl_mm), len = length(alw_lengths_1$tl_mm))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour = "red", n = length(alw_lengths_1$tl_mm), args = list(mean = meanML, sd = sdML))
p1 + stat
#Plot exponential probability density
expoPars <- fitdistr(alw_lengths_1$tl_mm, "exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour = "blue", n = length(alw_lengths_1$tl_mm),args = list(rate = rateML))
p1 + stat + stat2
#Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour = "darkgreen", n = length(alw_lengths_1$tl_mm), args = list(min= min(alw_lengths_1$tl_mm), max= max(alw_lengths_1$tl_mm)))
p1 + stat + stat2 + stat3
#Plot gamma probabiliuty density
gammaPars <- fitdistr(alw_lengths_1$tl_mm, "gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour = "brown", n = length(alw_lengths_1$tl_mm), args = list(shape = shapeML, rate = rateML))
p1+ stat +stat2 + stat3 + stat4
#Plot beta probability density
pSpecial <- ggplot(data = alw_lengths_1, aes(x = tl_mm/(max(tl_mm +0.1)), y = after_stat(density)))+
geom_histogram(color = "grey60", fill = "darkseagreen1", size = 0.2)+
xlim(c(0,1))+
geom_density(size = 0.75, linetype = "dotted")
betaPars <- fitdistr(x = alw_lengths_1$tl_mm/max(alw_lengths_1$tl_mm +0.1), start = list(shape1 = 1, shape2 = 2), "beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour = "orchid",n = length(alw_lengths_1$tl_mm), args = list(shape1 = shape1ML, shape2 = shape2ML))
pSpecial + statSpecial
A normal distribution fits this data set well, it is a fairly large dataset and it consists of the total length in millimeters of alewife <= age 1 from Lake Champlain. The age classes are usually represented in distinct total length bins within a species. It makes sense that the density of the mean value of the dataset would be the highest because that is that average length of an age-1 fish and the other lengths would be less common.
z <- rnorm(n = 1961, mean = 74.6746558, sd = 16.5137192)
z <- data.frame(1:1961, z)
names(z) <- list("ID", "myVar")
z <- z[z$myVar >0,]
str(z)
## 'data.frame': 1961 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ myVar: num 86.6 63.3 78.9 91.9 81.3 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.51 63.21 74.59 74.38 85.52 139.85
normPars <- fitdistr(z$myVar, "normal")
print(normPars)
## mean sd
## 74.3837557 16.4349486
## ( 0.3711330) ( 0.2624306)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 74.4 16.4
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.371 0.262
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.1377 0 0 0.0689
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1961
## $ loglik : num -8272
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"]
## mean
## 74.38376
meanSim <- normPars$estimate["mean"]
sdSim <- normPars$estimate["sd"]
simVal <- seq(0, max(z$myVar), len = length(z$myVar))
p2 <- ggplot(data = z, aes(x = myVar, y = after_stat(density)))+
geom_histogram(color = "grey60", fill = "thistle2", size = 0.2)
print(p2)
p2 <- p2 + geom_density(linetype="dotted",size=0.75)
print(p2)
stat_sim <- stat_function(aes(x = simVal, y = ..y..), fun = dnorm, colour = "red", n = length(z$myVar), args = list(mean = meanSim, sd = sdSim))
p2 + stat_sim
#Final figures
print(p1+stat)
print(p2+stat_sim)
The original dataset is large enough that it is representative of the population. Using the same n, the actual and simulated distributions are very similar.