library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
The data is about the mass of Steller sea lions Eumetopias jubatus at weaning from southeast Alaska.
Data citation:
Hastings, Kelly (2021), Age at weaning in Steller sea lions, Dryad, Dataset, https://doi.org/10.5061/dryad.3bk3j9kh5
# extract the column value
df <- read.table("my_dataset.csv",header=TRUE,sep=",")
z <- df$mass
z <- data.frame(1:length(z), z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.70 20.70 23.50 23.91 26.80 39.70
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 23.91308774 4.33597307
## ( 0.09546341) ( 0.06750283)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 23.91 4.34
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0955 0.0675
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00911 0 0 0.00456
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 2063
## $ loglik : num -5954
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 23.91309
normal
probability densitymeanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red",
n = length(z$myVar), args = list(mean = meanML,
sd = sdML))
p1 + stat
exponential
probability densityexpoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue",
n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
uniform
probability densitystat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif,
colour="darkgreen", n = length(z$myVar),
args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
gamma
probability densitygammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma,
colour="brown", n = length(z$myVar),
args=list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
beta
probability densitypSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),
start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta,
colour="orchid", n = length(z$myVar),
args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
The my data best fits the normal distribution, so I would like to simulate a new data set using the normal distribution.
# find the maximum likelihood parameters
mean <- normPars$estimate["mean"]
sd <- normPars$estimate["sd"]
# simulate the data
len <- length(z$myVar)
newVar <- rnorm(len, mean=mean, sd=sd)
new_z <- data.frame(1:len, newVar)
names(new_z) <- list("ID","myVar")
new_p1 <- ggplot(data=new_z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
geom_density(linetype="dotted",size=0.75)
print(new_p1)
p1
The two histogram profiles look pretty identical, so I think the model is doing a good job of simulating realistic data that match my original measurements. That is because the my realistic data is highly normal distributed.