Homework6

Fit the data distribtuions

Open libraries

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

Read in data vector

The data is about the mass of Steller sea lions Eumetopias jubatus at weaning from southeast Alaska.

Data citation:

Hastings, Kelly (2021), Age at weaning in Steller sea lions, Dryad, Dataset, https://doi.org/10.5061/dryad.3bk3j9kh5

# extract the column value
df <- read.table("my_dataset.csv",header=TRUE,sep=",")
z <- df$mass
z <- data.frame(1:length(z), z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
summary(z$myVar)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.70   20.70   23.50   23.91   26.80   39.70

Plot histogram of data

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

Add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

Get maximum likelihood parameters for `normal`

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean           sd     
##   23.91308774    4.33597307 
##  ( 0.09546341) ( 0.06750283)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 23.91 4.34
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0955 0.0675
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.00911 0 0 0.00456
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 2063
##  $ loglik  : num -5954
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##     mean 
## 23.91309

Plot `normal` probability density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", 
                      n = length(z$myVar), args = list(mean = meanML, 
                                                       sd = sdML))
p1 + stat

Plot `exponential` probability density

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue",
                       n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2

Plot `uniform` probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif,
                       colour="darkgreen", n = length(z$myVar), 
                       args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3

Plot `gamma` probability density

gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma,
                       colour="brown", n = length(z$myVar),
                       args=list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

Plot `beta` probability density

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) + 
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),
                     start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta,
                             colour="orchid", n = length(z$myVar), 
                             args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

Simulate the data distributions

The my data best fits the normal distribution, so I would like to simulate a new data set using the normal distribution.

Simulate new dataset

# find the maximum likelihood parameters
mean <- normPars$estimate["mean"]
sd <- normPars$estimate["sd"]
# simulate the data
len <- length(z$myVar)
newVar <- rnorm(len, mean=mean, sd=sd)
new_z <- data.frame(1:len, newVar)
names(new_z) <- list("ID","myVar")

Plot the new data

new_p1 <- ggplot(data=new_z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
  geom_density(linetype="dotted",size=0.75)
print(new_p1)

Compare with the original data

p1

The two histogram profiles look pretty identical, so I think the model is doing a good job of simulating realistic data that match my original measurements. That is because the my realistic data is highly normal distributed.

Homework6

Haorui Sun

2/23/2022

Fit the data distribtuions

Open libraries

Read in data vector

Plot histogram of data

Add empirical density curve

Get maximum likelihood parameters for `normal`

Plot `normal` probability density

Plot `exponential` probability density

Plot `uniform` probability density

Plot `gamma` probability density

Plot `beta` probability density

Simulate the data distributions

Simulate new dataset

Plot the new data

Compare with the original data

Homework6

Haorui Sun

2/23/2022

Fit the data distribtuions

Open libraries

Read in data vector

Plot histogram of data

Add empirical density curve

Get maximum likelihood parameters for normal

Plot normal probability density

Plot exponential probability density

Plot uniform probability density

Plot gamma probability density

Plot beta probability density

Simulate the data distributions

Simulate new dataset

Plot the new data

Compare with the original data

Get maximum likelihood parameters for `normal`

Plot `normal` probability density

Plot `exponential` probability density

Plot `uniform` probability density

Plot `gamma` probability density

Plot `beta` probability density