Homework 9

Functions

#################################
# FUNCTION: get_data
# purpose: simulate two normal distributed datasets and return them in one
#          dataframe
# input: n, mean1, sd1, mean2, sd2
# output: dataframe
# -------------------------------
sim_data <- function(n, mean1, sd1, mean2, sd2) {
  data1 <- rnorm(n=n, mean=mean1, sd=sd1)
  data2 <- rnorm(n=n, mean=mean2, sd=sd2)
  part_ID <- seq_len(n)
  df <- data.frame(part_ID, data1, data2)
  return(df)
}

#################################
# FUNCTION: calculate_p
# purpose: calculate p-value of the two classes
# input: dataframe
# output: p_var
# -------------------------------
calculate_p <- function(df) {
  return(t.test(df$data1, df$data2)$p.value)
}

#################################
# FUNCTION: nplot
# purpose: plot the normal distribution of two dataset
# input: dataframe
# output: create graph
# -------------------------------
nplot <- function(df) {
  plot <- ggplot(data=df, aes(x=data1, y=..density..)) +
    geom_histogram(color="grey60",fill="chocolate3",size=0.2, alpha=0.8) + 
    geom_histogram(aes(x=data2, y=..density..),
                   color="grey60",fill="steelblue4",size=0.2, alpha=0.8) +
    geom_density(linetype="dotted",size=0.75) + 
    geom_density(aes(x=data2, y=..density..), linetype="dotted",size=0.75)
  print(plot)
}

#################################
# FUNCTION: bplot
# purpose: create a longder dataframe and plot the two datasets in the boxplot
# input: dataframe
# output: create graph
# -------------------------------
bplot <- function(df) {
  long_df <- pivot_longer(df, cols= data1:data2,
                          names_to = "group",
                          values_to= "data")
  boxplot(long_df$data ~ long_df$group)
}

#################################
# NEW ADDED FUNC
# FUNCTION: CI_95
# purpose: extract the 95 percent confidence interval from the t-test
# input: dataframe
# output: 95CI interval
# -------------------------------
CI_95 <- function(df) {
  return(t.test(df$data1, df$data2)$conf.int[1:2])
}

Global variables

n <- 1000   # sample size
mean1 <- -1.75
sd1 <- 0.5
mean2 <- -1.7
sd2 <- 0.5

Body

# load library ------------------------------
library(tidyverse)
# simulate data ------------------------------
df <- sim_data(n, mean1, sd1, mean2, sd2)
head(df)

##   part_ID     data1      data2
## 1       1 -2.421747 -2.2405156
## 2       2 -1.199355 -1.6381388
## 3       3 -1.248706 -1.2724664
## 4       4 -1.449940 -1.2127458
## 5       5 -1.096353 -2.1338476
## 6       6 -1.790412 -0.8382875

# ANOVA test ------------------------------
cat("The p-value of two datasets is:", calculate_p(df))

## The p-value of two datasets is: 0.1760145

cat("The 95 confidence interval is:", CI_95(df))

## The 95 confidence interval is: -0.07372101 0.01351192

# plot normal distribution ------------------------------
nplot(df)

# box plot ------------------------------
bplot(df)

Run the simulation multiple times and record the p-values

set.seed(22)
p_vars <- vector()
for(i in 1:1000){
  df <- sim_data(n, mean1, sd1, mean2, sd2)
  p_vars <- append(p_vars, calculate_p(df))
}
hist(p_vars)

Homework 9

Haorui Sun

3/23/2022

Functions

Global variables

Body

Run the simulation multiple times and record the p-values