Homework 10

1.

#################################
# FUNCTION: count0
# purpose: counter the number of zeros in a numeric vector
# input: numeric vector
# output: the number of zeros
# -------------------------------
count0 <- function(num_vec) {
  counter <- 0
  for(i in num_vec) {
    if (i==0) {
     counter <- counter + 1 
    }
  }
  return(counter)
}

Testing

count0(1:5)

## [1] 0

count0(0:5)

## [1] 1

count0(c(0, 0, 1))

## [1] 2

2.

#################################
# FUNCTION: re_count0
# purpose: counter the number of zeros in a single line of code
# input: numeric vector
# output: the number of zeros
# -------------------------------
re_count0 <- function(num_vec) {
  return(length(num_vec[num_vec==0]))
}

Testing

re_count0(1:5)

## [1] 0

re_count0(0:5)

## [1] 1

re_count0(c(0, 0, 1))

## [1] 2

3.

#################################
# FUNCTION: gen_mat
# purpose: generate a matrix in which each element is the product of the 
#          row number x the column number
# input: the number of rows and columns 
# output: desired matrix
# -------------------------------
gen_mat <- function(row, col) {
  mat <- matrix(rep(0, row*col), nrow=row, ncol=col)
  for(i in 1:row){
    for(j in 1:col){
      mat[i, j] <- i*j
    }
  }
  return(mat)
}

Testing

gen_mat(1, 1)

##      [,1]
## [1,]    1

gen_mat(4, 5)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    2    4    6    8   10
## [3,]    3    6    9   12   15
## [4,]    4    8   12   16   20

4.

a. Simulate the dataset

# create treatment groups
groups <- c(rep("group1",10), rep("group2",10), rep("group3",10))
# create response variable
z <- c(runif(min=20, max=30, n=10), 
       runif(min=40, max=70, n=10), 
       runif(min=60, max=90, n=10))
# combine into data frame
df <- data.frame(groups=groups, res=z)

b. Create the function

#################################
# FUNCTION: df_shuffle
# purpose: reshuffles the response variable, and calculates the mean 
#          of each group
# input: dataframe
# output: mean of each group in a vector of length 3
# -------------------------------
df_shuffle <- function(df) {
  # reshuffle the dataframe
  df_sim <- df
  df_sim$res <- sample(df_sim$res)
  # calculate the mean of each group
  mean <- tapply(df_sim$res, df_sim$groups, mean)
  return(mean)
}

##################################################
# function: shuffle_data
# randomize data for regression analysis
# input: 2-column data frame (x_var,y_var)
# output: 2-column data frame (x_var,y_var)
#------------------------------------------------- 
shuffle_data <- function(z) {             
  z[,2] <- sample(z[,2]) # use sample function with defaults to reshuffle
  return(z)
}

##################################################
# function: get_metric
# calculate metric for randomization test
# input: 2-column data frame for regression
# output: regression slope
#------------------------------------------------- 
get_metric <- function(z) {               
  . <- lm(z[,2]~z[,1])
  . <- summary(.)
  . <- .$coefficients[2,1]
  
  slope <- .
  return(slope)
}

##################################################
# function: get_pval
# calculate p value from simulation
# input: list of observed metric, and vector of simulated metrics
# output: lower, upper tail probability values
#------------------------------------------------- 
get_pval <- function(z) {
  p_lower <- mean(z[[2]]<=z[[1]])
  p_upper <- mean(z[[2]]>=z[[1]])
  return(c(pl=p_lower,pu=p_upper))
}

##################################################
# function: plot_ran_test
# create ggplot of histogram of simulated values
# input: list of observed metric and vector of simulated metrics
# output: saved ggplot graph
#------------------------------------------------- 
plot_ran_test <- function(z) {
  df <- data.frame(ID=seq_along(z[[2]]),sim_x=z[[2]])
  
  p1 <- ggplot(data=df) + 
    aes(x=sim_x)
  
  p1 + geom_histogram(aes(fill=I("goldenrod"),
                          color=I("black"))) +
    geom_vline(aes(xintercept=z[[1]],
                   col="blue")) 
}

c. Loop 100 times

# declare the dataframe
df_mean <- data.frame(matrix(ncol=4,nrow=0, 
                             dimnames=list(NULL, c("rep", 
                                                   "g1_mean", 
                                                   "g2_mean",
                                                   "g3_mean"))))
# repeat reshuffle 100 times
for(i in 1:100){
  df_mean[i,] <- c(i, df_shuffle(df))
}

d. Plot the results

library(ggplot2)
plot <- ggplot(data=df_mean, aes(x=g1_mean, y=..density..)) + 
  geom_histogram(color="grey60",fill="chocolate3",size=0.2, alpha=0.8) + 
  geom_histogram(aes(x=g2_mean, y=..density..),
                 color="grey60",fill="steelblue4",size=0.2, alpha=0.8) +
  geom_histogram(aes(x=g3_mean, y=..density..),
                 color="grey60",fill="forestgreen",size=0.2, alpha=0.8)
print(plot)

The reshuffled data means are normally distributed.

Plot p-value histrogram

n_sim <- 1000
x_sim <- rep(NA,n_sim) # vector of simulated slopes
df1 <- df[, 1:2]
x_obs <- get_metric(df1)
for (i in seq_len(n_sim)) {
  x_sim[i] <- get_metric(shuffle_data(df1))
}
slopes <- list(x_obs,x_sim)
get_pval(slopes)

##    pl    pu 
## 0.999 0.001

plot_ran_test(slopes)

The p-value for the standard test versus the p value estimated from randomization test are very different regardless of the number of replications. This difference represents the significant difference between two original datasets.

Homework 10

Haorui Sun

3/30/2022

1.

2.

3.

4.

a. Simulate the dataset

b. Create the function

c. Loop 100 times

d. Plot the results

Plot p-value histrogram