#################################
# FUNCTION: count0
# purpose: counter the number of zeros in a numeric vector
# input: numeric vector
# output: the number of zeros
# -------------------------------
count0 <- function(num_vec) {
counter <- 0
for(i in num_vec) {
if (i==0) {
counter <- counter + 1
}
}
return(counter)
}
Testing
count0(1:5)
## [1] 0
count0(0:5)
## [1] 1
count0(c(0, 0, 1))
## [1] 2
#################################
# FUNCTION: re_count0
# purpose: counter the number of zeros in a single line of code
# input: numeric vector
# output: the number of zeros
# -------------------------------
re_count0 <- function(num_vec) {
return(length(num_vec[num_vec==0]))
}
Testing
re_count0(1:5)
## [1] 0
re_count0(0:5)
## [1] 1
re_count0(c(0, 0, 1))
## [1] 2
#################################
# FUNCTION: gen_mat
# purpose: generate a matrix in which each element is the product of the
# row number x the column number
# input: the number of rows and columns
# output: desired matrix
# -------------------------------
gen_mat <- function(row, col) {
mat <- matrix(rep(0, row*col), nrow=row, ncol=col)
for(i in 1:row){
for(j in 1:col){
mat[i, j] <- i*j
}
}
return(mat)
}
Testing
gen_mat(1, 1)
## [,1]
## [1,] 1
gen_mat(4, 5)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 4 5
## [2,] 2 4 6 8 10
## [3,] 3 6 9 12 15
## [4,] 4 8 12 16 20
# create treatment groups
groups <- c(rep("group1",10), rep("group2",10), rep("group3",10))
# create response variable
z <- c(runif(min=20, max=30, n=10),
runif(min=40, max=70, n=10),
runif(min=60, max=90, n=10))
# combine into data frame
df <- data.frame(groups=groups, res=z)
#################################
# FUNCTION: df_shuffle
# purpose: reshuffles the response variable, and calculates the mean
# of each group
# input: dataframe
# output: mean of each group in a vector of length 3
# -------------------------------
df_shuffle <- function(df) {
# reshuffle the dataframe
df_sim <- df
df_sim$res <- sample(df_sim$res)
# calculate the mean of each group
mean <- tapply(df_sim$res, df_sim$groups, mean)
return(mean)
}
##################################################
# function: shuffle_data
# randomize data for regression analysis
# input: 2-column data frame (x_var,y_var)
# output: 2-column data frame (x_var,y_var)
#-------------------------------------------------
shuffle_data <- function(z) {
z[,2] <- sample(z[,2]) # use sample function with defaults to reshuffle
return(z)
}
##################################################
# function: get_metric
# calculate metric for randomization test
# input: 2-column data frame for regression
# output: regression slope
#-------------------------------------------------
get_metric <- function(z) {
. <- lm(z[,2]~z[,1])
. <- summary(.)
. <- .$coefficients[2,1]
slope <- .
return(slope)
}
##################################################
# function: get_pval
# calculate p value from simulation
# input: list of observed metric, and vector of simulated metrics
# output: lower, upper tail probability values
#-------------------------------------------------
get_pval <- function(z) {
p_lower <- mean(z[[2]]<=z[[1]])
p_upper <- mean(z[[2]]>=z[[1]])
return(c(pl=p_lower,pu=p_upper))
}
##################################################
# function: plot_ran_test
# create ggplot of histogram of simulated values
# input: list of observed metric and vector of simulated metrics
# output: saved ggplot graph
#-------------------------------------------------
plot_ran_test <- function(z) {
df <- data.frame(ID=seq_along(z[[2]]),sim_x=z[[2]])
p1 <- ggplot(data=df) +
aes(x=sim_x)
p1 + geom_histogram(aes(fill=I("goldenrod"),
color=I("black"))) +
geom_vline(aes(xintercept=z[[1]],
col="blue"))
}
# declare the dataframe
df_mean <- data.frame(matrix(ncol=4,nrow=0,
dimnames=list(NULL, c("rep",
"g1_mean",
"g2_mean",
"g3_mean"))))
# repeat reshuffle 100 times
for(i in 1:100){
df_mean[i,] <- c(i, df_shuffle(df))
}
library(ggplot2)
plot <- ggplot(data=df_mean, aes(x=g1_mean, y=..density..)) +
geom_histogram(color="grey60",fill="chocolate3",size=0.2, alpha=0.8) +
geom_histogram(aes(x=g2_mean, y=..density..),
color="grey60",fill="steelblue4",size=0.2, alpha=0.8) +
geom_histogram(aes(x=g3_mean, y=..density..),
color="grey60",fill="forestgreen",size=0.2, alpha=0.8)
print(plot)
The reshuffled data means are normally distributed.
n_sim <- 1000
x_sim <- rep(NA,n_sim) # vector of simulated slopes
df1 <- df[, 1:2]
x_obs <- get_metric(df1)
for (i in seq_len(n_sim)) {
x_sim[i] <- get_metric(shuffle_data(df1))
}
slopes <- list(x_obs,x_sim)
get_pval(slopes)
## pl pu
## 0.999 0.001
plot_ran_test(slopes)
The p-value for the standard test versus the p value estimated from randomization test are very different regardless of the number of replications. This difference represents the significant difference between two original datasets.