##################################################
# function: file_builder
# create a set of random files for regression
# input: df = source data frame; rows are sampled (with
# : replacement) to build each batch file
# : file_n = number of files to create
# : file_folder = name of folder for random files
# : (must already exist; trailing "/" expected)
# : file_size = c(min,max) number of rows in file
# : file_na = number on average of NA values per column
# : NOTE(review): currently unused — kept so existing
# : callers passing it keep working
# output: set of random files on disk; returns invisible NULL
#-------------------------------------------------
file_builder <- function(df,
                         file_n = 10,
                         file_folder = "RandomFiles/",
                         file_size = c(15, 100),
                         file_na = 3) {
  for (i in seq_len(file_n)) {
    # number of rows for this batch file
    file_length <- sample(file_size[1]:file_size[2], size = 1)
    # sample row indices from the ORIGINAL data frame each pass;
    # bug fix: the old code overwrote df, so files 2..n were
    # resampled from the previous sample instead of the source data
    indices <- sample(seq_len(nrow(df)), size = file_length, replace = TRUE)
    batch_df <- df[indices, ]
    # label with zero-padded counter, e.g. ranFile001.csv
    file_label <- paste0(file_folder,
                         "ranFile",
                         formatC(i, width = 3, format = "d", flag = "0"),
                         ".csv")
    # write minimal metadata header (comment lines + blank line),
    # then the data, through one open connection; this replaces the
    # old write.table(cat(...)) idiom (cat() returns NULL, and the
    # row.names=""/col.names="" args leaked into cat's ...), and
    # writing to a connection avoids the append/col.names warning
    con <- file(file_label, open = "wt")
    cat("# Simulated random data file for batch processing", "\n",
        "# timestamp: ", as.character(Sys.time()), "\n",
        "# HS", "\n",
        "# ------------------------", "\n",
        "\n",
        file = con,
        sep = "")
    write.table(x = batch_df,
                file = con,
                sep = ",",
                row.names = FALSE)
    close(con)
  }
  invisible(NULL)
}
##################################################
# function: reg_stats
# fits a simple linear regression and extracts statistics
# input: d = 2-column data frame (column 1 = x, column 2 = y)
# output: list with slope, p-value, and r2 of the fit
#-------------------------------------------------
reg_stats <- function(d) {
  reg_model <- lm(data = d, d[, 2] ~ d[, 1])
  reg_summary <- summary(reg_model)
  # slope row of the coefficient table: estimate and its p-value
  list(slope = reg_summary$coefficients[2, 1],
       p_val = reg_summary$coefficients[2, 4],
       r2 = reg_summary$r.squared)
}
I'm using a teen birth rate and poverty level dataset for this batch processing exercise.
#--------------------------------------------
# define variables
file_folder <- "RandomFiles/"
n_files <- 10
file_out <- "StatsSummary.csv"
# teen birth rate vs. poverty level data (tab-delimited text from the web)
df <- read.delim("https://online.stat.psu.edu/stat462/sites/onlinecourses.science.psu.edu.stat462/files/data/poverty/index.txt")
#--------------------------------------------
# Create 10 batch datasets
# guard: dir.create() emits a warning when the folder already exists
if (!dir.exists(file_folder)) {
  dir.create(file_folder)
}
file_builder(df = df, file_n = n_files)
file_names <- list.files(path = file_folder)
# Create data frame to hold file summary statistics.
# Size the frame by length(file_names) (not n_files) so it stays
# consistent if the folder holds a different number of files, and
# use NA_real_ so the stat columns are numeric from the start
# instead of being coerced from logical on first assignment.
stats_out <- data.frame(ID = seq_along(file_names),
                        file_name = file_names,
                        slope = NA_real_,
                        p_val = NA_real_,
                        r2 = NA_real_)
# batch process by looping through individual files
for (i in seq_along(file_names)) {
  # read in the next batch file; its "#" metadata header lines are
  # skipped by read.table's default comment.char
  data <- read.table(file = paste0(file_folder, file_names[i]),
                     sep = ",",
                     header = TRUE)
  # regress column 3 on column 2 of the batch file
  reg_out <- reg_stats(data[, 2:3])
  # copy slope, p_val, r2 into the last three columns of row i
  stats_out[i, 3:5] <- unlist(reg_out)
}
# set up output file: time stamp and minimal metadata header,
# then the summary data frame, written through one connection.
# (Replaces the old write.table(cat(...)) idiom: cat() returns NULL,
# so write.table received nothing, and the row.names=""/col.names=""
# arguments were silently absorbed into cat's ... and printed.)
con <- file(file_out, open = "wt")
cat("# Summary stats for ",
    "batch processing of regression models", "\n",
    "# timestamp: ", as.character(Sys.time()), "\n",
    "# HS", "\n",
    "# ------------------------", "\n",
    "\n",
    file = con,
    sep = "")
# now add the data frame; writing to an open connection also avoids
# the "appending column names to file" warning from append=TRUE
write.table(x = stats_out,
            file = con,
            row.names = FALSE,
            col.names = TRUE,
            sep = ",")
close(con)
stats_out
## ID file_name slope p_val r2
## 1 1 ranFile001.csv 1.468546 3.947391e-10 0.5252787
## 2 2 ranFile002.csv 1.529764 5.624806e-13 0.5644282
## 3 3 ranFile003.csv 1.505516 6.400799e-15 0.6100126
## 4 4 ranFile004.csv 1.470126 3.850472e-16 0.6308372
## 5 5 ranFile005.csv 1.106098 2.945379e-05 0.6097926
## 6 6 ranFile006.csv 1.142658 2.380715e-18 0.5737284
## 7 7 ranFile007.csv 1.137407 5.159337e-16 0.5959407
## 8 8 ranFile008.csv 1.189003 1.240810e-11 0.5896295
## 9 9 ranFile009.csv 1.149548 2.167843e-14 0.5840933
## 10 10 ranFile010.csv 1.181417 8.686496e-10 0.6425759