# LLN simulation - https://rstudio-pubs-static.s3.amazonaws.com/202144_71c017d6692c47d885f452b41a89f100.html
# STAT 753
# January 30, 2020

# Initialize parameters
lambda = .02
mean   = 1/lambda # expected value of an exponential random variable with parameter lambda
sigma  = 1/lambda # standard deviation
n      = 1000 # max sample size
semean = sigma/sqrt(n) # standard error of the mean (SD of sample mean)
popvar = sigma^2 # population variance


# Generate a vector of n random exponentials
x = rexp(n,lambda)

# Vector of cumulative sums
sx = cumsum(x)

# Vector of sample means
meanx = sx/1:n

# Compare sample mean vs theoretical mean, plot sample mean as a function of sample size (observations)
plot(meanx, main = "Law of Large Numbers Illustration", xlab = "Number of observations (random exponentials)", 
     ylab = "Average", type = "l", lwd = 2)

# Expected value line
points(seq(1,1000,by=1), rep(mean,1000), type="l", col="red", lwd=2)

# Legend
legend('topright', c("Observed averages", "Theoretical mean"), col=c("black", "red"), lty=c(1,1))


# The code above is one replicate, one could generate many replicates (via a for loop) and get a distribution 
# of sample means to show that the central limit theorem holds.




#################################### Nonstandard Example ####################################### 
#install.packages("ggplot2")
library(ggplot2)

set.seed(10109) # use this so you can repeat the same simulation
n = 40 # number of samples drawn

# Generate vectors of length s (# of simulations) of sample means and standard deviations (n random exponentials)
mns    <- NULL
for (i in 1:s) mns = c(mns, mean(rexp(n,lambda)))
stdev   <- NULL
for (i in 1:s) stdev = c(stdev, sd(rexp(n,lambda)))


# Compare sample mean vs theoretical mean, plot cumulative mean as a function of # of simulations (observations)
means <- cumsum(mns)/(1:s)
g <- ggplot(data.frame(x=1:s, y=means), aes(x=x, y=y))
g <- g + geom_hline(yintercept = mean) + geom_line(size=2)
g <- g + labs(x = "Number of Observations", y= "Cumulative Mean")
g


# Compare sample mean vs theoretical mean, plot cumulative variance as a function of # of simulations (observations)
variances = cumsum(stdev^2)/(1:s)
g <- ggplot(data.frame(x=1:s, y=variances), aes(x=x, y=y))
g <- g + geom_hline(yintercept = popvar) + geom_line(size=2)
g <- g + labs(x = "Number of Observations", y= "Cumulative Variance")
g


# Plot histogram of sample means
par(mfrow=c(1,1))
hist(mns, breaks=20, main = "Means of 1000 Exponential Samples",
     sub = "Output of Simulations with n = 40", 
     xlab = "Theoretical Mean = 1/lambda = 50", 
     cex.main=0.75, cex.sub=0.75, cex.lab=0.75, cex.axis=0.75, col = "ivory")
abline(v=mean, col="red", lw="2")
abline(v=mean+sigma/sqrt(n), col="dark green", lw="2")
abline(v=mean-sigma/sqrt(n), col="dark green", lw="2")


# Compare normalized sample distribution to standard normal distribution (mean=0, sd=1)
samplingmean <- mean(mns)
semean       <- sigma/sqrt(n)
normalizedsample <- (mns-samplingmean)/semean

mean(normalizedsample)
sd(normalizedsample)

# Plot histogram of normalized sample and compare to normal density function
hist(normalizedsample, breaks=20, freq=FALSE, main = "Histogram of Normalized Sample", 
     xlab = "Normalized Sample", col = "light blue")

# Density of the normalized sample means
lines(density(normalizedsample),col="blue", lty=2, lwd =2)

# Normal density
xfit <- seq(min(normalizedsample), max(normalizedsample), length=100)
yfit <- dnorm(xfit, mean=0, sd=1)
lines(xfit, yfit, pch=22, col="red", lty=2, lwd=2)

# Legend
legend('topright', c("Simulation", "Theoretical"), 
       col=c("blue", "red"), lty=c(1,1))
