Chapter 1.2.5.3 Exercises
In [5]:
# Chi-squared density and CDF basics.
# FIX: use library() rather than require() for unconditional loading --
# require() merely warns and returns FALSE when the package is missing,
# silently letting later code fail.
library(graphics)
dchisq(1, df = 1:3)            # density at x = 1 for df = 1, 2, 3
pchisq(1, df = 3)              # P(X <= 1) for a central chi^2(3)
pchisq(1, df = 3, ncp = 0:4)   # noncentral CDFs; ncp = 0 includes the above
In [7]:
# non-central RNG -- df = 0 with ncp > 0: Z0 has point mass at 0!
Z0 <- rchisq(100, df = 0, ncp = 2.)
graphics::stem(Z0)
In [8]:
# visual testing
# do P-P plots for 1000 points at various degrees of freedom
# L = noncentrality parameter, pp = theoretical uniform quantiles
L <- 1.2; n <- 1000; pp <- ppoints(n)
# 3x3 grid of panels; old graphics settings saved in 'op' and restored below
op <- par(mfrow = c(3,3), mar = c(3,3,1,1)+.1, mgp = c(1.5,.6,0),
oma = c(0,0,3,0))
# nine random df values spanning several orders of magnitude (2^(4*N(0,1)))
for(df in 2^(4*rnorm(9))) {
# if pchisq/rchisq are consistent, the sorted probability transforms
# should lie on the diagonal of the P-P plot
plot(pp, sort(pchisq(rr <- rchisq(n, df = df, ncp = L), df = df, ncp = L)),
ylab = "pchisq(rchisq(.),.)", pch = ".")
mtext(paste("df = ", formatC(df, digits = 4)), line = -2, adj = 0.05)
abline(0, 1, col = 2)  # reference diagonal y = x
}
# overall title in the outer margin
mtext(expression("P-P plots : Noncentral "*
chi^2 *"(n=1000, df=X, ncp= 1.2)"),
cex = 1.5, font = 2, outer = TRUE)
par(op)  # restore previous graphics settings
In [2]:
# We have already seen above that Z^2 has an estimated pdf similar to that of a χ2 rv. So, let’s show via estimating pdfs that the sum of a χ2 with 2 df and a χ2 with 3 df is a χ2 with 5 df.
chi2Data <- replicate(10000, rchisq(1,2) + rchisq(1,3))
# theoretical density of chi^2(5) for comparison against the kernel estimate
f <- function(x) dchisq(x, df = 5)
library(ggplot2)
ggplot(data.frame(chi2Data), aes(x = chi2Data)) +
geom_density() +
stat_function(fun = f, color = "red")
In [4]:
# we estimate the density of ((n-1)/σ2)*S^2 and compare it to that of a χ2 with n−1 degrees of freedom, in the case that n=4, μ=3 and σ=9.
# 3/81 = (n-1)/sigma^2 with n = 4, sigma = 9; sd()^2 is the sample variance S^2
sdData <- replicate(10000, 3/81 * sd(rnorm(4, 3, 9))^2)
f <- function(x) dchisq(x, df = 3)
ggplot(data.frame(sdData), aes(x = sdData)) +
geom_density() +
stat_function(fun = f, color = "red")
In [2]:
# Let X1,…,X30 be Poisson random variables with rate 2. From our knowledge of the Poisson distribution, each Xi has mean μ=2 and standard deviation σ=sqrt{2}.
# The central limit theorem then says that (X¯− 2) / (sqrt{2}/sqrt{30}) is approximately normal with mean 0 and standard deviation 1. Let us check this with a simulation.
# This is a little bit more complicated than our previous examples, but the idea is still the same. We create an experiment which computes X¯ and then transforms by subtracting 2 and dividing by sqrt{2}/sqrt{30}.
# Here is a single experiment:
xbar <- mean(rpois(30,2))
# standardize: subtract the true mean, divide by the standard error
(xbar - 2)/(sqrt(2)/sqrt(30))
In [2]:
xbar <- replicate(10000, mean(rpois(30,2))) # generate 10000 samples of xbar
poisData <- (xbar - 2)/(sqrt(2)/sqrt(30))   # standardized sample means
library(ggplot2)
# estimated density (black) vs the standard normal density (red)
ggplot(data.frame(poisData), aes(x = poisData)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
# Very close!
In [3]:
# Let X1,…,X50 be exponential random variables with rate 1/3. From our knowledge of the exponential distribution, each Xi has mean μ=3 and standard deviation σ=3. The central limit theorem then says that (X¯−3) / (3/sqrt{50}) is approximately normal with mean 0 and standard deviation 1. We check this with a simulation.
xbar <- replicate(10000, mean(rexp(50,1/3))) # generate 10000 samples of xbar
expData <- (xbar - 3)/(3/sqrt(50))           # standardized sample means
ggplot(data.frame(expData), aes(x = expData)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [8]:
# A distribution for which sample size of n=500 is not sufficient for good approximation via normal distributions:
# We create a distribution that consists primarily of zeros, but has a few modest sized values and a few large values:
# NOTE(review): rep(0, 2000) is the idiomatic form of replicate(2000, 0)
skewdata <- replicate(2000,0) # Start with lots of zeros
skewdata <- c(skewdata, rexp(200,1/10)) # Add a few moderately sized values
skewdata <- c(skewdata, seq(100000,500000,50000)) # Add a few large values
# population mean and sd of this fixed, finite "distribution"
mu <- mean(skewdata)
sig <- sd(skewdata)
# We use sample to take a random sample of size n from this distribution. We take the mean, subtract the true mean of the distribution, and divide by σ/sqrt{n}. We replicate that 10000 times. Here are the plots when X¯ is the mean of 100 samples:
n <- 100
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))  # standardized means
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [9]:
# and 500 samples:
n <- 500
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [10]:
# Even at 500 samples, the density is still far from the normal distribution, especially the lack of left tail. Of course, the Central Limit Theorem is still true, so X¯ must become normal if we choose n large enough:
n <- 5000
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
# There may still be a slight skewness in this picture, but we are now very close to being normal.
In [6]:
# Estimate via simulation the pdf of log(|Z|) when Z is standard normal.
expData <- log(abs(rnorm(10000)))
ggplot(data.frame(expData), aes(x = expData)) + geom_density()
In [7]:
# Both of the above worked pretty well, but there are issues when your pdf has discontinuities. Let’s see.
# Estimate the pdf of X when X is uniform on the interval [−1,1].
unifData <- runif(10000,-1,1)
ggplot(data.frame(unifData), aes(x = unifData)) + geom_density()
# This distribution should be exactly 0.5 between -1 and 1, and zero everywhere else. The simulation is reasonable in the middle, but the discontinuities at the endpoints are not shown well. Increasing the number of data points helps, but does not fix the problem.
In [8]:
# Estimate the pdf of X^2 when X is uniform on the interval [-1,1].
X2data <- runif(10000,-1,1)^2
ggplot(data.frame(X2data), aes(x = X2data)) + geom_density()
In [9]:
# Estimate the pdf of X + Y when X and Y are exponential random variables with rate 2.
X <- rexp(10000,2)
Y <- rexp(10000,2)
sumData <- X + Y
ggplot(data.frame(sumData), aes(x = sumData)) + geom_density()
In [10]:
# Estimate the density of X1 + X2 + ⋯ + X20 when all of the Xi are exponential random variables with rate 2.
# This one is trickier and the first one for which we need to use replicate to create the data. Let’s build up the experiment that we are replicating from the ground up.
# Here’s the experiment of summing 20 exponential rvs with rate 2:
sum(rexp(20,2))
In [12]:
# Now, we replicate it (10 times to test):
replicate(10, sum(rexp(20,2)))
In [14]:
# Of course, we don’t want to just replicate it 10 times, we need about 10,000 data points to get a good density estimate.
sumExpData <- replicate(10000, sum(rexp(20,2)))
ggplot(data.frame(sumExpData), aes(x = sumExpData)) + geom_density()
# Note that this is starting to look like a normal random variable!!
In [8]:
# NOTE(review): 'sumxs' is not defined anywhere in this file -- presumably
# left over from a deleted cell; running this line as-is will error. Confirm
# what data it was meant to plot (likely sumExpData above).
ggplot(data.frame(sumxs), aes(x = sumxs)) + geom_density()
In [1]:
# if we wanted to estimate the probability that X = 2, for example, we would use
# (X = sum of two fair dice; the indicator's mean estimates P(X = 2) = 1/36)
mean(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll) == 2}))
In [4]:
# indicates that there are 6 occurrences of “1”, 2 occurrences of “2”, and 1 occurrence each of “3” and “5”. To apply this to the die rolling, we create a vector of length 10,000 that has all of the observances of the rv X, in the following way:
set.seed(1)  # fixed seed so the tabulated counts are reproducible
table(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll)}))
In [11]:
# Suppose fifty randomly chosen people are in a room. Let X denote the number of people in the room who have the same birthday as someone else in the room. Estimate the pmf of X.
# We will simulate birthdays by taking a random sample from 1:365 and storing it in a vector. The tricky part is counting the number of elements in the vector that are repeated somewhere else in the vector. We will create a table and add up all of the values that are bigger than 1. Like this:
birthdays <- sample(1:365, 50, replace = TRUE)
table(birthdays)
In [ ]:
# Now, we put that inside replicate, just like in the previous example.
set.seed(1)
# each replicate counts the people whose birthday appears more than once;
# dividing the table by 10000 turns counts into relative frequencies (pmf)
birthdayTable <- table(replicate(10000, {birthdays <- sample(1:365, 50, replace = TRUE); sum(table(birthdays)[table(birthdays) > 1]) }))/10000
birthdayTable
In [18]:
# Now, we just need to table it like we did before.
# random walk: 50 up-steps and 50 down-steps shuffled; the statistic is the
# first time the cumulative sum returns to 0
table(replicate(10000, {movements <- sample(c(replicate(50,1),replicate(50,-1)), 100, FALSE); min(which(cumsum(movements) == 0)) }))/10000
In [3]:
# Let Z be a standard normal rv. Estimate P(Z^2 > 1).
z <- rnorm(10000)
# FIX: the original computed mean(z^2 < 1), i.e. P(Z^2 < 1) -- the complement
# of the quantity asked for (the trailing "# a typo here?" comment flagged
# exactly this). True value: P(Z^2 > 1) = P(|Z| > 1) = 2*pnorm(-1) ~ 0.3173.
mean(z^2 > 1)
In [4]:
# Estimate the mean and standard deviation of Z^2.
# (Z^2 is chi^2(1): true mean 1, true sd sqrt(2).)
z <- rnorm(10000)
mean(z^2)
sd(z^2)
In [5]:
# You can see the estimate is pretty close to the correct value of 1, but not exactly 1. However, if we repeat this experiment 10,000 times and take the average value of x¯, that should be very close to 1.
mean(replicate(10000, mean(rnorm(5,1,1))))
# It can be useful to repeat this a few times to see whether it seems to be consistently overestimating or underestimating the mean. In this case, it is not.
In [23]:
# For the sake of completeness, let’s show that 1/n ∑i=1 n (xi−x¯)^2 is biased.
# FIX: use '<-' for assignment inside the replicated expression; '=' works at
# statement level but '<-' is the R convention (and what lintr enforces).
# Expected value of the estimator is (n-1)/n * sigma^2 = 4/5 * 4 = 3.2.
mean(replicate(10000, {data <- rnorm(5,1,2); 1/5*sum((data - mean(data))^2)}))
# If you run the above code several times, you will see that 1/n ∑i=1 n (xi−x¯)^2 tends to significantly underestimate the value of σ^2 = 4.
In [2]:
# Estimate the pdf of (X¯−μ) / (S/sqrt{n}) in the case that X1,…,X6 are iid normal with mean 3 and standard deviation 4, and compare it to the pdf of the appropriate t random variable.
library(ggplot2)
# studentized mean of 6 normals: should follow a t distribution with 5 df
tData <- replicate(10000, {normData <- rnorm(6,3,4); (mean(normData) - 3)/(sd(normData)/sqrt(6))})
f <- function(x) dt(x, df = 5)
ggplot(data.frame(tData), aes(x = tData)) +
geom_density() +
stat_function(fun = f, color = "red")
# a very good correlation.
In [3]:
# Compare the pdf of 2Z, where Z∼Normal(0,1) to the pdf of a normal random variable with mean 0 and standard deviation 2.
# We already saw how to estimate the pdf of 2Z, we just need to plot the pdf of Normal(0,2) on the same graph.
# The pdf f(x) of Normal(0,2) is given in R by the function f(x)=dnorm(x,0,2). We can add the graph of a function to a ggplot using the stat_function command.
library(ggplot2)
zData <- 2 * rnorm(10000)
f <- function(x) dnorm(x,0,2)
ggplot(data.frame(zData), aes(x = zData)) +
geom_density() +
stat_function(fun = f,color="red")
# Those look really close!
In [4]:
# Let Z be a standard normal rv. Find the pdf of Z^2 and compare it to the pdf of a χ2 (chi-squared) rv with one degree of freedom. The distname for a χ2 rv is chisq, and rchisq requires the degrees of freedom to be specified using the df parameter.
# We plot the pdf of Z^2 and the chi-squared rv with one degree of freedom on the same plot. χ2 with one df has a vertical asymptote at x=0, so we need to explicitly set the y-axis scale:
zData <- rnorm(10000)^2
f <- function(x) dchisq(x, df=1)
ggplot(data.frame(zData), aes(x = zData)) +
coord_cartesian(ylim = c(0,1.2)) + # set y-axis scale
geom_density() +
stat_function(fun = f, color = "red")
# As you can see, the estimated density follows the true density quite well. This is evidence that Z^2 is, in fact, chi-squared.
In [1]:
# student t
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
1 - pt(1:5, df = 1)                        # upper-tail probabilities, df = 1
qt(.975, df = c(1:10,20,50,100,1000))      # 97.5% quantiles -> approach 1.96
# grid of t values and noncentrality parameters for the surface plots below
tt <- seq(0, 10, len = 21)
ncp <- seq(0, 6, len = 31)
ptn <- outer(tt, ncp, function(t, d) pt(t, df = 3, ncp = d))
t.tit <- "Non-central t - Probabilities"
image(tt, ncp, ptn, zlim = c(0,1), main = t.tit)
persp(tt, ncp, ptn, zlim = 0:1, r = 2, phi = 20, theta = 200, main = t.tit,
xlab = "t", ylab = "non-centrality parameter",
zlab = "Pr(T <= t)")
plot(function(x) dt(x, df = 3, ncp = 2), -3, 11, ylim = c(0, 0.32),
main = "Non-central t - Density", yaxs = "i")
In [1]:
# Suppose 100 dice are thrown. What is the probability of observing 10 or
# fewer sixes? The dice are independent and P(six) = 1/6, so the count of
# sixes is Binomial(100, 1/6). Two equivalent computations: summing the pmf
# over 0..10, and evaluating the CDF directly.
print(sum(dbinom(x = 0:10, size = 100, prob = 1/6)))
print(pbinom(q = 10, size = 100, prob = 1/6))
In [1]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# extreme outliers give IQR = 0, stressing the bandwidth selector
plot(density(c(-20, rep(0,98), 20)), xlim = c(-4, 4)) # IQR = 0
# The Old Faithful geyser data
d <- density(faithful$eruptions, bw = "sj")  # Sheather-Jones bandwidth
d
plot(d)
plot(d, type = "n")          # set up axes without drawing the curve
polygon(d, col = "wheat")    # fill under the density estimate
In [3]:
# hypergeometric: m white, n black, k drawn
m <- 10; n <- 7; k <- 8
x <- 0:(k+1)
rbind(phyper(x, m, n, k), dhyper(x, m, n, k))
# '==' on floating point is deliberately shown to fail here:
all(phyper(x, m, n, k) == cumsum(dhyper(x, m, n, k))) # FALSE
## but the error is very small:
signif(phyper(x, m, n, k) - cumsum(dhyper(x, m, n, k)), digits = 3)
# [Package stats version 3.4.3 Index]
In [2]:
# Examining the distribution of a set of data:
# Given a (univariate) set of data we can examine its distribution in a large number of ways. The simplest is to examine the numbers. Two slightly different summaries are given by summary and fivenum and a display of the numbers by stem (a “stem and leaf” plot).
# NOTE(review): attach() is generally discouraged (it modifies the search
# path); kept here because the following cells refer to 'eruptions' directly.
attach(faithful)
summary(eruptions)
fivenum(eruptions)
stem(eruptions)
In [3]:
# A stem-and-leaf plot is like a histogram, and R has a function hist to plot histograms.
hist(eruptions)
# make the bins smaller, make a plot of density
hist(eruptions, seq(1.6, 5.2, 0.2), prob=TRUE)
lines(density(eruptions, bw=0.1))
rug(eruptions) # show the actual data points
# More elegant density plots can be made by density, and we added a line produced by density in this example. The bandwidth bw was chosen by trial-and-error as the default gives too much smoothing (it usually does for “interesting” densities). (Better automated methods of bandwidth choice are available, and in this example bw = "SJ" gives a good result.)
In [4]:
# NOTE(review): these sum a *continuous* density at unit spacing, i.e. a
# Riemann-sum approximation to normal probabilities (mean 150, sd 20);
# pnorm would give the exact values.
1 - sum(dnorm(0:170, 150, 20))
sum(dnorm(0:130, 150, 20))
sum(dnorm(130:170, 150, 20))
In [10]:
# With size = 1 the negative binomial reduces to the geometric distribution:
# both values below are P(14 failures before the first success) when the
# success probability is 1/6.
print(dnbinom(x = 14, size = 1, prob = 1/6))
print(dgeom(x = 14, prob = 1/6))
In [4]:
# Cumulative ('p') = Sum of discrete prob.s ('d'); Relative error :
# FIX: 'x' was not defined in this cell (it only appears in a *later* cell),
# so running the file top-to-bottom failed here. Define the support
# explicitly to make the cell self-contained.
x <- 0:15
summary(1 - cumsum(dnbinom(x, size = 2, prob = 1/2)) /
pnbinom(x, size = 2, prob = 1/2))
In [7]:
x <- 0:15
size <- (1:20)/4
# NOTE: 'dnb' is assigned *inside* the persp() call and 'tit' inside title();
# both are reused by the next cell's image()/contour() calls.
persp(x, size, dnb <- outer(x, size, function(x,s) dnbinom(x, s, prob = 0.4)),
xlab = "x", ylab = "s", zlab = "density", theta = 150)
title(tit <- "negative binomial density(x,s, pr = 0.4) vs. x & s")
In [8]:
# log10 scale makes the rapidly decaying tail visible
image (x, size, log10(dnb), main = paste("log [", tit, "]"))
contour(x, size, log10(dnb), add = TRUE)
In [9]:
## Alternative parametrization
# same mean mu = 4; larger 'size' means less overdispersion (-> Poisson limit)
x1 <- rnbinom(500, mu = 4, size = 1)
x2 <- rnbinom(500, mu = 4, size = 10)
x3 <- rnbinom(500, mu = 4, size = 100)
# h1's breaks are reused so all three histograms share the same bins
h1 <- hist(x1, breaks = 20, plot = FALSE)
h2 <- hist(x2, breaks = h1$breaks, plot = FALSE)
h3 <- hist(x3, breaks = h1$breaks, plot = FALSE)
barplot(rbind(h1$counts, h2$counts, h3$counts),
beside = TRUE, col = c("red","blue","cyan"),
names.arg = round(h1$breaks[-length(h1$breaks)]))
In [2]:
## Equivalence of pt(., nu) with pf(.^2, 1, nu):
x <- seq(0.001, 5, len = 100)
nu <- 4
# FIX: spell out 'lower.tail' in full -- the original used 'lower =', which
# relies on partial argument matching (fragile, flagged by lintr).
stopifnot(all.equal(2*pt(x,nu) - 1, pf(x^2, 1,nu)),
## upper tails:
all.equal(2*pt(x, nu, lower.tail = FALSE),
pf(x^2, 1,nu, lower.tail = FALSE)))
## the density of the square of a t_m is 2*dt(x, m)/(2*x) = dt(x, m)/x
# check this is the same as the density of F_{1,m}
all.equal(df(x^2, 1, 5), dt(x, 5)/x)
## Identity: qf(2*p - 1, 1, df) == qt(p, df)^2 for p >= 1/2
p <- seq(1/2, .99, length = 50); df <- 10
# symmetric relative error; 0 when the values agree exactly
rel.err <- function(x, y) ifelse(x == y, 0, abs(x-y)/mean(abs(c(x,y))))
quantile(rel.err(qf(2*p - 1, df1 = 1, df2 = df), qt(p, df)^2), .90) # ~= 7e-9
In [1]:
# logistic variance is pi^2/3 * scale^2; the simulation should land near it
var(rlogis(4000, 0, scale = 5)) # approximately (+/- 3)
pi^2/3 * 5^2
In [9]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# NOTE(review): '==' on floating point is fragile in general; these identities
# may print FALSE on some platforms depending on rounding -- all.equal() is
# the robust check.
dnorm(0) == 1/sqrt(2*pi)
dnorm(1) == exp(-1/2)/sqrt(2*pi)
dnorm(1) == 1/sqrt(2*pi*exp(1))
# Using "log = TRUE" for an extended range :
par(mfrow = c(2,1))
# dnorm(x, log = TRUE) stays finite far into the tails, whereas
# log(dnorm(x)) underflows to -Inf -- the red curve stops where it does
plot(function(x) dnorm(x, log = TRUE), -60, 50,
main = "log { Normal density }")
curve(log(dnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("dnorm(x, log=TRUE)", adj = 0)
mtext("log(dnorm(x))", col = "red", adj = 1)
plot(function(x) pnorm(x, log.p = TRUE), -50, 10,
main = "log { Normal Cumulative }")
curve(log(pnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("pnorm(x, log=TRUE)", adj = 0)
mtext("log(pnorm(x))", col = "red", adj = 1)
# The so-called 'error function' (see Abramowitz and Stegun 29.2.29):
# erf(x) = 2*Phi(x*sqrt(2)) - 1, where Phi is the standard normal CDF.
erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1
# The 'complementary error function' erfc(x) = 1 - erf(x).
# FIX: spell out 'lower.tail' -- the original used 'lower =', which relies
# on partial argument matching.
erfc <- function(x) 2 * pnorm(x * sqrt(2), lower.tail = FALSE)
## and the inverses
erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2)
erfcinv <- function (x) qnorm(x/2, lower.tail = FALSE)/sqrt(2)
In [38]:
# sample from N(2, 4) plus the exact quantile/CDF values for comparison
sp <- rnorm(10000,2,4)
qnorm(.75,2,4)   # 75th percentile of N(2, 4)
pnorm(4.7,2,4)   # P(X <= 4.7) for X ~ N(2, 4)
ggplot(data.frame(sp), aes(x = sp)) +
geom_density()
In [7]:
# NOTE(review): zscore() here comes from the mosaic package (third-party);
# the pipeline standardizes Sepal.Length within each Species group.
library(mosaic)
iris %>%
group_by(Species) %>%
mutate(zSepal.Length = zscore(Sepal.Length)) %>%
head()
In [49]:
p <- rnorm(10000, 92.6, 3.6)
ggplot(data.frame(p), aes(x = p)) +
geom_density()
qqnorm(p)  # normal Q-Q plot of the same sample
In [18]:
df1 <- data.frame(p)
df1 %>%
mutate(zp = zscore(p)) %>%
head()
In [1]:
# Poisson pmf via dpois: probability of exactly 5 events when the rate is 6.
print(dpois(x = 5, lambda = 6))
# There is a 0.16 probability of making exactly five typos on a two-page sample.
In [2]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# dpois(k, 1) * k! = exp(-1), so -log(...) is the rate lambda = 1 for every k
-log(dpois(0:7, lambda = 1) * gamma(1+ 0:7)) # == 1
# tabulate a Poisson(4) sample over the full observed support 0..max
Ni <- rpois(50, lambda = 4); table(factor(Ni, 0:max(Ni)))
In [3]:
# Far upper-tail Poisson probabilities computed two ways. Subtracting from 1
# suffers catastrophic cancellation once the tail drops below machine epsilon;
# lower.tail = FALSE computes the tail directly and keeps full precision.
print(1 - ppois(10*(15:25), lambda = 100))                    # becomes 0 (cancellation)
print(ppois(10*(15:25), lambda = 100, lower.tail = FALSE))    # no cancellation
In [4]:
# stacked comparison: Poisson(1) CDF vs Binomial(100, 0.01) CDF (same mean 1);
# type = "s" draws proper step functions for the discrete distributions
par(mfrow = c(2, 1))
x <- seq(-0.01, 5, 0.01)
plot(x, ppois(x, 1), type = "s", ylab = "F(x)", main = "Poisson(1) CDF")
plot(x, pbinom(x, 100, 0.01), type = "s", ylab = "F(x)",
main = "Binomial(100, 0.01) CDF")
In [1]:
runif(n = 29, min = 18, max = 54)
# NOTE(review): mean = 18, sd = 54 mirrors runif's min/max arguments above --
# possibly a copy-paste slip; confirm the intended normal parameters.
rnorm(n = 29, mean = 18, sd = 54)