Chapter 1.2.5.3 Exercises
In [5]:
# Chi-squared density and CDF basics.
# FIX: use library() rather than require() for unconditional loading --
# require() merely warns and returns FALSE when the package is missing,
# silently letting later code fail.
library(graphics)
dchisq(1, df = 1:3)            # density at x = 1 for df = 1, 2, 3
pchisq(1, df = 3)              # P(X <= 1) for a central chi^2(3)
pchisq(1, df = 3, ncp = 0:4)   # noncentral CDFs; ncp = 0 includes the above
In [7]:
# non-central RNG -- df = 0 with ncp > 0: Z0 has point mass at 0!
Z0 <- rchisq(100, df = 0, ncp = 2.)
graphics::stem(Z0)
In [8]:
# visual testing
# do P-P plots for 1000 points at various degrees of freedom
# L = noncentrality parameter, pp = theoretical uniform quantiles
L <- 1.2; n <- 1000; pp <- ppoints(n)
# 3x3 grid of panels; old graphics settings saved in 'op' and restored below
op <- par(mfrow = c(3,3), mar = c(3,3,1,1)+.1, mgp = c(1.5,.6,0),
oma = c(0,0,3,0))
# nine random df values spanning several orders of magnitude (2^(4*N(0,1)))
for(df in 2^(4*rnorm(9))) {
# if pchisq/rchisq are consistent, the sorted probability transforms
# should lie on the diagonal of the P-P plot
plot(pp, sort(pchisq(rr <- rchisq(n, df = df, ncp = L), df = df, ncp = L)),
ylab = "pchisq(rchisq(.),.)", pch = ".")
mtext(paste("df = ", formatC(df, digits = 4)), line = -2, adj = 0.05)
abline(0, 1, col = 2)  # reference diagonal y = x
}
# overall title in the outer margin
mtext(expression("P-P plots : Noncentral "*
chi^2 *"(n=1000, df=X, ncp= 1.2)"),
cex = 1.5, font = 2, outer = TRUE)
par(op)  # restore previous graphics settings
In [2]:
# We have already seen above that Z^2 has an estimated pdf similar to that of a χ2 rv. So, let’s show via estimating pdfs that the sum of a χ2 with 2 df and a χ2 with 3 df is a χ2 with 5 df.
chi2Data <- replicate(10000, rchisq(1,2) + rchisq(1,3))
# theoretical density of chi^2(5) for comparison against the kernel estimate
f <- function(x) dchisq(x, df = 5)
library(ggplot2)
ggplot(data.frame(chi2Data), aes(x = chi2Data)) +
geom_density() +
stat_function(fun = f, color = "red")
In [4]:
# we estimate the density of ((n-1)/σ2)*S^2 and compare it to that of a χ2 with n−1 degrees of freedom, in the case that n=4, μ=3 and σ=9.
# 3/81 = (n-1)/sigma^2 with n = 4, sigma = 9; sd()^2 is the sample variance S^2
sdData <- replicate(10000, 3/81 * sd(rnorm(4, 3, 9))^2)
f <- function(x) dchisq(x, df = 3)
ggplot(data.frame(sdData), aes(x = sdData)) +
geom_density() +
stat_function(fun = f, color = "red")
In [2]:
# Let X1,…,X30 be Poisson random variables with rate 2. From our knowledge of the Poisson distribution, each Xi has mean μ=2 and standard deviation σ=sqrt{2}.
# The central limit theorem then says that (X¯− 2) / (sqrt{2}/sqrt{30}) is approximately normal with mean 0 and standard deviation 1. Let us check this with a simulation.
# This is a little bit more complicated than our previous examples, but the idea is still the same. We create an experiment which computes X¯ and then transforms by subtracting 2 and dividing by sqrt{2}/sqrt{30}.
# Here is a single experiment:
xbar <- mean(rpois(30,2))
# standardize: subtract the true mean, divide by the standard error
(xbar - 2)/(sqrt(2)/sqrt(30))
In [2]:
xbar <- replicate(10000, mean(rpois(30,2))) # generate 10000 samples of xbar
poisData <- (xbar - 2)/(sqrt(2)/sqrt(30))   # standardized sample means
library(ggplot2)
# estimated density (black) vs the standard normal density (red)
ggplot(data.frame(poisData), aes(x = poisData)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
# Very close!
In [3]:
# Let X1,…,X50 be exponential random variables with rate 1/3. From our knowledge of the exponential distribution, each Xi has mean μ=3 and standard deviation σ=3. The central limit theorem then says that (X¯−3) / (3/sqrt{50}) is approximately normal with mean 0 and standard deviation 1. We check this with a simulation.
xbar <- replicate(10000, mean(rexp(50,1/3))) # generate 10000 samples of xbar
expData <- (xbar - 3)/(3/sqrt(50))           # standardized sample means
ggplot(data.frame(expData), aes(x = expData)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [8]:
# A distribution for which sample size of n=500 is not sufficient for good approximation via normal distributions:
# We create a distribution that consists primarily of zeros, but has a few modest sized values and a few large values:
# NOTE(review): rep(0, 2000) is the idiomatic form of replicate(2000, 0)
skewdata <- replicate(2000,0) # Start with lots of zeros
skewdata <- c(skewdata, rexp(200,1/10)) # Add a few moderately sized values
skewdata <- c(skewdata, seq(100000,500000,50000)) # Add a few large values
# population mean and sd of this fixed, finite "distribution"
mu <- mean(skewdata)
sig <- sd(skewdata)
# We use sample to take a random sample of size n from this distribution. We take the mean, subtract the true mean of the distribution, and divide by σ/sqrt{n}. We replicate that 10000 times. Here are the plots when X¯ is the mean of 100 samples:
n <- 100
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))  # standardized means
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [9]:
# and 500 samples:
n <- 500
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
In [10]:
# Even at 500 samples, the density is still far from the normal distribution, especially the lack of left tail. Of course, the Central Limit Theorem is still true, so X¯ must become normal if we choose n large enough:
n <- 5000
xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))
ggplot(data.frame(skewSample), aes(x = skewSample)) +
geom_density() +
stat_function(fun = dnorm, color = "red")
# There may still be a slight skewness in this picture, but we are now very close to being normal.
In [6]:
# Estimate via simulation the pdf of log(|Z|) when Z is standard normal.
expData <- log(abs(rnorm(10000)))
ggplot(data.frame(expData), aes(x = expData)) + geom_density()
In [7]:
# Both of the above worked pretty well, but there are issues when your pdf has discontinuities. Let’s see.
# Estimate the pdf of X when X is uniform on the interval [−1,1].
unifData <- runif(10000,-1,1)
ggplot(data.frame(unifData), aes(x = unifData)) + geom_density()
# This distribution should be exactly 0.5 between -1 and 1, and zero everywhere else. The simulation is reasonable in the middle, but the discontinuities at the endpoints are not shown well. Increasing the number of data points helps, but does not fix the problem.
In [8]:
# Estimate the pdf of X^2 when X is uniform on the interval [-1,1].
X2data <- runif(10000,-1,1)^2
ggplot(data.frame(X2data), aes(x = X2data)) + geom_density()
In [9]:
# Estimate the pdf of X + Y when X and Y are exponential random variables with rate 2.
X <- rexp(10000,2)
Y <- rexp(10000,2)
sumData <- X + Y
ggplot(data.frame(sumData), aes(x = sumData)) + geom_density()
In [10]:
# Estimate the density of X1 + X2 + ⋯ + X20 when all of the Xi are exponential random variables with rate 2.
# This one is trickier and the first one for which we need to use replicate to create the data. Let’s build up the experiment that we are replicating from the ground up.
# Here’s the experiment of summing 20 exponential rvs with rate 2:
sum(rexp(20,2))
In [12]:
# Now, we replicate it (10 times to test):
replicate(10, sum(rexp(20,2)))
In [14]:
# Of course, we don’t want to just replicate it 10 times, we need about 10,000 data points to get a good density estimate.
sumExpData <- replicate(10000, sum(rexp(20,2)))
ggplot(data.frame(sumExpData), aes(x = sumExpData)) + geom_density()
# Note that this is starting to look like a normal random variable!!
In [8]:
# NOTE(review): 'sumxs' is not defined anywhere in this file -- presumably
# left over from a deleted cell; running this line as-is will error. Confirm
# what data it was meant to plot (likely sumExpData above).
ggplot(data.frame(sumxs), aes(x = sumxs)) + geom_density()
In [1]:
# if we wanted to estimate the probability that X = 2, for example, we would use
# (X = sum of two fair dice; the indicator's mean estimates P(X = 2) = 1/36)
mean(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll) == 2}))
In [4]:
# indicates that there are 6 occurrences of “1”, 2 occurrences of “2”, and 1 occurrence each of “3” and “5”. To apply this to the die rolling, we create a vector of length 10,000 that has all of the observances of the rv X, in the following way:
set.seed(1)  # fixed seed so the tabulated counts are reproducible
table(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll)}))
In [11]:
# Suppose fifty randomly chosen people are in a room. Let X denote the number of people in the room who have the same birthday as someone else in the room. Estimate the pmf of X.
# We will simulate birthdays by taking a random sample from 1:365 and storing it in a vector. The tricky part is counting the number of elements in the vector that are repeated somewhere else in the vector. We will create a table and add up all of the values that are bigger than 1. Like this:
birthdays <- sample(1:365, 50, replace = TRUE)
table(birthdays)
In [ ]:
# Now, we put that inside replicate, just like in the previous example.
set.seed(1)
# each replicate counts the people whose birthday appears more than once;
# dividing the table by 10000 turns counts into relative frequencies (pmf)
birthdayTable <- table(replicate(10000, {birthdays <- sample(1:365, 50, replace = TRUE); sum(table(birthdays)[table(birthdays) > 1]) }))/10000
birthdayTable
In [18]:
# Now, we just need to table it like we did before.
# random walk: 50 up-steps and 50 down-steps shuffled; the statistic is the
# first time the cumulative sum returns to 0
table(replicate(10000, {movements <- sample(c(replicate(50,1),replicate(50,-1)), 100, FALSE); min(which(cumsum(movements) == 0)) }))/10000
In [3]:
# Let Z be a standard normal rv. Estimate P(Z^2 > 1).
z <- rnorm(10000)
# FIX: the original computed mean(z^2 < 1), i.e. P(Z^2 < 1) -- the complement
# of the quantity asked for (the trailing "# a typo here?" comment flagged
# exactly this). True value: P(Z^2 > 1) = P(|Z| > 1) = 2*pnorm(-1) ~ 0.3173.
mean(z^2 > 1)
In [4]:
# Estimate the mean and standard deviation of Z^2.
# (Z^2 is chi^2(1): true mean 1, true sd sqrt(2).)
z <- rnorm(10000)
mean(z^2)
sd(z^2)
In [5]:
# You can see the estimate is pretty close to the correct value of 1, but not exactly 1. However, if we repeat this experiment 10,000 times and take the average value of x¯, that should be very close to 1.
mean(replicate(10000, mean(rnorm(5,1,1))))
# It can be useful to repeat this a few times to see whether it seems to be consistently overestimating or underestimating the mean. In this case, it is not.
In [23]:
# For the sake of completeness, let’s show that 1/n ∑i=1 n (xi−x¯)^2 is biased.
# FIX: use '<-' for assignment inside the replicated expression; '=' works at
# statement level but '<-' is the R convention (and what lintr enforces).
# Expected value of the estimator is (n-1)/n * sigma^2 = 4/5 * 4 = 3.2.
mean(replicate(10000, {data <- rnorm(5,1,2); 1/5*sum((data - mean(data))^2)}))
# If you run the above code several times, you will see that 1/n ∑i=1 n (xi−x¯)^2 tends to significantly underestimate the value of σ^2 = 4.
In [2]:
# Estimate the pdf of (X¯−μ) / (S/sqrt{n}) in the case that X1,…,X6 are iid normal with mean 3 and standard deviation 4, and compare it to the pdf of the appropriate t random variable.
library(ggplot2)
# studentized mean of 6 normals: should follow a t distribution with 5 df
tData <- replicate(10000, {normData <- rnorm(6,3,4); (mean(normData) - 3)/(sd(normData)/sqrt(6))})
f <- function(x) dt(x, df = 5)
ggplot(data.frame(tData), aes(x = tData)) +
geom_density() +
stat_function(fun = f, color = "red")
# a very good correlation.
In [3]:
# Compare the pdf of 2Z, where Z∼Normal(0,1) to the pdf of a normal random variable with mean 0 and standard deviation 2.
# We already saw how to estimate the pdf of 2Z, we just need to plot the pdf of Normal(0,2) on the same graph.
# The pdf f(x) of Normal(0,2) is given in R by the function f(x)=dnorm(x,0,2). We can add the graph of a function to a ggplot using the stat_function command.
library(ggplot2)
zData <- 2 * rnorm(10000)
f <- function(x) dnorm(x,0,2)
ggplot(data.frame(zData), aes(x = zData)) +
geom_density() +
stat_function(fun = f,color="red")
# Those look really close!
In [4]:
# Let Z be a standard normal rv. Find the pdf of Z^2 and compare it to the pdf of a χ2 (chi-squared) rv with one degree of freedom. The distname for a χ2 rv is chisq, and rchisq requires the degrees of freedom to be specified using the df parameter.
# We plot the pdf of Z^2 and the chi-squared rv with one degree of freedom on the same plot. χ2 with one df has a vertical asymptote at x=0, so we need to explicitly set the y-axis scale:
zData <- rnorm(10000)^2
f <- function(x) dchisq(x, df=1)
ggplot(data.frame(zData), aes(x = zData)) +
coord_cartesian(ylim = c(0,1.2)) + # set y-axis scale
geom_density() +
stat_function(fun = f, color = "red")
# As you can see, the estimated density follows the true density quite well. This is evidence that Z^2 is, in fact, chi-squared.
In [1]:
# student t
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
1 - pt(1:5, df = 1)                        # upper-tail probabilities, df = 1
qt(.975, df = c(1:10,20,50,100,1000))      # 97.5% quantiles -> approach 1.96
# grid of t values and noncentrality parameters for the surface plots below
tt <- seq(0, 10, len = 21)
ncp <- seq(0, 6, len = 31)
ptn <- outer(tt, ncp, function(t, d) pt(t, df = 3, ncp = d))
t.tit <- "Non-central t - Probabilities"
image(tt, ncp, ptn, zlim = c(0,1), main = t.tit)
persp(tt, ncp, ptn, zlim = 0:1, r = 2, phi = 20, theta = 200, main = t.tit,
xlab = "t", ylab = "non-centrality parameter",
zlab = "Pr(T <= t)")
plot(function(x) dt(x, df = 3, ncp = 2), -3, 11, ylim = c(0, 0.32),
main = "Non-central t - Density", yaxs = "i")
In [1]:
# Suppose 100 dice are thrown. What is the probability of observing 10 or
# fewer sixes? The dice are independent and P(six) = 1/6, so the count of
# sixes is Binomial(100, 1/6). Two equivalent computations: summing the pmf
# over 0..10, and evaluating the CDF directly.
print(sum(dbinom(x = 0:10, size = 100, prob = 1/6)))
print(pbinom(q = 10, size = 100, prob = 1/6))
In [1]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# extreme outliers give IQR = 0, stressing the bandwidth selector
plot(density(c(-20, rep(0,98), 20)), xlim = c(-4, 4)) # IQR = 0
# The Old Faithful geyser data
d <- density(faithful$eruptions, bw = "sj")  # Sheather-Jones bandwidth
d
plot(d)
plot(d, type = "n")          # set up axes without drawing the curve
polygon(d, col = "wheat")    # fill under the density estimate
In [3]:
# hypergeometric: m white, n black, k drawn
m <- 10; n <- 7; k <- 8
x <- 0:(k+1)
rbind(phyper(x, m, n, k), dhyper(x, m, n, k))
# '==' on floating point is deliberately shown to fail here:
all(phyper(x, m, n, k) == cumsum(dhyper(x, m, n, k))) # FALSE
## but the error is very small:
signif(phyper(x, m, n, k) - cumsum(dhyper(x, m, n, k)), digits = 3)
# [Package stats version 3.4.3 Index]
In [2]:
# Examining the distribution of a set of data:
# Given a (univariate) set of data we can examine its distribution in a large number of ways. The simplest is to examine the numbers. Two slightly different summaries are given by summary and fivenum and a display of the numbers by stem (a “stem and leaf” plot).
# NOTE(review): attach() is generally discouraged (it modifies the search
# path); kept here because the following cells refer to 'eruptions' directly.
attach(faithful)
summary(eruptions)
fivenum(eruptions)
stem(eruptions)
In [3]:
# A stem-and-leaf plot is like a histogram, and R has a function hist to plot histograms.
hist(eruptions)
# make the bins smaller, make a plot of density
hist(eruptions, seq(1.6, 5.2, 0.2), prob=TRUE)
lines(density(eruptions, bw=0.1))
rug(eruptions) # show the actual data points
# More elegant density plots can be made by density, and we added a line produced by density in this example. The bandwidth bw was chosen by trial-and-error as the default gives too much smoothing (it usually does for “interesting” densities). (Better automated methods of bandwidth choice are available, and in this example bw = "SJ" gives a good result.)
In [4]:
# NOTE(review): these sum a *continuous* density at unit spacing, i.e. a
# Riemann-sum approximation to normal probabilities (mean 150, sd 20);
# pnorm would give the exact values.
1 - sum(dnorm(0:170, 150, 20))
sum(dnorm(0:130, 150, 20))
sum(dnorm(130:170, 150, 20))
In [10]:
# With size = 1 the negative binomial reduces to the geometric distribution:
# both values below are P(14 failures before the first success) when the
# success probability is 1/6.
print(dnbinom(x = 14, size = 1, prob = 1/6))
print(dgeom(x = 14, prob = 1/6))
In [4]:
# Cumulative ('p') = Sum of discrete prob.s ('d'); Relative error :
# FIX: 'x' was not defined in this cell (it only appears in a *later* cell),
# so running the file top-to-bottom failed here. Define the support
# explicitly to make the cell self-contained.
x <- 0:15
summary(1 - cumsum(dnbinom(x, size = 2, prob = 1/2)) /
pnbinom(x, size = 2, prob = 1/2))
In [7]:
x <- 0:15
size <- (1:20)/4
# NOTE: 'dnb' is assigned *inside* the persp() call and 'tit' inside title();
# both are reused by the next cell's image()/contour() calls.
persp(x, size, dnb <- outer(x, size, function(x,s) dnbinom(x, s, prob = 0.4)),
xlab = "x", ylab = "s", zlab = "density", theta = 150)
title(tit <- "negative binomial density(x,s, pr = 0.4) vs. x & s")
In [8]:
# log10 scale makes the rapidly decaying tail visible
image (x, size, log10(dnb), main = paste("log [", tit, "]"))
contour(x, size, log10(dnb), add = TRUE)
In [9]:
## Alternative parametrization
# same mean mu = 4; larger 'size' means less overdispersion (-> Poisson limit)
x1 <- rnbinom(500, mu = 4, size = 1)
x2 <- rnbinom(500, mu = 4, size = 10)
x3 <- rnbinom(500, mu = 4, size = 100)
# h1's breaks are reused so all three histograms share the same bins
h1 <- hist(x1, breaks = 20, plot = FALSE)
h2 <- hist(x2, breaks = h1$breaks, plot = FALSE)
h3 <- hist(x3, breaks = h1$breaks, plot = FALSE)
barplot(rbind(h1$counts, h2$counts, h3$counts),
beside = TRUE, col = c("red","blue","cyan"),
names.arg = round(h1$breaks[-length(h1$breaks)]))
In [2]:
## Equivalence of pt(., nu) with pf(.^2, 1, nu):
x <- seq(0.001, 5, len = 100)
nu <- 4
# FIX: spell out 'lower.tail' in full -- the original used 'lower =', which
# relies on partial argument matching (fragile, flagged by lintr).
stopifnot(all.equal(2*pt(x,nu) - 1, pf(x^2, 1,nu)),
## upper tails:
all.equal(2*pt(x, nu, lower.tail = FALSE),
pf(x^2, 1,nu, lower.tail = FALSE)))
## the density of the square of a t_m is 2*dt(x, m)/(2*x) = dt(x, m)/x
# check this is the same as the density of F_{1,m}
all.equal(df(x^2, 1, 5), dt(x, 5)/x)
## Identity: qf(2*p - 1, 1, df) == qt(p, df)^2 for p >= 1/2
p <- seq(1/2, .99, length = 50); df <- 10
# symmetric relative error; 0 when the values agree exactly
rel.err <- function(x, y) ifelse(x == y, 0, abs(x-y)/mean(abs(c(x,y))))
quantile(rel.err(qf(2*p - 1, df1 = 1, df2 = df), qt(p, df)^2), .90) # ~= 7e-9
In [1]:
# logistic variance is pi^2/3 * scale^2; the simulation should land near it
var(rlogis(4000, 0, scale = 5)) # approximately (+/- 3)
pi^2/3 * 5^2
In [9]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# NOTE(review): '==' on floating point is fragile in general; these identities
# may print FALSE on some platforms depending on rounding -- all.equal() is
# the robust check.
dnorm(0) == 1/sqrt(2*pi)
dnorm(1) == exp(-1/2)/sqrt(2*pi)
dnorm(1) == 1/sqrt(2*pi*exp(1))
# Using "log = TRUE" for an extended range :
par(mfrow = c(2,1))
# dnorm(x, log = TRUE) stays finite far into the tails, whereas
# log(dnorm(x)) underflows to -Inf -- the red curve stops where it does
plot(function(x) dnorm(x, log = TRUE), -60, 50,
main = "log { Normal density }")
curve(log(dnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("dnorm(x, log=TRUE)", adj = 0)
mtext("log(dnorm(x))", col = "red", adj = 1)
plot(function(x) pnorm(x, log.p = TRUE), -50, 10,
main = "log { Normal Cumulative }")
curve(log(pnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("pnorm(x, log=TRUE)", adj = 0)
mtext("log(pnorm(x))", col = "red", adj = 1)
# The so-called 'error function' (see Abramowitz and Stegun 29.2.29):
# erf(x) = 2*Phi(x*sqrt(2)) - 1, where Phi is the standard normal CDF.
erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1
# The 'complementary error function' erfc(x) = 1 - erf(x).
# FIX: spell out 'lower.tail' -- the original used 'lower =', which relies
# on partial argument matching.
erfc <- function(x) 2 * pnorm(x * sqrt(2), lower.tail = FALSE)
## and the inverses
erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2)
erfcinv <- function (x) qnorm(x/2, lower.tail = FALSE)/sqrt(2)
In [38]:
# sample from N(2, 4) plus the exact quantile/CDF values for comparison
sp <- rnorm(10000,2,4)
qnorm(.75,2,4)   # 75th percentile of N(2, 4)
pnorm(4.7,2,4)   # P(X <= 4.7) for X ~ N(2, 4)
ggplot(data.frame(sp), aes(x = sp)) +
geom_density()
In [7]:
# NOTE(review): zscore() here comes from the mosaic package (third-party);
# the pipeline standardizes Sepal.Length within each Species group.
library(mosaic)
iris %>%
group_by(Species) %>%
mutate(zSepal.Length = zscore(Sepal.Length)) %>%
head()
In [49]:
p <- rnorm(10000, 92.6, 3.6)
ggplot(data.frame(p), aes(x = p)) +
geom_density()
qqnorm(p)  # normal Q-Q plot of the same sample
In [18]:
df1 <- data.frame(p)
df1 %>%
mutate(zp = zscore(p)) %>%
head()
In [1]:
# Poisson pmf via dpois: probability of exactly 5 events when the rate is 6.
print(dpois(x = 5, lambda = 6))
# There is a 0.16 probability of making exactly five typos on a two-page sample.
In [2]:
# NOTE(review): library(graphics) is preferred over require() for loading
require(graphics)
# dpois(k, 1) * k! = exp(-1), so -log(...) is the rate lambda = 1 for every k
-log(dpois(0:7, lambda = 1) * gamma(1+ 0:7)) # == 1
# tabulate a Poisson(4) sample over the full observed support 0..max
Ni <- rpois(50, lambda = 4); table(factor(Ni, 0:max(Ni)))
In [3]:
# Far upper-tail Poisson probabilities computed two ways. Subtracting from 1
# suffers catastrophic cancellation once the tail drops below machine epsilon;
# lower.tail = FALSE computes the tail directly and keeps full precision.
print(1 - ppois(10*(15:25), lambda = 100))                    # becomes 0 (cancellation)
print(ppois(10*(15:25), lambda = 100, lower.tail = FALSE))    # no cancellation
In [4]:
# stacked comparison: Poisson(1) CDF vs Binomial(100, 0.01) CDF (same mean 1);
# type = "s" draws proper step functions for the discrete distributions
par(mfrow = c(2, 1))
x <- seq(-0.01, 5, 0.01)
plot(x, ppois(x, 1), type = "s", ylab = "F(x)", main = "Poisson(1) CDF")
plot(x, pbinom(x, 100, 0.01), type = "s", ylab = "F(x)",
main = "Binomial(100, 0.01) CDF")
In [1]:
runif(n = 29, min = 18, max = 54)
# NOTE(review): mean = 18, sd = 54 mirrors runif's min/max arguments above --
# possibly a copy-paste slip; confirm the intended normal parameters.
rnorm(n = 29, mean = 18, sd = 54)