Chapter 1.2.5.3 Exercises

distribs
In [5]:
# Load base graphics (bundled with R); plotting functions are used throughout.
require(graphics)

# Chi-squared density at x = 1, vectorized over df = 1, 2, 3.
dchisq(1, df = 1:3)
# Central chi-squared CDF at x = 1 with 3 df.
pchisq(1, df = 3)
pchisq(1, df = 3, ncp = 0:4) # noncentral CDFs; the ncp = 0 entry reproduces the central value above
  1. 0.241970724519143
  2. 0.303265329856317
  3. 0.241970724519143
0.198748043098799
  1. 0.198748043098799
  2. 0.132298554163576
  3. 0.0878731118073454
  4. 0.0582469145317944
  5. 0.0385359178462243
In [7]:
# non-central RNG -- df = 0 with ncp > 0: Z0 has point mass at 0!
# (see ?Chisquare: with df = 0 and ncp > 0 the distribution puts positive
# probability on the exact value 0)

Z0 <- rchisq(100, df = 0, ncp = 2.)
graphics::stem(Z0)  # stem-and-leaf display: note the long run of zeros in the first stem
  The decimal point is at the |

   0 | 00000000000000000000000000000000000000000000011133456789011223445566
   2 | 11259003778
   4 | 23534578
   6 | 008155
   8 | 59
  10 | 2
  12 | 9

In [8]:
# visual testing
# do P-P plots for 1000 points at various degrees of freedom:
# if rchisq and pchisq are consistent, the sorted probability transforms of a
# sample should lie on the identity line against uniform plotting points.

L <- 1.2; n <- 1000; pp <- ppoints(n)  # L = noncentrality parameter; pp = uniform plotting positions
op <- par(mfrow = c(3,3), mar = c(3,3,1,1)+.1, mgp = c(1.5,.6,0),
oma = c(0,0,3,0))
for(df in 2^(4*rnorm(9))) {  # nine df values, random on a log2 scale
plot(pp, sort(pchisq(rr <- rchisq(n, df = df, ncp = L), df = df, ncp = L)),
ylab = "pchisq(rchisq(.),.)", pch = ".")
mtext(paste("df = ", formatC(df, digits = 4)), line = -2, adj = 0.05)
abline(0, 1, col = 2)  # identity line: points should hug this
}
mtext(expression("P-P plots : Noncentral "*
chi^2 *"(n=1000, df=X, ncp= 1.2)"),
cex = 1.5, font = 2, outer = TRUE)
par(op)  # restore previous graphics settings
In [2]:
# We have already seen above that Z^2 has an estimated pdf similar to that of a
# chi-squared rv. So, let's show via estimated pdfs that the sum of an
# independent chi-squared with 2 df and a chi-squared with 3 df is a
# chi-squared with 5 df.

chi2Data <- replicate(10000, rchisq(1,2) + rchisq(1,3))
f <- function(x) dchisq(x, df = 5)  # reference density to overlay
library(ggplot2)
ggplot(data.frame(chi2Data), aes(x = chi2Data)) +
  geom_density() +
  stat_function(fun = f, color = "red")
In [4]:
# We estimate the density of ((n-1)/sigma^2) * S^2 and compare it to that of a
# chi-squared with n-1 degrees of freedom, in the case that n = 4, mu = 3 and
# sigma = 9. Here (n-1)/sigma^2 = 3/81.

sdData <- replicate(10000, 3/81 * sd(rnorm(4, 3, 9))^2)
f <- function(x) dchisq(x, df = 3)  # chi-squared with n-1 = 3 df
ggplot(data.frame(sdData), aes(x = sdData)) + 
  geom_density() +
  stat_function(fun = f, color = "red")
In [2]:
# Let X1,...,X30 be Poisson random variables with rate 2. From our knowledge of
# the Poisson distribution, each Xi has mean mu = 2 and standard deviation
# sigma = sqrt(2).
# The central limit theorem then says that (Xbar - 2) / (sqrt(2)/sqrt(30)) is
# approximately normal with mean 0 and standard deviation 1. Let us check this
# with a simulation.
# This is a little bit more complicated than our previous examples, but the
# idea is still the same. We create an experiment which computes Xbar and then
# transforms by subtracting 2 and dividing by sqrt(2)/sqrt(30).
# Here is a single experiment:

xbar <- mean(rpois(30,2))
(xbar - 2)/(sqrt(2)/sqrt(30))  # standardized sample mean
-0.516397779494322
In [2]:
xbar <- replicate(10000, mean(rpois(30,2)))  # generate 10000 samples of xbar
poisData <- (xbar - 2)/(sqrt(2)/sqrt(30))    # standardize each sample mean
library(ggplot2)
ggplot(data.frame(poisData), aes(x = poisData)) + 
  geom_density() + 
  stat_function(fun = dnorm, color = "red")  # overlay the standard normal pdf

# Very close!
In [3]:
# Let X1,...,X50 be exponential random variables with rate 1/3. From our
# knowledge of the exponential distribution, each Xi has mean mu = 3 and
# standard deviation sigma = 3. The central limit theorem then says that
# (Xbar - 3) / (3/sqrt(50)) is approximately normal with mean 0 and standard
# deviation 1. We check this with a simulation.

xbar <- replicate(10000, mean(rexp(50,1/3)))   # generate 10000 samples of xbar
expData <- (xbar - 3)/(3/sqrt(50))             # standardize
ggplot(data.frame(expData), aes(x = expData)) + 
  geom_density() + 
  stat_function(fun = dnorm, color = "red")
In [8]:
# A distribution for which a sample size of n = 500 is not sufficient for a
# good approximation via normal distributions:
# We create a distribution that consists primarily of zeros, but has a few
# modest sized values and a few large values:

skewdata <- replicate(2000,0)                      # Start with lots of zeros
skewdata <- c(skewdata, rexp(200,1/10))            # Add a few moderately sized values
skewdata <- c(skewdata, seq(100000,500000,50000))  # Add a few large values

mu <- mean(skewdata)   # "true" mean of this finite population
sig <- sd(skewdata)    # and its standard deviation

# We use sample() to take a random sample of size n from this distribution
# (with replacement). We take the mean, subtract the true mean of the
# distribution, and divide by sig/sqrt(n). We replicate that 10000 times.
# Here are the plots when Xbar is the mean of 100 samples:

n <- 100

xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))

ggplot(data.frame(skewSample), aes(x = skewSample)) + 
  geom_density() + 
  stat_function(fun = dnorm, color = "red")
In [9]:
# and 500 samples:

n <- 500

xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )  # skewdata/mu/sig from the previous cell
skewSample <- (xbar - mu)/(sig/sqrt(n))

ggplot(data.frame(skewSample), aes(x = skewSample)) + 
  geom_density() + 
  stat_function(fun = dnorm, color = "red")
In [10]:
# Even at 500 samples, the density is still far from the normal distribution,
# especially the lack of left tail. Of course, the Central Limit Theorem is
# still true, so Xbar must become normal if we choose n large enough:

n <- 5000

xbar <- replicate(10000, mean( sample(skewdata, n, TRUE)) )
skewSample <- (xbar - mu)/(sig/sqrt(n))

ggplot(data.frame(skewSample), aes(x = skewSample)) + 
  geom_density() + 
  stat_function(fun = dnorm, color = "red")

# There may still be a slight skewness in this picture, but we are now very
# close to being normal.
In [6]:
# Estimate via simulation the pdf of log(|Z|) when Z is standard normal.

expData <- log(abs(rnorm(10000)))
ggplot(data.frame(expData), aes(x = expData)) + geom_density()
In [7]:
# Both of the above worked pretty well, but there are issues when your pdf has
# discontinuities. Let's see.
# Estimate the pdf of X when X is uniform on the interval [-1, 1].

unifData <- runif(10000,-1,1)
ggplot(data.frame(unifData), aes(x = unifData)) + geom_density()

# This distribution should be exactly 0.5 between -1 and 1, and zero everywhere
# else. The simulation is reasonable in the middle, but the discontinuities at
# the endpoints are not shown well (the kernel smooths across them).
# Increasing the number of data points helps, but does not fix the problem.
In [8]:
# Estimate the pdf of X^2 when X is uniform on the interval [-1,1].
# (True pdf is 1/(2*sqrt(x)) for 0 < x <= 1 -- vertical asymptote at 0.)

X2data <- runif(10000,-1,1)^2
ggplot(data.frame(X2data), aes(x = X2data)) + geom_density()
In [9]:
# Estimate the pdf of X + Y when X and Y are independent exponential random
# variables with rate 2. (The sum is Gamma(shape = 2, rate = 2).)

X <- rexp(10000,2)
Y <- rexp(10000,2)
sumData <- X + Y
ggplot(data.frame(sumData), aes(x = sumData)) + geom_density()
In [10]:
# Estimate the density of X1 + X2 + ... + X20 when all of the Xi are
# exponential random variables with rate 2.

# This one is trickier and the first one for which we need to use replicate to
# create the data. Let's build up the experiment that we are replicating from
# the ground up.
# Here's the experiment of summing 20 exponential rvs with rate 2:

sum(rexp(20,2))
10.1390341021935
In [12]:
# Now, we replicate it (10 times to test that the experiment is wired up right):

replicate(10, sum(rexp(20,2)))
  1. 8.7858754363624
  2. 11.7495889425114
  3. 8.89379313248743
  4. 5.67943837310145
  5. 12.0200044546032
  6. 13.6946372698699
  7. 8.14586423784414
  8. 8.4273056215378
  9. 6.94963616011086
  10. 11.0816738188767
In [14]:
# Of course, we don't want to just replicate it 10 times; we need about 10,000
# data points to get a good density estimate.

sumExpData <- replicate(10000, sum(rexp(20,2)))
ggplot(data.frame(sumExpData), aes(x = sumExpData)) + geom_density()

# Note that this is starting to look like a normal random variable!!
# (The sum of 20 iid Exp(2) is Gamma(20, 2), which the CLT makes nearly normal.)
In [8]:
ggplot(data.frame(sumxs), aes(x = sumxs)) + geom_density()
In [1]:
# if we wanted to estimate the probability that X = 2 (i.e. both dice show 1),
# for example, we would use

mean(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll) == 2}))
# true value is 1/36 ~= 0.0278
0.0242
In [4]:
# A table like the one just described indicates that there are 6 occurrences of
# "1", 2 occurrences of "2", and 1 occurrence each of "3" and "5". To apply
# this to the die rolling, we create a vector of length 10,000 that has all of
# the observances of the rv X (the sum of two dice), in the following way:

set.seed(1)   # fix the seed so the table below is reproducible
table(replicate(10000, {dieRoll <- sample(1:6, 2, TRUE); sum(dieRoll)}))
   2    3    4    5    6    7    8    9   10   11   12 
 265  556  836 1147 1361 1688 1327 1122  853  581  264 
In [11]:
# Suppose fifty randomly chosen people are in a room. Let X denote the number
# of people in the room who have the same birthday as someone else in the
# room. Estimate the pmf of X.

# We will simulate birthdays by taking a random sample from 1:365 and storing
# it in a vector. The tricky part is counting the number of elements in the
# vector that are repeated somewhere else in the vector. We will create a
# table and add up all of the values that are bigger than 1. Like this:

birthdays <- sample(1:365, 50, replace = TRUE)
table(birthdays)
birthdays
 21  37  39  42  67  70  73  75  81  83  87  91  95 118 124 125 127 134 138 145 
  1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   2   1   1   2   1 
154 156 157 175 196 209 213 216 222 223 228 235 237 253 265 280 281 282 283 288 
  1   1   1   1   1   1   1   1   1   1   1   1   1   2   1   1   1   1   1   2 
289 295 301 332 345 
  1   1   1   1   1 
In [ ]:
# Now, we put that inside replicate, just like in the previous example.
# sum(table(birthdays)[table(birthdays) > 1]) counts how many of the 50 people
# share a birthday with at least one other person; dividing the tabulated
# counts by 10000 converts them to estimated probabilities.

set.seed(1)
birthdayTable <- table(replicate(10000, {birthdays <- sample(1:365, 50, replace = TRUE); sum(table(birthdays)[table(birthdays) > 1]) }))/10000
birthdayTable
In [18]:
# Now, we just need to table it like we did before.
# The experiment: a random walk made of exactly 50 up-steps (+1) and 50
# down-steps (-1) in random order (sample without replacement); record the
# first time the running sum returns to 0.

table(replicate(10000, {movements <- sample(c(replicate(50,1),replicate(50,-1)), 100, FALSE); min(which(cumsum(movements) == 0)) }))/10000
     2      4      6      8     10     12     14     16     18     20     22 
0.5082 0.1304 0.0623 0.0390 0.0280 0.0211 0.0194 0.0162 0.0141 0.0100 0.0088 
    24     26     28     30     32     34     36     38     40     42     44 
0.0073 0.0059 0.0083 0.0068 0.0055 0.0057 0.0045 0.0032 0.0044 0.0036 0.0032 
    46     48     50     52     54     56     58     60     62     64     66 
0.0028 0.0036 0.0024 0.0025 0.0031 0.0025 0.0025 0.0027 0.0027 0.0027 0.0025 
    68     70     72     74     76     78     80     82     84     86     88 
0.0017 0.0022 0.0021 0.0030 0.0024 0.0023 0.0021 0.0021 0.0025 0.0024 0.0023 
    90     92     94     96     98    100 
0.0025 0.0028 0.0035 0.0046 0.0057 0.0099 
In [3]:
# Let Z be a standard normal rv. Estimate P(Z^2 > 1).

z <- rnorm(10000)
# Fixed: the original computed mean(z^2 < 1) -- the transcript itself flagged
# it with "# a typo here?". That estimates P(Z^2 < 1), not the requested tail.
mean(z^2 > 1)

# True value: P(Z^2 > 1) = P(|Z| > 1) = 2 * pnorm(-1) ~= 0.3173.
# (The previously recorded output, 0.6839, was the complement P(Z^2 < 1).)
In [4]:
# Estimate the mean and standard deviation of Z^2.
# (Z^2 is chi-squared with 1 df: true mean 1, true sd sqrt(2) ~= 1.414.)

z <- rnorm(10000)
mean(z^2)
sd(z^2)
1.00035030360643
1.41755132859147
In [5]:
# You can see the estimate is pretty close to the correct value of 1, but not
# exactly 1. However, if we repeat this experiment 10,000 times and take the
# average value of xbar, that should be very close to 1.

mean(replicate(10000, mean(rnorm(5,1,1))))

# It can be useful to repeat this a few times to see whether it seems to be
# consistently overestimating or underestimating the mean. In this case, it
# is not.
1.00314227565085
In [23]:
# For the sake of completeness, let's show that
# (1/n) * sum_{i=1}^{n} (x_i - xbar)^2 is a biased estimator of sigma^2.

mean(replicate(10000, {data = rnorm(5,1,2); 1/5*sum((data - mean(data))^2)}))

# If you run the above code several times, you will see that
# (1/n) * sum (x_i - xbar)^2 tends to significantly underestimate the value of
# sigma^2 = 4. (Its expected value is ((n-1)/n) * sigma^2 = (4/5)*4 = 3.2;
# dividing by n-1 instead of n gives the unbiased estimator.)
3.191932139055
In [2]:
# Estimate the pdf of (Xbar - mu) / (S/sqrt(n)) in the case that X1,...,X6 are
# iid normal with mean 3 and standard deviation 4, and compare it to the pdf
# of the appropriate t random variable (df = n - 1 = 5).
library(ggplot2)
tData <- replicate(10000, {normData <- rnorm(6,3,4); (mean(normData) - 3)/(sd(normData)/sqrt(6))}) 
f <- function(x) dt(x, df = 5)
ggplot(data.frame(tData), aes(x = tData)) + 
  geom_density() +
  stat_function(fun = f, color = "red")

# a very good match.
In [3]:
# Compare the pdf of 2Z, where Z ~ Normal(0,1), to the pdf of a normal random
# variable with mean 0 and standard deviation 2.

# We already saw how to estimate the pdf of 2Z; we just need to plot the pdf
# of Normal(0,2) on the same graph.
# The pdf f(x) of Normal(0,2) is given in R by the function f(x) = dnorm(x,0,2).
# We can add the graph of a function to a ggplot using the stat_function
# command.

library(ggplot2)
zData <- 2 * rnorm(10000)
f <- function(x) dnorm(x,0,2)
ggplot(data.frame(zData), aes(x = zData)) + 
  geom_density() + 
  stat_function(fun = f,color="red")

# Those look really close!
In [4]:
# Let Z be a standard normal rv. Find the pdf of Z^2 and compare it to the pdf
# of a chi-squared rv with one degree of freedom. The distname for a
# chi-squared rv is chisq, and rchisq requires the degrees of freedom to be
# specified using the df parameter.
# We plot the pdf of Z^2 and the chi-squared rv with one degree of freedom on
# the same plot. Chi-squared with one df has a vertical asymptote at x = 0, so
# we need to explicitly set the y-axis scale:

zData <- rnorm(10000)^2
f <- function(x) dchisq(x, df=1)
ggplot(data.frame(zData), aes(x = zData)) +
  coord_cartesian(ylim = c(0,1.2)) +        # set y-axis scale
  geom_density() + 
  stat_function(fun = f, color = "red")

# As you can see, the estimated density follows the true density quite well.
# This is evidence that Z^2 is, in fact, chi-squared.
In [1]:
# student t (examples in the style of ?TDist)

require(graphics)

1 - pt(1:5, df = 1)                     # upper-tail probabilities (df = 1 is Cauchy)
qt(.975, df = c(1:10,20,50,100,1000))   # 97.5% quantiles; approach qnorm(.975) ~= 1.96 as df grows

# Noncentral t: P(T <= t) on a grid of t values and noncentrality parameters
tt <- seq(0, 10, len = 21)
ncp <- seq(0, 6, len = 31)
ptn <- outer(tt, ncp, function(t, d) pt(t, df = 3, ncp = d))
t.tit <- "Non-central t - Probabilities"
image(tt, ncp, ptn, zlim = c(0,1), main = t.tit)
persp(tt, ncp, ptn, zlim = 0:1, r = 2, phi = 20, theta = 200, main = t.tit,
xlab = "t", ylab = "non-centrality parameter",
zlab = "Pr(T <= t)")

plot(function(x) dt(x, df = 3, ncp = 2), -3, 11, ylim = c(0, 0.32),
main = "Non-central t - Density", yaxs = "i")
  1. 0.25
  2. 0.147583617650433
  3. 0.102416382349567
  4. 0.0779791303773694
  5. 0.0628329581890011
  1. 12.7062047361747
  2. 4.30265272974946
  3. 3.18244630528371
  4. 2.77644510519779
  5. 2.57058183563631
  6. 2.44691185114497
  7. 2.36462425159278
  8. 2.30600413520417
  9. 2.2621571627982
  10. 2.22813885198627
  11. 2.08596344726586
  12. 2.00855911210076
  13. 1.98397151852355
  14. 1.96233908082641
In [1]:
# Suppose 100 dice are thrown. What is the probability of observing 10 or
# fewer sixes? We assume that the results of the dice are independent and that
# the probability of rolling a six is p = 1/6.

sum(dbinom(0:10, 100, 1/6))  # sum the pmf over 0..10
pbinom(10,100,1/6)           # same value via the CDF
0.0426956841470246
0.0426956841470245
In [1]:
require(graphics)

# Degenerate example: IQR = 0, so bandwidth selection must fall back on other
# measures of spread.
plot(density(c(-20, rep(0,98), 20)), xlim = c(-4, 4)) # IQR = 0

# The Old Faithful geyser data

d <- density(faithful$eruptions, bw = "sj")  # Sheather-Jones bandwidth selector
d
plot(d)

plot(d, type = "n")        # set up axes without drawing the curve
polygon(d, col = "wheat")  # then draw the density as a filled polygon
Call:
	density.default(x = faithful$eruptions, bw = "sj")

Data: faithful$eruptions (272 obs.);	Bandwidth 'bw' = 0.14

       x               y            
 Min.   :1.180   Min.   :0.0001834  
 1st Qu.:2.265   1st Qu.:0.0422638  
 Median :3.350   Median :0.1709243  
 Mean   :3.350   Mean   :0.2301726  
 3rd Qu.:4.435   3rd Qu.:0.4134348  
 Max.   :5.520   Max.   :0.5945634  
In [3]:
m <- 10; n <- 7; k <- 8   # hypergeometric: m white balls, n black, k drawn
x <- 0:(k+1)
rbind(phyper(x, m, n, k), dhyper(x, m, n, k))   # CDF stacked over pmf
all(phyper(x, m, n, k) == cumsum(dhyper(x, m, n, k))) # FALSE -- floating-point rounding

## but the error is very small:

signif(phyper(x, m, n, k) - cumsum(dhyper(x, m, n, k)), digits = 3)
# [Package stats version 3.4.3 Index]
0 0.00041135340.01336898 0.117030 0.4193747 0.7821884 0.9635952 0.99814891 1.00000000 1
0 0.00041135340.01295763 0.103661 0.3023447 0.3628137 0.1814068 0.03455368 0.00185109 0
FALSE
  1. 0
  2. 0
  3. -1.73e-18
  4. 1.39e-17
  5. -5.55e-17
  6. -2.22e-16
  7. -1.11e-16
  8. -1.11e-16
  9. -2.22e-16
  10. -2.22e-16
In [2]:
# Examining the distribution of a set of data:
# Given a (univariate) set of data we can examine its distribution in a large
# number of ways. The simplest is to examine the numbers. Two slightly
# different summaries are given by summary and fivenum, and a display of the
# numbers by stem (a "stem and leaf" plot).

attach(faithful)   # NOTE(review): attach() is discouraged; prefer with(faithful, ...) or faithful$eruptions
summary(eruptions)
fivenum(eruptions)
stem(eruptions)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.600   2.163   4.000   3.488   4.454   5.100 
  1. 1.6
  2. 2.1585
  3. 4
  4. 4.4585
  5. 5.1
  The decimal point is 1 digit(s) to the left of the |

  16 | 070355555588
  18 | 000022233333335577777777888822335777888
  20 | 00002223378800035778
  22 | 0002335578023578
  24 | 00228
  26 | 23
  28 | 080
  30 | 7
  32 | 2337
  34 | 250077
  36 | 0000823577
  38 | 2333335582225577
  40 | 0000003357788888002233555577778
  42 | 03335555778800233333555577778
  44 | 02222335557780000000023333357778888
  46 | 0000233357700000023578
  48 | 00000022335800333
  50 | 0370

In [3]:
# A stem-and-leaf plot is like a histogram, and R has a function hist to plot
# histograms.

hist(eruptions)

# make the bins smaller, make a plot of density

hist(eruptions, seq(1.6, 5.2, 0.2), prob=TRUE)  # prob=TRUE: y-axis is density, comparable with the curve below
lines(density(eruptions, bw=0.1))
rug(eruptions) # show the actual data points

# More elegant density plots can be made by density, and we added a line
# produced by density in this example. The bandwidth bw was chosen by
# trial-and-error as the default gives too much smoothing (it usually does for
# "interesting" densities). (Better automated methods of bandwidth choice are
# available, and in this example bw = "SJ" gives a good result.)
In [4]:
# NOTE(review): dnorm() is a continuous density, so summing it over integers is
# only a unit-width Riemann approximation of the corresponding probability.
# Exact values would use pnorm, e.g. 1 - pnorm(170, 150, 20) ~= 0.1587.

1 - sum(dnorm(0:170, 150, 20))
sum(dnorm(0:130, 150, 20))
sum(dnorm(130:170, 150, 20))
0.152656400587744
0.164754936813649
0.694687198824565
In [10]:
# Note that when size = 1, the negative binomial is exactly a geometric random
# variable, e.g.

dnbinom(x = 14, size = 1, prob = 1/6)
dgeom(14, prob = 1/6)
0.0129810943037749
0.0129810943037749
In [4]:
# Cumulative ('p') = Sum of discrete prob.s ('d'); Relative error :
# NOTE(review): `x` is carried over from an earlier cell (x <- 0:(k+1) above);
# this cell is not self-contained.

summary(1 - cumsum(dnbinom(x, size = 2, prob = 1/2)) /
pnbinom(x, size = 2, prob = 1/2))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-2.22e-16 -2.22e-16 -2.22e-16 -1.48e-16  0.00e+00  0.00e+00 
In [7]:
# Negative binomial density surface over x and size s, with prob fixed at 0.4.
x <- 0:15
size <- (1:20)/4
persp(x, size, dnb <- outer(x, size, function(x,s) dnbinom(x, s, prob = 0.4)),
xlab = "x", ylab = "s", zlab = "density", theta = 150)
title(tit <- "negative binomial density(x,s, pr = 0.4) vs. x & s")
In [8]:
# Log-scale image + contours of the density surface from the previous cell
# (`x`, `size`, `dnb` and `tit` are defined there).
image (x, size, log10(dnb), main = paste("log [", tit, "]"))
contour(x, size, log10(dnb), add = TRUE)
In [9]:
## Alternative parametrization: negative binomial specified by mean `mu` and
## dispersion `size` (see ?NegBinomial; as size grows it approaches Poisson(mu)).

x1 <- rnbinom(500, mu = 4, size = 1)
x2 <- rnbinom(500, mu = 4, size = 10)
x3 <- rnbinom(500, mu = 4, size = 100)
h1 <- hist(x1, breaks = 20, plot = FALSE)
h2 <- hist(x2, breaks = h1$breaks, plot = FALSE)   # reuse h1's breaks so the three counts are comparable
h3 <- hist(x3, breaks = h1$breaks, plot = FALSE)
barplot(rbind(h1$counts, h2$counts, h3$counts),
beside = TRUE, col = c("red","blue","cyan"),
names.arg = round(h1$breaks[-length(h1$breaks)]))
In [2]:
## Equivalence of pt(.,nu) with pf(.^2, 1,nu):
## (if T ~ t_nu then T^2 ~ F(1, nu))

x <- seq(0.001, 5, len = 100)
nu <- 4
stopifnot(all.equal(2*pt(x,nu) - 1, pf(x^2, 1,nu)),
## upper tails:
all.equal(2*pt(x, nu, lower=FALSE),
pf(x^2, 1,nu, lower=FALSE)))

## the density of the square of a t_m is 2*dt(x, m)/(2*x), i.e. dt(x, m)/x;
# check this is the same as the density of F_{1,m}

all.equal(df(x^2, 1, 5), dt(x, 5)/x)

## Identity: qf(2*p - 1, 1, df) == qt(p, df)^2 for p >= 1/2

p <- seq(1/2, .99, length = 50); df <- 10
rel.err <- function(x, y) ifelse(x == y, 0, abs(x-y)/mean(abs(c(x,y))))  # relative error; exactly 0 when x == y
quantile(rel.err(qf(2*p - 1, df1 = 1, df2 = df), qt(p, df)^2), .90) # ~= 7e-9
TRUE
90%: 1.41633660767794e-15
In [1]:
var(rlogis(4000, 0, scale = 5)) # sample variance; approximately (+/- 3) the theoretical value
pi^2/3 * 5^2                    # theoretical variance of logistic(location 0, scale 5)
83.4394655830542
82.2467033424113
In [9]:
require(graphics)

# Closed-form identities for the standard normal density (cf. ?Normal);
# these hold exactly in floating point here:
dnorm(0) == 1/sqrt(2*pi)
dnorm(1) == exp(-1/2)/sqrt(2*pi)
dnorm(1) == 1/sqrt(2*pi*exp(1))

# Using "log = TRUE" for an extended range :
# (dnorm(x, log = TRUE) stays finite far into the tails, where log(dnorm(x))
# underflows)

par(mfrow = c(2,1))
plot(function(x) dnorm(x, log = TRUE), -60, 50,
main = "log { Normal density }")
curve(log(dnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("dnorm(x, log=TRUE)", adj = 0)
mtext("log(dnorm(x))", col = "red", adj = 1)

plot(function(x) pnorm(x, log.p = TRUE), -50, 10,
main = "log { Normal Cumulative }")
curve(log(pnorm(x)), add = TRUE, col = "red", lwd = 2)
mtext("pnorm(x, log=TRUE)", adj = 0)
mtext("log(pnorm(x))", col = "red", adj = 1)

# if you want the so-called 'error function'

erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1

# (see Abramowitz and Stegun 29.2.29)
# and the so-called 'complementary error function'

erfc <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE)

## and the inverses

erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2)
erfcinv <- function (x) qnorm(x/2, lower = FALSE)/sqrt(2)
TRUE
TRUE
TRUE
In [38]:
sp <- rnorm(10000,2,4)   # sample from Normal(mean = 2, sd = 4)
qnorm(.75,2,4)           # 75th percentile
pnorm(4.7,2,4)           # P(X <= 4.7): ~.75, consistent with the quantile above
ggplot(data.frame(sp), aes(x = sp)) + 
  geom_density()
4.69795900078433
0.750162117528223
In [7]:
library(mosaic)
# z-score Sepal.Length within each species (mosaic::zscore standardizes per group)
iris %>%
group_by(Species) %>%
mutate(zSepal.Length = zscore(Sepal.Length)) %>%
head()
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecieszSepal.Length
5.1 3.5 1.4 0.2 setosa 0.26667447
4.9 3.0 1.4 0.2 setosa -0.30071802
4.7 3.2 1.3 0.2 setosa -0.86811050
4.6 3.1 1.5 0.2 setosa -1.15180675
5.0 3.6 1.4 0.2 setosa -0.01702177
5.4 3.9 1.7 0.4 setosa 1.11776320
In [49]:
p <- rnorm(10000, 92.6, 3.6)   # sample from Normal(92.6, 3.6)
ggplot(data.frame(p), aes(x = p)) + 
  geom_density()
qqnorm(p)   # normal Q-Q plot of the same sample
In [18]:
# z-scores of the sample `p` generated in the previous cell
df1 <- data.frame(p)
df1 %>%
mutate(zp = zscore(p)) %>%
head()
pzp
93.12181 0.14856493
92.83763 0.06999332
95.65130 0.84794416
96.11340 0.97570887
100.97609 2.32019446
94.61071 0.56023123
In [1]:
# Using R, and the pmf dpois: P(X = 5) for X ~ Poisson(lambda = 6).

dpois(5,6)

# There is a 0.16 probability of making exactly five typos on a two-page
# sample.
0.16062314104798
In [2]:
require(graphics)

# Identity check: dpois(x, 1) = exp(-1)/x!, so -log(dpois(x,1) * x!) == 1.
-log(dpois(0:7, lambda = 1) * gamma(1+ 0:7))                # == 1
Ni <- rpois(50, lambda = 4); table(factor(Ni, 0:max(Ni)))   # tabulate, keeping empty cells via factor levels
  1. 1
  2. 1
  3. 1
  4. 1
  5. 1
  6. 1
  7. 1
  8. 0.999999999999999
 0  1  2  3  4  5  6  7  8  9 10 11 12 
 1  2  4 11 13  5  8  4  0  1  0  0  1 
In [3]:
# Far-tail Poisson probabilities, two ways:
1 - ppois(10*(15:25), lambda = 100)                         # becomes 0 (catastrophic cancellation in 1 - p)
ppois(10*(15:25), lambda = 100, lower.tail = FALSE)         # no cancellation
  1. 1.23309441912856e-06
  2. 1.26166380676196e-08
  3. 7.08579861452563e-11
  4. 2.25264251696444e-13
  5. 4.44089209850063e-16
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. 0
  1. 1.23309441916004e-06
  2. 1.26166381177638e-08
  3. 7.08579952563146e-11
  4. 2.25310997725103e-13
  5. 4.17423901438677e-16
  6. 4.62617947019577e-19
  7. 3.14209688559682e-22
  8. 1.33721907538907e-25
  9. 3.63932834023767e-29
  10. 6.45388259593527e-33
  11. 7.58780669533798e-37
In [4]:
par(mfrow = c(2, 1))
x <- seq(-0.01, 5, 0.01)
# Poisson(1) vs Binomial(100, 0.01): same mean (1); their step CDFs are nearly identical.
plot(x, ppois(x, 1), type = "s", ylab = "F(x)", main = "Poisson(1) CDF")
plot(x, pbinom(x, 100, 0.01), type = "s", ylab = "F(x)",
main = "Binomial(100, 0.01) CDF")
In [1]:
runif(n = 29, min = 18, max = 54)   # uniform on [18, 54]
# NOTE(review): for rnorm, 18 and 54 are mean and sd, NOT bounds -- which is
# why the recorded output below contains negative values and values over 100.
rnorm(n = 29, mean = 18, sd = 54)
  1. 27.1955712251365
  2. 22.0233255587518
  3. 35.0356402471662
  4. 41.0560952872038
  5. 22.6900303848088
  6. 30.5824034716934
  7. 25.4327502027154
  8. 50.1896587181836
  9. 20.1789485262707
  10. 36.9963150713593
  11. 21.2678740303963
  12. 23.9648874476552
  13. 30.4179275482893
  14. 48.2265644576401
  15. 40.9570884061977
  16. 34.6769786356017
  17. 47.026730700396
  18. 18.034056100063
  19. 46.7995091322809
  20. 37.6622973773628
  21. 32.2035112883896
  22. 26.6470994967967
  23. 48.5707268184051
  24. 48.9341514632106
  25. 26.6191097907722
  26. 27.4681842941791
  27. 37.534052121453
  28. 26.3580023422837
  29. 23.7684020083398
  1. 39.8117024806976
  2. 29.4573689602028
  3. -94.7644627976712
  4. -20.7634519477918
  5. -16.9451765407427
  6. 23.3206706764095
  7. 125.682222171796
  8. 37.6672247533619
  9. -17.2525743800105
  10. -66.583075730557
  11. 85.8411946408998
  12. 63.048591649908
  13. 20.5254530029275
  14. -18.3281862784561
  15. -108.817279760943
  16. -63.4191879861618
  17. 94.8814387315986
  18. 25.3859942591542
  19. -99.320096493243
  20. -70.7866095506014
  21. 26.6845843618733
  22. -4.89875193393108
  23. 8.18536230128922
  24. 68.9069812351441
  25. 26.9579577433558
  26. 57.7658130026427
  27. -8.93949980150902
  28. -10.5879118681303
  29. 59.9903582529538