Chapter 5.1.5 Exercises

descr_stats
In [4]:
setwd("./data")
load( "parenthood.Rdata" )
library(lsr)
who(TRUE)
   -- Name --     -- Class --   -- Size --
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100       
In [6]:
# If I calculate my descriptive statistics using the describe() function

library(psych)
describe( parenthood2 )

# we can see from the n column that there are 9 missing values for dan.sleep, 11 missing values for baby.sleep and 8 missing values for dan.grump.
varsnmeansdmediantrimmedmadminmaxrangeskewkurtosisse
dan.sleep1 91 6.976923 1.020409 7.03 7.022740 1.126776 4.84 9.00 4.16 -0.33466179-0.7278902 0.1069679
baby.sleep2 89 8.114494 2.046821 8.20 8.129452 2.283204 3.25 12.07 8.82 -0.09287191-0.5943754 0.2169626
dan.grump3 92 63.152174 9.851574 61.00 62.662162 10.378200 41.00 89.00 48.00 0.38335860-0.3144273 1.0270976
day4 100 50.500000 29.011492 50.50 50.500000 37.065000 1.00 100.00 99.00 0.00000000-1.2360552 2.9011492
In [12]:
# Similar, but not quite the same. It’s also worth noting that the correlate() function (in the lsr package) automatically uses the “pairwise complete” method:

library(lsr)
correlate(parenthood2)
$correlation
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA 0.61472303 -0.903442442-0.076796665
baby.sleep 0.61472303 NA -0.567802669 0.058309485
dan.grump-0.90344244 -0.56780267 NA 0.005833399
day-0.07679667 0.05830949 0.005833399 NA
$p.value
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA6.565004e-099.389475e-31 1
baby.sleep6.565004e-09 NA1.060058e-07 1
dan.grump9.389475e-311.060058e-07 NA 1
day1.000000e+001.000000e+001.000000e+00NA
$sample.size
dan.sleepbaby.sleepdan.grumpday
dan.sleep91 80 83 91
baby.sleep80 89 82 89
dan.grump83 82 92 92
day91 89 92 100
$args
two.inputs
'FALSE'
test
'FALSE'
corr.method
'pearson'
p.adjust.method
'holm'
$tiesProblem
FALSE
In [12]:
# Similar, but not quite the same. It’s also worth noting that the correlate() function (in the lsr package) automatically uses the “pairwise complete” method:

library(lsr)
correlate(parenthood2)
$correlation
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA 0.61472303 -0.903442442-0.076796665
baby.sleep 0.61472303 NA -0.567802669 0.058309485
dan.grump-0.90344244 -0.56780267 NA 0.005833399
day-0.07679667 0.05830949 0.005833399 NA
$p.value
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA6.565004e-099.389475e-31 1
baby.sleep6.565004e-09 NA1.060058e-07 1
dan.grump9.389475e-311.060058e-07 NA 1
day1.000000e+001.000000e+001.000000e+00NA
$sample.size
dan.sleepbaby.sleepdan.grumpday
dan.sleep91 80 83 91
baby.sleep80 89 82 89
dan.grump83 82 92 92
day91 89 92 100
$args
two.inputs
'FALSE'
test
'FALSE'
corr.method
'pearson'
p.adjust.method
'holm'
$tiesProblem
FALSE
In [4]:
# we see that the file contains a single data frame called parenthood, which contains four variables dan.sleep, baby.sleep, dan.grump and day. If we peek at the data using head() out the data, here’s what we get:

head(parenthood,10)
dan.sleepbaby.sleepdan.grumpday
7.59 10.1856 1
7.91 11.6660 2
5.14 7.9282 3
7.71 9.6155 4
6.68 9.7567 5
5.99 5.0472 6
8.19 10.4553 7
7.19 8.2760 8
7.40 6.0660 9
6.58 7.0971 10
In [6]:
# Next, I’ll calculate some basic descriptive statistics:
library(psych)
describe( parenthood )
varsnmeansdmediantrimmedmadminmaxrangeskewkurtosisse
dan.sleep1 100 6.9652 1.015884 7.03 7.003125 1.089711 4.84 9.00 4.16 -0.28683284-0.7225107 0.1015884
baby.sleep2 100 8.0492 2.074232 7.95 8.047750 2.327682 3.25 12.07 8.82 -0.02310118-0.6893588 0.2074232
dan.grump3 100 63.7100 10.049670 62.00 63.162500 9.636900 41.00 91.00 50.00 0.43410145-0.1576624 1.0049670
day4 100 50.5000 29.011492 50.50 50.500000 37.065000 1.00 100.00 99.00 0.00000000-1.2360552 2.9011492
In [10]:
library(ggplot2)
library(dplyr)
library(moderndive)
library(ISLR)
library(skimr)

Balance <- lm(dan.sleep ~ dan.grump, data = parenthood)
get_regression_table(Balance)

ggplot(parenthood, aes(x = dan.grump, y = dan.sleep)) +
  geom_point() +
  labs(x = "grump", y = "sleep", 
       title = "parenthood") +
  geom_smooth(method = "lm")
termestimatestd_errorstatisticp_valuelower_ciupper_ci
intercept12.783 0.282 45.267 0 12.223 13.344
dan.grump-0.091 0.004 -20.854 0 -0.100 -0.083
In [12]:
# However, the cor() function is a bit more powerful than this simple example suggests. For example, you can also calculate a complete “correlation matrix”, between all pairs of variables in the data frame.
# correlate all pairs of variables in "parenthood":

cor( x = parenthood )  
correlate(parenthood)
dan.sleepbaby.sleepdan.grumpday
dan.sleep 1.00000000 0.62794934-0.90338404-0.09840768
baby.sleep 0.62794934 1.00000000-0.56596373-0.01043394
dan.grump-0.90338404-0.56596373 1.00000000 0.07647926
day-0.09840768-0.01043394 0.07647926 1.00000000
$correlation
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA 0.62794934-0.90338404-0.09840768
baby.sleep 0.62794934 NA-0.56596373-0.01043394
dan.grump-0.90338404-0.56596373 NA 0.07647926
day-0.09840768-0.01043394 0.07647926 NA
$p.value
dan.sleepbaby.sleepdan.grumpday
dan.sleep NA1.348133e-114.905856e-370.9900633
baby.sleep1.348133e-11 NA3.379004e-090.9900633
dan.grump4.905856e-373.379004e-09 NA0.9900633
day9.900633e-019.900633e-019.900633e-01 NA
$sample.size
dan.sleepbaby.sleepdan.grumpday
dan.sleep100100100100
baby.sleep100100100100
dan.grump100100100100
day100100100100
$args
two.inputs
'FALSE'
test
'FALSE'
corr.method
'pearson'
p.adjust.method
'holm'
$tiesProblem
FALSE
In [4]:
require(graphics)

y <- rt(200, df = 5)
qqnorm(y); qqline(y, col = 2)
In [11]:
qqplot(y, rt(300, df = 5))
In [36]:
library(knitr)
knitr::kable(
rbind(
c("-1.0 to -0.9" ,"Very strong", "Negative"),
c("-0.9 to -0.7", "Strong", "Negative") ,
c("-0.7 to -0.4", "Moderate", "Negative") ,
c("-0.4 to -0.2", "Weak", "Negative"),
c("-0.2 to 0","Negligible", "Negative") ,
c("0 to 0.2","Negligible", "Positive"),
c("0.2 to 0.4", "Weak", "Positive"), 
c("0.4 to 0.7", "Moderate", "Positive"), 
c("0.7 to 0.9", "Strong", "Positive"), 
c("0.9 to 1.0", "Very strong", "Positive")), col.names=c("Correlation", "Strength", "Direction"),
  booktabs = TRUE)

|Correlation  |Strength    |Direction |
|:------------|:-----------|:---------|
|-1.0 to -0.9 |Very strong |Negative  |
|-0.9 to -0.7 |Strong      |Negative  |
|-0.7 to -0.4 |Moderate    |Negative  |
|-0.4 to -0.2 |Weak        |Negative  |
|-0.2 to 0    |Negligible  |Negative  |
|0 to 0.2     |Negligible  |Positive  |
|0.2 to 0.4   |Weak        |Positive  |
|0.4 to 0.7   |Moderate    |Positive  |
|0.7 to 0.9   |Strong      |Positive  |
|0.9 to 1.0   |Very strong |Positive  |
In [11]:
# You’d think that these four data setswould look pretty similar to one another. They do not.

cor( anscombe$x1, anscombe$y1 )
cor( anscombe$x2, anscombe$y2 )
0.81642051634484
0.816236506000243
In [16]:
load( "effort.Rdata" ) 
who(TRUE)

# The raw data look like this:

effort
   -- Name --     -- Class --   -- Size --
   effort         data.frame    10 x 2    
    $hours        numeric       10        
    $grade        numeric       10        
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100       
hoursgrade
213
7691
4079
614
1621
2874
2747
5985
4684
6888
In [22]:
#We can get R to construct these rankings using the rank() function, like this:

hours.rank <- rank( effort$hours )   # rank students by hours worked
grade.rank <- rank( effort$grade )   # rank students by grade received
hours.rank
grade.rank
  1. 1
  2. 10
  3. 6
  4. 2
  5. 3
  6. 5
  7. 4
  8. 8
  9. 7
  10. 9
  1. 1
  2. 10
  3. 6
  4. 2
  5. 3
  6. 5
  7. 4
  8. 8
  9. 7
  10. 9
In [23]:
# It’s much easier to just specify the method argument of the cor() function.

cor( effort$hours, effort$grade, method = "spearman")

# The default value of the method argument is "pearson", which is why we didn’t have to specify it earlier on when we were doing Pearson correlations.
1
In [24]:
load("work.Rdata")
who(TRUE)
   -- Name --     -- Class --   -- Size --
   effort         data.frame    10 x 2    
    $hours        numeric       10        
    $grade        numeric       10        
   grade.rank     numeric       10        
   hours.rank     numeric       10        
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100       
   work           data.frame    49 x 7    
    $hours        numeric       49        
    $tasks        numeric       49        
    $pay          numeric       49        
    $day          integer       49        
    $weekday      factor        49        
    $week         numeric       49        
    $day.type     factor        49        
In [30]:
correlate(work)
$correlation
hourstaskspaydayweekdayweekday.type
hours NA 0.800172470.7604283 -0.04944665NA 0.01822231NA
tasks 0.80017247 NA0.7202438 -0.07154406NA -0.01273831NA
pay 0.76042829 0.72024377 NA 0.13652421NA 0.19643348NA
day-0.04944665-0.071544060.1365242 NANA 0.98994949NA
weekday NA NA NA NANA NANA
week 0.01822231-0.012738310.1964335 0.98994949NA NANA
day.type NA NA NA NANA NANA
$p.value
hourstaskspaydayweekdayweekday.type
hours NA4.732297e-111.854085e-091.000000e+00NA 1.000000e+00NA
tasks4.732297e-11 NA3.793229e-081.000000e+00NA 1.000000e+00NA
pay1.854085e-093.793229e-08 NA1.000000e+00NA 1.000000e+00NA
day1.000000e+001.000000e+001.000000e+00 NANA 1.386732e-40NA
weekday NA NA NA NANA NANA
week1.000000e+001.000000e+001.000000e+001.386732e-40NA NANA
day.type NA NA NA NANA NANA
$sample.size
hourstaskspaydayweekdayweekday.type
hours49494949494949
tasks49494949494949
pay49494949494949
day49494949494949
weekday49494949494949
week49494949494949
day.type49494949494949
$args
two.inputs
'FALSE'
test
'FALSE'
corr.method
'pearson'
p.adjust.method
'holm'
$tiesProblem
FALSE
In [3]:
# let’s load the aflsmall.Rdata file, and use the who() function in the lsr package to see what variables are stored in the file:
setwd("./data")
load( "aflsmall.Rdata" )
library(lsr)
who( expand = TRUE )
   -- Name --      -- Class --   -- Size --
   afl.finalists   factor        400       
   afl.margins     numeric       176       
In [5]:
print(afl.margins)
  [1]  56  31  56   8  32  14  36  56  19   1   3 104  43  44  72   9  28  25
 [19]  27  55  20  16  16   7  23  40  48  64  22  55  95  15  49  52  50  10
 [37]  65  12  39  36   3  26  23  20  43 108  53  38   4   8   3  13  66  67
 [55]  50  61  36  38  29   9  81   3  26  12  36  37  70   1  35  12  50  35
 [73]   9  54  47   8  47   2  29  61  38  41  23  24   1   9  11  10  29  47
 [91]  71  38  49  65  18   0  16   9  19  36  60  24  25  44  55   3  57  83
[109]  84  35   4  35  26  22   2  14  19  30  19  68  11  75  48  32  36  39
[127]  50  11   0  63  82  26   3  82  73  19  33  48   8  10  53  20  71  75
[145]  76  54  44   5  22  94  29   8  98   9  89   1 101   7  21  52  42  21
[163] 116   3  44  29  27  16   6  44   3  28  38  29  10  10
In [6]:
hist(afl.margins)
In [14]:
# Let’s use the interquartile range to calculate the median AFL winning margin:

quantile( x = afl.margins, probs = .5)
50%: 30.5
In [26]:
X <- c(56, 31,56,8,32)   # enter the data
X.bar <- mean( X )       # step 1. the mean of the data
AD <- abs( X - X.bar )   # step 2. the absolute deviations from the mean
AAD <- mean( AD )        # step 3. the mean absolute deviations
print( AAD )             # print the results
[1] 15.52
In [3]:
setwd("./data")
partial <- c(10, 20, NA, 30)
mean( x = partial )
mean( x = partial, na.rm = TRUE )

# Notice that the mean is 20 (i.e., 60 / 3) and not 15. When R ignores a NA value, it genuinely ignores it. In effect, the calculation above is identical to what you’d get if you asked for the mean of the three-element vector c(10, 20, 30).
<NA>
20
In [6]:
qqnorm(precip, ylab = "Precipitation [in/yr] for 70 US cities")
In [9]:
## "QQ-Chisquare" : --------------------------

y <- rchisq(500, df = 3)

## Q-Q plot for Chi^2 data against true theoretical distribution:

qqplot(qchisq(ppoints(500), df = 3), y,
main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3]))
qqline(y, distribution = function(p) qchisq(p, df = 3),
prob = c(0.1, 0.6), col = 2)
mtext("qqline(*, dist = qchisq(., df=3), prob = c(0.1, 0.6))")
In [2]:
# show plot using runquantile
library(caTools)
k=31; n=200;
x = rnorm(n,sd=30) + abs(seq(n)-n/4)
y=runquantile(x, k, probs=c(0.05, 0.25, 0.5, 0.75, 0.95))
col = c("black", "red", "green", "blue", "magenta", "cyan")
plot(x, col=col[1], main = "Moving Window Quantiles")
lines(y[,1], col=col[2])
lines(y[,2], col=col[3])
lines(y[,3], col=col[4])
lines(y[,4], col=col[5])
lines(y[,5], col=col[6])
lab = c("data", "runquantile(.05)", "runquantile(.25)", "runquantile(0.5)",
"runquantile(.75)", "runquantile(.95)")
legend(0,230, lab, col=col, lty=1 )
In [29]:
setwd("~/Documents/learning/current/R/r-intro/rbook/data")
load( "aflsmall.Rdata" )

summary( object = afl.margins )
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   12.75   30.50   35.30   50.50  116.00 
In [34]:
# There’s a single data frame called clin.trial which contains three variables, drug, therapy and mood.gain. Presumably then, this data is from a clinical trial of some kind, in which people were administered different drugs; and the researchers looked to see what the drugs did to their mood. Let’s see if the summary() function sheds a little more light on this situation:

clin.trial
summary( clin.trial )

# Evidently there were three drugs: a placebo, something called “anxifree” and something called “joyzepam”; and there were 6 people administered each drug. There were 9 people treated using cognitive behavioural therapy (CBT) and 9 people who received no psychological treatment. 
# And we can see from looking at the summary of the mood.gain variable that most people did show a mood gain (mean =.88), though without knowing what the scale is here it’s hard to say much more than that. Still, that’s not too bad. Overall, I feel that I learned something from that.
drugtherapymood.gain
placebo no.therapy0.5
placebo no.therapy0.3
placebo no.therapy0.1
anxifree no.therapy0.6
anxifree no.therapy0.4
anxifree no.therapy0.2
joyzepam no.therapy1.4
joyzepam no.therapy1.7
joyzepam no.therapy1.3
placebo CBT 0.6
placebo CBT 0.9
placebo CBT 0.3
anxifree CBT 1.1
anxifree CBT 0.8
anxifree CBT 1.2
joyzepam CBT 1.8
joyzepam CBT 1.3
joyzepam CBT 1.4
       drug         therapy    mood.gain     
 placebo :6   no.therapy:9   Min.   :0.1000  
 anxifree:6   CBT       :9   1st Qu.:0.4250  
 joyzepam:6                  Median :0.8500  
                             Mean   :0.8833  
                             3rd Qu.:1.3000  
                             Max.   :1.8000  
In [26]:
# let’s use the describe() function to have a look at the clin.trial data frame. Here’s what we get:
library(psych)
describe( x = clin.trial )
# As you can see, the output for the asterisked variables is pretty meaningless, and should be ignored. However, for the mood.gain variable, there’s a lot of useful information.
varsnmeansdmediantrimmedmadminmaxrangeskewkurtosisse
drug*1 18 2.00000000.84016812.00 2.000 1.48260 1.0 3.0 2.0 0.0000000-1.6620370.1980295
therapy*2 18 1.50000000.51449581.50 1.500 0.74130 1.0 2.0 1.0 0.0000000-2.1080250.1212678
mood.gain3 18 0.88333330.53385390.85 0.875 0.66717 0.1 1.8 1.7 0.1333981-1.4397390.1258306
In [31]:
# For instance, let’s say, I want to look at the descriptive statistics for the clin.trial data, broken down separately by therapy type. The command I would use here is:

describeBy( x=clin.trial, group=clin.trial$therapy )
 Descriptive statistics by group 
group: no.therapy
          vars n mean   sd median trimmed  mad min max range skew kurtosis   se
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
------------------------------------------------------------ 
group: CBT
          vars n mean   sd median trimmed  mad min max range  skew kurtosis
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
            se
drug*     0.29
therapy*  0.00
mood.gain 0.15
In [32]:
# To give a sense of how powerful by() is, you can reproduce the describeBy() function by using a command like this:

by( data=clin.trial, INDICES=clin.trial$therapy, FUN=describe )
clin.trial$therapy: no.therapy
          vars n mean   sd median trimmed  mad min max range skew kurtosis   se
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
------------------------------------------------------------ 
clin.trial$therapy: CBT
          vars n mean   sd median trimmed  mad min max range  skew kurtosis
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
            se
drug*     0.29
therapy*  0.00
mood.gain 0.15
In [33]:
# This will produce the exact same output as the command shown earlier. However, there’s nothing special about the describe() function. You could just as easily use the by() function in conjunction with the summary() function. For example:

by( data=clin.trial, INDICES=clin.trial$therapy, FUN=summary )
clin.trial$therapy: no.therapy
       drug         therapy    mood.gain     
 placebo :3   no.therapy:9   Min.   :0.1000  
 anxifree:3   CBT       :0   1st Qu.:0.3000  
 joyzepam:3                  Median :0.5000  
                             Mean   :0.7222  
                             3rd Qu.:1.3000  
                             Max.   :1.7000  
------------------------------------------------------------ 
clin.trial$therapy: CBT
       drug         therapy    mood.gain    
 placebo :3   no.therapy:0   Min.   :0.300  
 anxifree:3   CBT       :9   1st Qu.:0.800  
 joyzepam:3                  Median :1.100  
                             Mean   :1.044  
                             3rd Qu.:1.300  
                             Max.   :1.800  
In [35]:
# to obtain group means, use this command:

 aggregate( formula = mood.gain ~ drug + therapy,  # mood.gain by drug/therapy combination
            data = clin.trial,                     # data is in the clin.trial data frame
            FUN = mean                             # print out group means
 )
drugtherapymood.gain
placebo no.therapy0.300000
anxifree no.therapy0.400000
joyzepam no.therapy1.466667
placebo CBT 0.600000
anxifree CBT 1.033333
joyzepam CBT 1.500000
In [36]:
# or, alternatively, if you want to calculate the standard deviations for each group, you would use the following command (argument names omitted this time):

aggregate( mood.gain ~ drug + therapy, clin.trial, sd )
drugtherapymood.gain
placebo no.therapy0.2000000
anxifree no.therapy0.2000000
joyzepam no.therapy0.2081666
placebo CBT 0.3000000
anxifree CBT 0.2081666
joyzepam CBT 0.2645751