setwd("./data")
load( "parenthood.Rdata" )
library(lsr)
who(TRUE)

   -- Name --     -- Class --   -- Size --
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100

# If I calculate my descriptive statistics using the describe() function

library(psych)
describe( parenthood2 )

# we can see from the n column that there are 9 missing values for dan.sleep, 11 missing values for baby.sleep and 8 missing values for dan.grump.

# Similar, but not quite the same. It’s also worth noting that the correlate() function (in the lsr package) automatically uses the “pairwise complete” method:

library(lsr)
correlate(parenthood2)

# Similar, but not quite the same. It’s also worth noting that the correlate() function (in the lsr package) automatically uses the “pairwise complete” method:

library(lsr)
correlate(parenthood2)

# we see that the file contains a single data frame called parenthood, which contains four variables dan.sleep, baby.sleep, dan.grump and day. If we peek at the data using head() out the data, here’s what we get:

head(parenthood,10)

# Next, I’ll calculate some basic descriptive statistics:
library(psych)
describe( parenthood )

library(ggplot2)
library(dplyr)
library(moderndive)
library(ISLR)
library(skimr)

Balance <- lm(dan.sleep ~ dan.grump, data = parenthood)
get_regression_table(Balance)

ggplot(parenthood, aes(x = dan.grump, y = dan.sleep)) +
  geom_point() +
  labs(x = "grump", y = "sleep", 
       title = "parenthood") +
  geom_smooth(method = "lm")

# However, the cor() function is a bit more powerful than this simple example suggests. For example, you can also calculate a complete “correlation matrix”, between all pairs of variables in the data frame.
# correlate all pairs of variables in "parenthood":

cor( x = parenthood )  
correlate(parenthood)

require(graphics)

y <- rt(200, df = 5)
qqnorm(y); qqline(y, col = 2)

qqplot(y, rt(300, df = 5))

library(knitr)
knitr::kable(
rbind(
c("-1.0 to -0.9" ,"Very strong", "Negative"),
c("-0.9 to -0.7", "Strong", "Negative") ,
c("-0.7 to -0.4", "Moderate", "Negative") ,
c("-0.4 to -0.2", "Weak", "Negative"),
c("-0.2 to 0","Negligible", "Negative") ,
c("0 to 0.2","Negligible", "Positive"),
c("0.2 to 0.4", "Weak", "Positive"), 
c("0.4 to 0.7", "Moderate", "Positive"), 
c("0.7 to 0.9", "Strong", "Positive"), 
c("0.9 to 1.0", "Very strong", "Positive")), col.names=c("Correlation", "Strength", "Direction"),
  booktabs = TRUE)


|Correlation  |Strength    |Direction |
|:------------|:-----------|:---------|
|-1.0 to -0.9 |Very strong |Negative  |
|-0.9 to -0.7 |Strong      |Negative  |
|-0.7 to -0.4 |Moderate    |Negative  |
|-0.4 to -0.2 |Weak        |Negative  |
|-0.2 to 0    |Negligible  |Negative  |
|0 to 0.2     |Negligible  |Positive  |
|0.2 to 0.4   |Weak        |Positive  |
|0.4 to 0.7   |Moderate    |Positive  |
|0.7 to 0.9   |Strong      |Positive  |
|0.9 to 1.0   |Very strong |Positive  |

# You’d think that these four data setswould look pretty similar to one another. They do not.

cor( anscombe$x1, anscombe$y1 )
cor( anscombe$x2, anscombe$y2 )

load( "effort.Rdata" ) 
who(TRUE)

# The raw data look like this:

effort

   -- Name --     -- Class --   -- Size --
   effort         data.frame    10 x 2    
    $hours        numeric       10        
    $grade        numeric       10        
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100

#We can get R to construct these rankings using the rank() function, like this:

hours.rank <- rank( effort$hours )   # rank students by hours worked
grade.rank <- rank( effort$grade )   # rank students by grade received
hours.rank
grade.rank

# It’s much easier to just specify the method argument of the cor() function.

cor( effort$hours, effort$grade, method = "spearman")

# The default value of the method argument is "pearson", which is why we didn’t have to specify it earlier on when we were doing Pearson correlations.

load("work.Rdata")
who(TRUE)

   -- Name --     -- Class --   -- Size --
   effort         data.frame    10 x 2    
    $hours        numeric       10        
    $grade        numeric       10        
   grade.rank     numeric       10        
   hours.rank     numeric       10        
   parenthood     data.frame    100 x 4   
    $dan.sleep    numeric       100       
    $baby.sleep   numeric       100       
    $dan.grump    numeric       100       
    $day          integer       100       
   work           data.frame    49 x 7    
    $hours        numeric       49        
    $tasks        numeric       49        
    $pay          numeric       49        
    $day          integer       49        
    $weekday      factor        49        
    $week         numeric       49        
    $day.type     factor        49

correlate(work)

# let’s load the aflsmall.Rdata file, and use the who() function in the lsr package to see what variables are stored in the file:
setwd("./data")
load( "aflsmall.Rdata" )
library(lsr)
who( expand = TRUE )

   -- Name --      -- Class --   -- Size --
   afl.finalists   factor        400       
   afl.margins     numeric       176

print(afl.margins)

  [1]  56  31  56   8  32  14  36  56  19   1   3 104  43  44  72   9  28  25
 [19]  27  55  20  16  16   7  23  40  48  64  22  55  95  15  49  52  50  10
 [37]  65  12  39  36   3  26  23  20  43 108  53  38   4   8   3  13  66  67
 [55]  50  61  36  38  29   9  81   3  26  12  36  37  70   1  35  12  50  35
 [73]   9  54  47   8  47   2  29  61  38  41  23  24   1   9  11  10  29  47
 [91]  71  38  49  65  18   0  16   9  19  36  60  24  25  44  55   3  57  83
[109]  84  35   4  35  26  22   2  14  19  30  19  68  11  75  48  32  36  39
[127]  50  11   0  63  82  26   3  82  73  19  33  48   8  10  53  20  71  75
[145]  76  54  44   5  22  94  29   8  98   9  89   1 101   7  21  52  42  21
[163] 116   3  44  29  27  16   6  44   3  28  38  29  10  10

hist(afl.margins)

# Let’s use the interquartile range to calculate the median AFL winning margin:

quantile( x = afl.margins, probs = .5)

X <- c(56, 31,56,8,32)   # enter the data
X.bar <- mean( X )       # step 1. the mean of the data
AD <- abs( X - X.bar )   # step 2. the absolute deviations from the mean
AAD <- mean( AD )        # step 3. the mean absolute deviations
print( AAD )             # print the results

[1] 15.52

setwd("./data")
partial <- c(10, 20, NA, 30)
mean( x = partial )
mean( x = partial, na.rm = TRUE )

# Notice that the mean is 20 (i.e., 60 / 3) and not 15. When R ignores a NA value, it genuinely ignores it. In effect, the calculation above is identical to what you’d get if you asked for the mean of the three-element vector c(10, 20, 30).

qqnorm(precip, ylab = "Precipitation [in/yr] for 70 US cities")

## "QQ-Chisquare" : --------------------------

y <- rchisq(500, df = 3)

## Q-Q plot for Chi^2 data against true theoretical distribution:

qqplot(qchisq(ppoints(500), df = 3), y,
main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3]))
qqline(y, distribution = function(p) qchisq(p, df = 3),
prob = c(0.1, 0.6), col = 2)
mtext("qqline(*, dist = qchisq(., df=3), prob = c(0.1, 0.6))")

# show plot using runquantile
library(caTools)
k=31; n=200;
x = rnorm(n,sd=30) + abs(seq(n)-n/4)
y=runquantile(x, k, probs=c(0.05, 0.25, 0.5, 0.75, 0.95))
col = c("black", "red", "green", "blue", "magenta", "cyan")
plot(x, col=col[1], main = "Moving Window Quantiles")
lines(y[,1], col=col[2])
lines(y[,2], col=col[3])
lines(y[,3], col=col[4])
lines(y[,4], col=col[5])
lines(y[,5], col=col[6])
lab = c("data", "runquantile(.05)", "runquantile(.25)", "runquantile(0.5)",
"runquantile(.75)", "runquantile(.95)")
legend(0,230, lab, col=col, lty=1 )

setwd("~/Documents/learning/current/R/r-intro/rbook/data")
load( "aflsmall.Rdata" )

summary( object = afl.margins )

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   12.75   30.50   35.30   50.50  116.00

# There’s a single data frame called clin.trial which contains three variables, drug, therapy and mood.gain. Presumably then, this data is from a clinical trial of some kind, in which people were administered different drugs; and the researchers looked to see what the drugs did to their mood. Let’s see if the summary() function sheds a little more light on this situation:

clin.trial
summary( clin.trial )

# Evidently there were three drugs: a placebo, something called “anxifree” and something called “joyzepam”; and there were 6 people administered each drug. There were 9 people treated using cognitive behavioural therapy (CBT) and 9 people who received no psychological treatment. 
# And we can see from looking at the summary of the mood.gain variable that most people did show a mood gain (mean =.88), though without knowing what the scale is here it’s hard to say much more than that. Still, that’s not too bad. Overall, I feel that I learned something from that.

       drug         therapy    mood.gain     
 placebo :6   no.therapy:9   Min.   :0.1000  
 anxifree:6   CBT       :9   1st Qu.:0.4250  
 joyzepam:6                  Median :0.8500  
                             Mean   :0.8833  
                             3rd Qu.:1.3000  
                             Max.   :1.8000

# let’s use the describe() function to have a look at the clin.trial data frame. Here’s what we get:
library(psych)
describe( x = clin.trial )
# As you can see, the output for the asterisked variables is pretty meaningless, and should be ignored. However, for the mood.gain variable, there’s a lot of useful information.

# For instance, let’s say, I want to look at the descriptive statistics for the clin.trial data, broken down separately by therapy type. The command I would use here is:

describeBy( x=clin.trial, group=clin.trial$therapy )

 Descriptive statistics by group 
group: no.therapy
          vars n mean   sd median trimmed  mad min max range skew kurtosis   se
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
------------------------------------------------------------ 
group: CBT
          vars n mean   sd median trimmed  mad min max range  skew kurtosis
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
            se
drug*     0.29
therapy*  0.00
mood.gain 0.15

# To give a sense of how powerful by() is, you can reproduce the describeBy() function by using a command like this:

by( data=clin.trial, INDICES=clin.trial$therapy, FUN=describe )

clin.trial$therapy: no.therapy
          vars n mean   sd median trimmed  mad min max range skew kurtosis   se
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
------------------------------------------------------------ 
clin.trial$therapy: CBT
          vars n mean   sd median trimmed  mad min max range  skew kurtosis
drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
            se
drug*     0.29
therapy*  0.00
mood.gain 0.15

# This will produce the exact same output as the command shown earlier. However, there’s nothing special about the describe() function. You could just as easily use the by() function in conjunction with the summary() function. For example:

by( data=clin.trial, INDICES=clin.trial$therapy, FUN=summary )

clin.trial$therapy: no.therapy
       drug         therapy    mood.gain     
 placebo :3   no.therapy:9   Min.   :0.1000  
 anxifree:3   CBT       :0   1st Qu.:0.3000  
 joyzepam:3                  Median :0.5000  
                             Mean   :0.7222  
                             3rd Qu.:1.3000  
                             Max.   :1.7000  
------------------------------------------------------------ 
clin.trial$therapy: CBT
       drug         therapy    mood.gain    
 placebo :3   no.therapy:0   Min.   :0.300  
 anxifree:3   CBT       :9   1st Qu.:0.800  
 joyzepam:3                  Median :1.100  
                             Mean   :1.044  
                             3rd Qu.:1.300  
                             Max.   :1.800

# to obtain group means, use this command:

 aggregate( formula = mood.gain ~ drug + therapy,  # mood.gain by drug/therapy combination
            data = clin.trial,                     # data is in the clin.trial data frame
            FUN = mean                             # print out group means
 )

# or, alternatively, if you want to calculate the standard deviations for each group, you would use the following command (argument names omitted this time):

aggregate( mood.gain ~ drug + therapy, clin.trial, sd )

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
dan.sleep	1	91	6.976923	1.020409	7.03	7.022740	1.126776	4.84	9.00	4.16	-0.33466179	-0.7278902	0.1069679
baby.sleep	2	89	8.114494	2.046821	8.20	8.129452	2.283204	3.25	12.07	8.82	-0.09287191	-0.5943754	0.2169626
dan.grump	3	92	63.152174	9.851574	61.00	62.662162	10.378200	41.00	89.00	48.00	0.38335860	-0.3144273	1.0270976
day	4	100	50.500000	29.011492	50.50	50.500000	37.065000	1.00	100.00	99.00	0.00000000	-1.2360552	2.9011492

dan.sleep	baby.sleep	dan.grump	day
7.59	10.18	56	1
7.91	11.66	60	2
5.14	7.92	82	3
7.71	9.61	55	4
6.68	9.75	67	5
5.99	5.04	72	6
8.19	10.45	53	7
7.19	8.27	60	8
7.40	6.06	60	9
6.58	7.09	71	10

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
dan.sleep	1	100	6.9652	1.015884	7.03	7.003125	1.089711	4.84	9.00	4.16	-0.28683284	-0.7225107	0.1015884
baby.sleep	2	100	8.0492	2.074232	7.95	8.047750	2.327682	3.25	12.07	8.82	-0.02310118	-0.6893588	0.2074232
dan.grump	3	100	63.7100	10.049670	62.00	63.162500	9.636900	41.00	91.00	50.00	0.43410145	-0.1576624	1.0049670
day	4	100	50.5000	29.011492	50.50	50.500000	37.065000	1.00	100.00	99.00	0.00000000	-1.2360552	2.9011492

term	estimate	std_error	statistic	p_value	lower_ci	upper_ci
intercept	12.783	0.282	45.267	0	12.223	13.344
dan.grump	-0.091	0.004	-20.854	0	-0.100	-0.083

	dan.sleep	baby.sleep	dan.grump	day
dan.sleep	1.00000000	0.62794934	-0.90338404	-0.09840768
baby.sleep	0.62794934	1.00000000	-0.56596373	-0.01043394
dan.grump	-0.90338404	-0.56596373	1.00000000	0.07647926
day	-0.09840768	-0.01043394	0.07647926	1.00000000

R Programming & Statistics Notes

Chapter 5.1.5 Exercises

	dan.sleep	baby.sleep	dan.grump	day
dan.sleep	NA	0.61472303	-0.903442442	-0.076796665
baby.sleep	0.61472303	NA	-0.567802669	0.058309485
dan.grump	-0.90344244	-0.56780267	NA	0.005833399
day	-0.07679667	0.05830949	0.005833399	NA

	dan.sleep	baby.sleep	dan.grump	day
dan.sleep	NA	6.565004e-09	9.389475e-31	1
baby.sleep	6.565004e-09	NA	1.060058e-07	1
dan.grump	9.389475e-31	1.060058e-07	NA	1
day	1.000000e+00	1.000000e+00	1.000000e+00	NA

	dan.sleep	baby.sleep	dan.grump	day
dan.sleep	NA	1.348133e-11	4.905856e-37	0.9900633
baby.sleep	1.348133e-11	NA	3.379004e-09	0.9900633
dan.grump	4.905856e-37	3.379004e-09	NA	0.9900633
day	9.900633e-01	9.900633e-01	9.900633e-01	NA

	hours	tasks	pay	day	weekday	week	day.type
hours	NA	0.80017247	0.7604283	-0.04944665	NA	0.01822231	NA
tasks	0.80017247	NA	0.7202438	-0.07154406	NA	-0.01273831	NA
pay	0.76042829	0.72024377	NA	0.13652421	NA	0.19643348	NA
day	-0.04944665	-0.07154406	0.1365242	NA	NA	0.98994949	NA
weekday	NA	NA	NA	NA	NA	NA	NA
week	0.01822231	-0.01273831	0.1964335	0.98994949	NA	NA	NA
day.type	NA	NA	NA	NA	NA	NA	NA

	hours	tasks	pay	day	weekday	week	day.type
hours	NA	4.732297e-11	1.854085e-09	1.000000e+00	NA	1.000000e+00	NA
tasks	4.732297e-11	NA	3.793229e-08	1.000000e+00	NA	1.000000e+00	NA
pay	1.854085e-09	3.793229e-08	NA	1.000000e+00	NA	1.000000e+00	NA
day	1.000000e+00	1.000000e+00	1.000000e+00	NA	NA	1.386732e-40	NA
weekday	NA	NA	NA	NA	NA	NA	NA
week	1.000000e+00	1.000000e+00	1.000000e+00	1.386732e-40	NA	NA	NA
day.type	NA	NA	NA	NA	NA	NA	NA

drug	therapy	mood.gain
placebo	no.therapy	0.5
placebo	no.therapy	0.3
placebo	no.therapy	0.1
anxifree	no.therapy	0.6
anxifree	no.therapy	0.4
anxifree	no.therapy	0.2
joyzepam	no.therapy	1.4
joyzepam	no.therapy	1.7
joyzepam	no.therapy	1.3
placebo	CBT	0.6
placebo	CBT	0.9
placebo	CBT	0.3
anxifree	CBT	1.1
anxifree	CBT	0.8
anxifree	CBT	1.2
joyzepam	CBT	1.8
joyzepam	CBT	1.3
joyzepam	CBT	1.4

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
drug*	1	18	2.0000000	0.8401681	2.00	2.000	1.48260	1.0	3.0	2.0	0.0000000	-1.662037	0.1980295
therapy*	2	18	1.5000000	0.5144958	1.50	1.500	0.74130	1.0	2.0	1.0	0.0000000	-2.108025	0.1212678
mood.gain	3	18	0.8833333	0.5338539	0.85	0.875	0.66717	0.1	1.8	1.7	0.1333981	-1.439739	0.1258306

drug	therapy	mood.gain
placebo	no.therapy	0.300000
anxifree	no.therapy	0.400000
joyzepam	no.therapy	1.466667
placebo	CBT	0.600000
anxifree	CBT	1.033333
joyzepam	CBT	1.500000

drug	therapy	mood.gain
placebo	no.therapy	0.2000000
anxifree	no.therapy	0.2000000
joyzepam	no.therapy	0.2081666
placebo	CBT	0.3000000
anxifree	CBT	0.2081666
joyzepam	CBT	0.2645751