Chapter 2.1.5 Exercises

data_str
In [ ]:
x <- array(1:20, dim=c(4,5)) ; x   # Generate a 4 by 5 array.
A matrix: 4 × 5 of type int
15 91317
26101418
37111519
48121620
In [11]:
d <- outer(0:9, 0:9); d
A matrix: 10 × 10 of type dbl
00 0 0 0 0 0 0 0 0
01 2 3 4 5 6 7 8 9
02 4 6 81012141618
03 6 9121518212427
04 812162024283236
051015202530354045
061218243036424854
071421283542495663
081624324048566472
091827364554637281
In [12]:
fr <- table(outer(d, d, "-")); fr
length(fr)
dim(fr)
-81 -80 -79 -78 -77 -76 -75 -74 -73 -72 -71 -70 -69 -68 -67 -66 -65 -64 -63 -62 
 19   1   2   2   3   2   4   2   4  41   4   4   8   6   6  10   7  27  49   8 
-61 -60 -59 -58 -57 -56 -55 -54 -53 -52 -51 -50 -49 -48 -47 -46 -45 -44 -43 -42 
  8  17   8  12  18  53  13  60  12  18  22  16  35  70  22  24  66  28  18  72 
-41 -40 -39 -38 -37 -36 -35 -34 -33 -32 -31 -30 -29 -28 -27 -26 -25 -24 -23 -22 
 22  75  37  34  26 111  63  36  45  84  34  94  36  93  97  50  53 156  42  60 
-21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2 
103 107  50 168  51 140 112 116  59 191  65 126 156 185 115 206 117 179 153 156 
 -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
111 570 111 156 153 179 117 206 115 185 156 126  65 191  59 116 112 140  51 168 
 19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38 
 50 107 103  60  42 156  53  50  97  93  36  94  34  84  45  36  63 111  26  34 
 39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58 
 37  75  22  72  18  28  66  24  22  70  35  16  22  18  12  60  13  53  18  12 
 59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78 
  8  17   8   8  49  27   7  10   6   6   8   4   4  41   4   2   4   2   3   2 
 79  80  81 
  2   1  19 
163
163
In [13]:
plot(fr, xlab="Determinant", ylab="Frequency")
In [17]:
stopifnot(t(xt[,,2]) == x[,,2],
          t(xt[,,3]) == x[,,3],
          t(xt[,,4]) == x[,,4])

UCB <- aperm(UCBAdmissions, c(2,1,3))
UCB[1,,]
summary(UCB) # UCB is still a continency table
          Dept
Admit        A   B   C   D   E   F
  Admitted 512 353 120 138  53  22
  Rejected 313 207 205 279 138 351
Number of cases in table: 4526 
Number of factors: 3 
Test for independence of all factors:
	Chisq = 2000.3, df = 16, p-value = 0
In [16]:
x <- array(1:4, dim=c(2,2)) ; x
(j <- array(c(1:2,2:1), dim=c(2,2)))
as.vector(x)[as.vector(j)]
x[j]
A matrix: 2 × 2 of type int
13
24
A matrix: 2 × 2 of type int
12
21
  1. 1
  2. 2
  3. 2
  4. 1
  1. 3
  2. 2
In [37]:
# It’s also possible to have a column of a data frame that’s a matrix or array, as long as the number of rows matches the data frame. (This requires a slight extension to our definition of a data frame: it’s not the length() of each column that must be equal; but the NROW().) Like with list-columns, you must either add after creation, or wrap in I().

dfm <- data.frame(
  x = 1:3 * 10
)
dfm$y <- matrix(1:9, nrow = 3)
dfm$z <- data.frame(a = 3:1, b = letters[1:3], stringsAsFactors = FALSE)

str(dfm)
'data.frame':	3 obs. of  3 variables:
 $ x: num  10 20 30
 $ y: int [1:3, 1:3] 1 2 3 4 5 6 7 8 9
 $ z:'data.frame':	3 obs. of  2 variables:
  ..$ a: int  3 2 1
  ..$ b: chr  "a" "b" "c"
In [46]:
# Data frames allow you to label each row with a “name”, a character vector containing only unique values: 

df3 <- data.frame(
  age = c(35, 27, 18),
  hair = c("blond", "brown", "black"),
  row.names = c("Bob", "Susan", "Sam")
)
df3
agehair
Bob35 blond
Susan27 brown
Sam18 black
In [106]:
v <- data.frame("SN" = 1:2, "Age" = c(21,15), "Name" = c("John", "Dora")); v
class(v)

# In this example, x can be considered as a list of 3 components with each component having a two element vector.
SNAgeName
1 21 John
2 15 Dora
'data.frame'
In [16]:
str(v)      # structure of v
'data.frame':	2 obs. of  3 variables:
 $ SN  : int  1 2
 $ Age : num  21 15
 $ Name: Factor w/ 2 levels "Dora","John": 2 1
In [29]:
trees
GirthHeightVolume
8.370 10.3
8.665 10.3
8.863 10.2
10.572 16.4
10.781 18.8
10.883 19.7
11.066 15.6
11.075 18.2
11.180 22.6
11.275 19.9
11.379 24.2
11.476 21.0
11.476 21.4
11.769 21.3
12.075 19.1
12.974 22.2
12.985 33.8
13.386 27.4
13.771 25.7
13.864 24.9
14.078 34.5
14.280 31.7
14.574 36.3
16.072 38.3
16.377 42.6
17.381 55.4
17.582 55.7
17.980 58.3
18.080 51.5
18.080 51.0
20.687 77.0
In [36]:
# A data frame can be examined using functions like str() and head().

str(trees)
head(trees,n=10)
'data.frame':	31 obs. of  3 variables:
 $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
 $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
 $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
GirthHeightVolume
8.370 10.3
8.665 10.3
8.863 10.2
10.572 16.4
10.781 18.8
10.883 19.7
11.066 15.6
11.075 18.2
11.180 22.6
11.275 19.9
In [42]:
trees[trees$Height > 82,]    # selects rows with Height greater than 82
GirthHeightVolume
610.883 19.7
1712.985 33.8
1813.386 27.4
3120.687 77.0
In [133]:
v <- rbind(v,list(1,16,"Paul")); v
Warning message in `[<-.factor`(`*tmp*`, ri, value = "Paul"):
“invalid factor level, NA generated”
SNNameAge
1 John21
2 Dora15
1 Paul16
1 16 NA
In [132]:
v <- cbind(v,Age=c("21","15", "16")); v
SNNameAge
1 John21
2 Dora15
1 Paul16
In [1]:
x <- data.frame(foo = 1:4, bar = c(T, T, F, F)) 
x
nrow(x)
ncol(x)
foobar
1 TRUE
2 TRUE
3 FALSE
4 FALSE
4
2
In [2]:
# Exam data

exam <- data.frame(
  id = 1:5,
  q1 = c(1, 5, 2, 3, 2),
  q2 = c(8, 10, 9, 8, 7),
  q3 = c(3, 7, 4, 6, 4))
exam
idq1q2q3
1 1 83
2 5 107
3 2 94
4 3 86
5 2 74
In [4]:
# Demographic data

demographics <- data.frame(
  id = 1:5,
  sex = c("f", "m", "f", "f", "m"),
  age = c(25, 22, 24, 19, 23))
demographics
idsexage
1 f 25
2 m 22
3 f 24
4 f 19
5 m 23
In [5]:
# Combine exam and demographics

combined <- merge(x = exam, 
              y = demographics, 
              by = "id")
combined
idq1q2q3sexage
1 1 83 f 25
2 5 107 m 22
3 2 94 f 24
4 3 86 f 19
5 2 74 m 23
In [6]:
# Mean q1 score for each sex

aggregate(formula = q1 ~ sex, 
          data = combined, 
          FUN = mean)
aggregate
sexq1
f 2.0
m 3.5
function (x, ...) 
UseMethod("aggregate")
In [9]:
# Many summary statistics by sex using dplyr!

library(dplyr)
combined %>% group_by(sex) %>%
  summarise(
    q1.mean = mean(q1),
    q2.mean = mean(q2),
    q3.mean = mean(q3),
    age.mean = mean(age),
    N = n())
sexq1.meanq2.meanq3.meanage.meanN
f 2.0 8.3333334.33333322.666673
m 3.5 8.5000005.50000022.500002
In [9]:
# Many summary statistics by sex using dplyr!

library(dplyr)
combined %>% group_by(sex) %>%
  summarise(
    q1.mean = mean(q1),
    q2.mean = mean(q2),
    q3.mean = mean(q3),
    age.mean = mean(age),
    N = n())
sexq1.meanq2.meanq3.meanage.meanN
f 2.0 8.3333334.33333322.666673
m 3.5 8.5000005.50000022.500002
In [3]:
x <- factor(c("ab", "cd", "ab"), levels = c("ab", "cd", "ef")); x
typeof(x)
attributes(x)
  1. ab
  2. cd
  3. ab
'integer'
$levels
  1. 'ab'
  2. 'cd'
  3. 'ef'
$class
'factor'
In [17]:
z <- 0:9; z
digits <- as.character(z); digits
d <- as.integer(digits); d

# # Now d and z are the same.12 There is a large collection of functions of the form as.something() for either coercion from one mode to another, or for investing an object with some other attribute it may not already possess. The reader should consult the different help files to become familiar with them.
  1. 0
  2. 1
  3. 2
  4. 3
  5. 4
  6. 5
  7. 6
  8. 7
  9. 8
  10. 9
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '9'
  1. 0
  2. 1
  3. 2
  4. 3
  5. 4
  6. 5
  7. 6
  8. 7
  9. 8
  10. 9
In [49]:
r <- factor(c("a","b")); r
levels(r) <- list(C = "C", A = "a", B = "b"); r
  1. a
  2. b
  1. A
  2. B
In [56]:
factor(c(1, 2, 3), labels=c('a', 'b', 'c'))
factor(c(3.2, 10, 500000), labels=c('a', 'b', 'c'))
factor(c(.49, 1, 5), labels=c('a', 'b', 'c'))
  1. a
  2. b
  3. c
  1. a
  2. b
  3. c
  1. a
  2. b
  3. c
In [13]:
x = sample(state.name, 10000, replace=T)
format(object.size(x), units='Kb')

# Because of the integer+metadata representation, factors are actually smaller than character strings, often notably so.
'80.8 Kb'
In [6]:
# While atomic vectors are most commonly turned into matrices, the dimension attribute can also be set on lists to
# make list-matrices or list-arrays:

l <- list(1:3, "a", TRUE, 1.0)
dim(l) <- c(2, 2)
l
l[[1, 1]]

# These are relatively esoteric data structures, but can be useful if you want to arrange objects into a grid-like
# structure. 
# For example, if you’re running models on a spatio-temporal grid, it might be natural to preserve the grid 
# structure by storing the models in a 3d array.
A matrix: 2 × 2
1, 2, 3TRUE
a1
  1. 1
  2. 2
  3. 3
In [13]:
# Unlike atomic vectors, list() can contain a mix of objects. 
# Following is an example of a list having three components each of different data type. In this example, a, b and
# c are called tags which makes it easier to reference the components of the list.

x <- list("a" = 2.5, "b" = TRUE, "c" = 1:3); x
typeof(x)
length(x)
$a
2.5
$b
TRUE
$c
  1. 1
  2. 2
  3. 3
'list'
3
In [1]:
v <- list("John", 19, c("English", "French")); names(v) <- c("name", "age", "speaks"); names(v[[3]]) <- c("english", "french"); v
$name
'John'
$age
19
$speaks
english
'English'
french
'French'
In [ ]:

In [20]:
# Lists can be accessed in similar fashion to vectors. Integer, logical or character vectors can be used for 
# indexing. 

v[c(1:2)]    # index using integer vector
v[-2]        # using negative integer to exclude second component
v[c(T,F,F)]  # index using logical vector
v[c("age","speaks")]    # index using character vector
$name
'John'
$age
19
$name
'John'
$speaks
  1. 'English'
  2. 'French'
$name = 'John'
$age
19
$speaks
  1. 'English'
  2. 'French'
In [21]:
# Indexing with [ as shown above will give us sublist not the content inside the component. 
# To retrieve the content, we need to use [[. However, this approach will allow us to access only a single component at a time.

v["age"]
typeof(v["age"]) 
v[["age"]] 
typeof(v[["age"]])
v[[2]]
$age = 19
'list'
19
'double'
19
In [22]:
# An alternative to [[, which is used often while accessing content of a list is the $ operator. They are both the same except that $ can do partial matching on tags.

v$name    # same as v[["name"]]
v$a       # partial matching, same as v$ag or v$age
v[["a"]]  # cannot do partial match with [[
'John'
19
NULL
In [29]:
# We can delete a component by assigning NULL to it.

v[["age"]] <- NULL; str(v)
List of 3
 $ name   : chr "Clair"
 $ speaks : chr [1:2] "English" "French"
 $ married: logi FALSE
In [46]:
# Matrices are vectors with a dimension attribute.

m <- matrix(nrow = 2, ncol = 3); m
dim(m)
attributes(m)
A matrix: 2 × 3 of type lgl
NANANA
NANANA
  1. 2
  2. 3
$dim =
  1. 2
  2. 3
In [10]:
# It is also possible to change names

colnames(x) <- c("C1","C2","C3")
rownames(x) <- c("R1","R2","R3")
x
A matrix: 3 × 3 of type int
C1C2C3
R1147
R2258
R3369
In [29]:
j <- matrix(1:9, nrow = 3, dimnames = list(c("X","Y","Z"), c("A","B","C"))); j
A matrix: 3 × 3 of type int
ABC
X147
Y258
Z369
In [30]:
j[,"A"]
j[TRUE,c("A","C")]
j[2:3,c("A","C")]
X
1
Y
2
Z
3
A matrix: 3 × 2 of type int
AC
X17
Y28
Z39
A matrix: 2 × 2 of type int
AC
Y28
Z39
In [34]:
j[j<5] <- 0; j    # modify elements less than 5
A matrix: 3 × 3 of type dbl
ABC
X0 07
Y0108
Z0 69
In [44]:
(m1 <- matrix(1:20, 4, 5))
upper.tri(m1)
A matrix: 4 × 5 of type int
15 91317
26101418
37111519
48121620
A matrix: 4 × 5 of type lgl
FALSE TRUE TRUE TRUETRUE
FALSEFALSE TRUE TRUETRUE
FALSEFALSEFALSE TRUETRUE
FALSEFALSEFALSEFALSETRUE