Chapter 2.2.3.1 Exercises
In [ ]:
# Small and moderately sized files can usually be read with read.table()
# given nothing but the file name; every other argument keeps its default.
data <- read.table(file = "foo.txt")
In [ ]:
# A quick and dirty way to work out the class of each column: read only the
# first 100 rows, record the class of every column, then hand those classes
# to the full read through colClasses.
# NOTE(review): the classes are taken from the first 100 rows only - confirm
# they also hold for the rest of the file.
initial <- read.table("datatable.txt", nrows = 100)
# vapply() always yields a named character vector with one entry per column;
# only the first class is kept for a column that reports more than one.
classes <- vapply(initial, function(column) class(column)[1L], character(1))
tabAll <- read.table("datatable.txt", colClasses = classes)
In [9]:
# Notes on the col_names argument of readr's read_csv() - col_names: Either TRUE, FALSE or a character vector of column names. If TRUE, the first row of the input will be used as the column names, and will not be included in the data frame. If FALSE, column names will be generated automatically: X1, X2, X3 etc.
# If col_names is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame.
# Missing (NA) column names will generate a warning, and be filled in with dummy names X1, X2 etc. Duplicate column names will generate a warning and be made unique with a numeric prefix.
# Ragged-line example with base R: the lines written to the temporary file
# hold one, two or three comma-separated fields.
test1 <- c(1:5, "6,7", "8,9,10")
tf <- tempfile()
writeLines(text = test1, con = tf)
# With fill = TRUE, rows of unequal length are padded with blank fields.
# No column names are supplied here, and the result has a single column.
read.csv(file = tf, fill = TRUE)
# Count the fields on every line and keep the largest count as the number of
# columns the file really needs ...
ncol <- max(count.fields(file = tf, sep = ","))
# ... then supply that many column names (V1, V2, ...) and read the first
# line as data rather than as a header. seq_len(n) yields 1, 2, ..., n.
read.csv(
  file = tf,
  fill = TRUE,
  header = FALSE,
  col.names = paste0("V", seq_len(length.out = ncol))
)
# Remove the temporary file again.
unlink(tf)
In [ ]:
In [3]:
library(readr)
# Read the standings file with readr, leaving col_types unspecified, then
# display the result.
teams <- read_csv(file = "./team_standings.csv")
teams
In [10]:
library(readr)
# Same file, but the compact string "cc" declares two character columns up
# front instead of leaving the types to be guessed.
teams <- read_csv(file = "./team_standings.csv", col_types = "cc")
teams
# With col_types = NULL (the default) readr guesses the type of every column.
In [11]:
# Note that the warnings indicate that read_csv may have had some difficulty identifying the type of each column. This can be solved by using the col_types argument.
# col_types: One of NULL, a cols() specification, or a string. See vignette("column-types") for more details.
# If NULL, all column types will be imputed from the first 1000 rows on the input. This is convenient (and fast), but not robust. If the imputation fails, you'll need to supply the correct types yourself.
# If a column specification created by cols(), it must contain one column specification for each column.
# If you only want to read a subset of the columns, use cols_only().
# Alternatively, you can use a compact string representation where each character represents one column: c = character, i = integer, n = number, d = double, l = logical, D = date, T = date time, t = time, ? = guess, or _/- to skip the column.
# Read at most 10 rows of the compressed log file; the compact col_types
# string gives one type per column (c = character, i = integer).
logs <- read_csv(
  file = "./2016-07-19.csv.bz2",
  col_types = "ccicccccci",
  n_max = 10
)
logs
In [12]:
# You can specify the column type in a more detailed fashion by using the various col_* functions. For example, in the log data above, the first column is actually a date, so it might make more sense to read it in as a Date variable. If we wanted to just read in that first column, we could do
# Read only the `date` column, parsed with col_date(), from at most 10 rows.
logdates <- read_csv(
  file = "./2016-07-19.csv.bz2",
  col_types = cols_only(date = col_date()),
  n_max = 10
)
logdates
# Now the date column is stored as a Date object which can be used for relevant date-related computations (for example, see the lubridate package).
In [ ]:
# Reference copy of the read_csv() argument list with its default values.
# NOTE(review): `file` here looks like the placeholder from the usage
# signature, not a path defined in this notebook - presumably this cell is
# kept for reference and is not meant to be run.
read_csv(file,
col_names = TRUE,
col_types = NULL,
locale = default_locale(),
na = c("", "NA"),
quoted_na = TRUE,
quote = "\"",
comment = "",
trim_ws = TRUE,
skip = 0,
n_max = Inf,
guess_max = min(1000, n_max),
progress = show_progress()
)
In [19]:
# Read at most 10 lines of the log file without treating the first line as a
# header, then display the result.
logs <- read_csv(
  "~/Documents/learning/current/Notebooks/R/intro/datasets/rprgfd/data/2016-07-19.csv.bz2",
  col_types = "ccicccccci",
  col_names = FALSE,
  n_max = 10
)
logs
# With col_names = FALSE the columns are named X1, X2, X3, ... and the line
# that holds the original column names is read in as a row of data.
In [25]:
library(readr)
# Input sources -------------------------------------------------------------
# A file path: the plain CSV, then zip and bzip2 compressed copies of it
read_csv(file = readr_example("mtcars.csv"))
read_csv(file = readr_example("mtcars.csv.zip"))
read_csv(file = readr_example("mtcars.csv.bz2"))
# A remote file addressed by its URL
read_csv(file = "https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
In [26]:
# Literal CSV text can be passed in place of a path, provided the string
# contains at least one newline.
read_csv(file = "x,y\n1,2\n3,4")
In [27]:
# Column types --------------------------------------------------------------
# By default, readr guesses the column types, looking at the first 1000 rows.
# A compact string overrides the guessed types, one letter per column
# (here d = double for x and c = character for y):
read_csv(file = "x,y\n1,2\n3,4", col_types = "dc")
In [28]:
# Column types can also be given as a list of col_*() specifications, one
# per column:
read_csv(
  file = "x,y\n1,2\n3,4",
  col_types = list(col_double(), col_character())
)
In [34]:
# The last value, "b", is not a number although the column is declared as
# double. Parsing problems produce a warning, and problems() extracts the
# details from the result.
y <- read_csv(file = "x\n1\n2\nb", col_types = list(col_double()))
y
problems(y)