# For small to moderately sized datasets, you can usually call read.table without specifying any other arguments

# Only the path is supplied; every other read.table() argument keeps its default.
data <- read.table(file = "foo.txt")

# A quick and dirty way to figure out the classes of each column is the following:

# Infer the column classes from a 100-row sample, then pass them to the full
# read via colClasses so the types are fixed up front.
# NOTE(review): the classes come from the first 100 rows only; if a later row
# does not fit the sampled class (e.g. a decimal value in a column sampled as
# integer) the full read is expected to fail -- confirm the sample is
# representative of the whole file.
initial <- read.table("datatable.txt", nrows = 100)

# vapply() guarantees exactly one class name per column and always yields a
# named character vector, the shape colClasses expects. sapply() would instead
# return a list or matrix if any column reported more than one class.
classes <- vapply(initial, function(column) class(column)[1L], character(1))

tabAll <- read.table("datatable.txt", colClasses = classes)

# col_names: Either TRUE, FALSE or a character vector of column names. If TRUE, the first row of the input will be used as the column names, and will not be included in the data frame. If FALSE, column names will be generated automatically: X1, X2, X3 etc. 
# If col_names is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. 
# Missing (NA) column names will generate a warning, and be filled in with dummy names X1, X2 etc. Duplicate column names will generate a warning and be made unique with a numeric prefix.

# Build a ragged CSV in a temporary file: five single-field rows, followed by
# one row with two fields and one row with three.
test1 <- c(as.character(1:5), "6,7", "8,9,10")
tf <- tempfile()
writeLines(text = test1, con = tf)

# With fill = TRUE, rows shorter than the table width are padded with blank
# fields instead of raising an error.

# Default call: the result has a single column.
read.csv(tf, fill = TRUE) # 1 column

# Find the widest row and name that many columns explicitly, so the table is
# as wide as the longest row. (`ncol` is a plain variable here; calls to the
# base function ncol() still resolve to the function.)
ncol <- max(count.fields(tf, sep = ","))
read.csv(
  tf,
  fill = TRUE,
  header = FALSE,
  col.names = sprintf("V%d", seq_len(ncol))
)

# Remove the temporary file.
unlink(tf)

# what is this?!

library(readr)

# Read the standings without a col_types spec (readr picks the column types),
# then display the result.
teams <- read_csv(file = "./team_standings.csv")
teams

# Output:
# Parsed with column specification:
# cols(
#   Standing = col_integer(),
#   Team = col_character()
# )

library(readr)

# Same file, this time with the compact spec "cc": both columns are read as
# character. The result is displayed afterwards.
teams <- read_csv(file = "./team_standings.csv", col_types = "cc")
teams

# col_types = NULL (the default) reads every column and lets readr guess each column's type.

# Note that the warnings indicate that read_csv may have had some difficulty identifying the type of each column. This can be solved by using the col_types argument.

# col_types: One of NULL, a cols() specification, or a string. See vignette("column-types") for more details. 
# If NULL, all column types will be imputed from the first 1000 rows on the input. This is convenient (and fast), but not robust. If the imputation fails, you'll need to supply the correct types yourself. 
# If a column specification created by cols(), it must contain one column specification for each column. 
# If you only want to read a subset of the columns, use cols_only(). 
# Alternatively, you can use a compact string representation where each character represents one column: c = character, i = integer, n = number, d = double, l = logical, D = date, T = date time, t = time, ? = guess, or _/- to skip the column.

# Read only the first 10 rows of the compressed log. The compact spec types the
# third and tenth columns as integer and all the others as character.
logs <- read_csv(
  file = "./2016-07-19.csv.bz2",
  col_types = "ccicccccci",
  n_max = 10
)
logs

# You can specify the column type in a more detailed fashion by using the various col_* functions. For example, in the log data above, the first column is actually a date, so it might make more sense to read it in as a Date variable. If we wanted to just read in that first column, we could do

# Keep just the `date` column, parsed with col_date(), for the first 10 rows.
logdates <- read_csv(
  file = "./2016-07-19.csv.bz2",
  col_types = cols_only(date = col_date()),
  n_max = 10
)
logdates

# Now the date column is stored as a Date object which can be used for relevant date-related computations (for example, see the lubridate package).

# Reference: a read_csv() call with every argument after `file` spelled out.
# As written: the first row supplies the column names (col_names = TRUE),
# column types are guessed (col_types = NULL), "" and "NA" are read as missing,
# no rows are skipped, there is no row limit (n_max = Inf), and at most
# min(1000, n_max) rows are used for type guessing.
# NOTE(review): this looks like the usage signature copied from the readr help
# page rather than a call meant to be run -- `file` is not assigned anywhere in
# the visible script; confirm before executing.
read_csv(file, 
         col_names = TRUE, 
         col_types = NULL, 
         locale = default_locale(), 
         na = c("", "NA"), 
         quoted_na = TRUE, 
         quote = "\"", 
         comment = "", 
         trim_ws = TRUE, 
         skip = 0, 
         n_max = Inf, 
         guess_max = min(1000, n_max), 
         progress = show_progress()
        )

# Read the first 10 rows of the download log from its full path and display
# them. col_names must be given as `col_names = <value>,`; TRUE takes the
# column names from the file's first row, so the ten-character col_types spec
# applies to data rows only.
# NOTE(review): TRUE is assumed to be the intended value -- confirm.
logs <- read_csv(
  "~/Documents/learning/current/Notebooks/R/intro/datasets/rprgfd/data/2016-07-19.csv.bz2",
  col_types = "ccicccccci",
  col_names = TRUE,
  n_max = 10
)
logs

# If col_names = FALSE, the columns are named X1, X2, X3, ... and the file's first row (the header line) is read as the first row of data.

library(readr)
# Input sources -------------------------------------------------------------
# Read from a path

# Example files located with readr_example(): plain, zipped, and
# bzip2-compressed variants of mtcars.csv.
read_csv(file = readr_example("mtcars.csv"))
read_csv(file = readr_example("mtcars.csv.zip"))
read_csv(file = readr_example("mtcars.csv.bz2"))

# mtcars.csv fetched over HTTPS from the readr GitHub repository.
read_csv(file = "https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")

Parsed with column specification:
cols(
  mpg = col_double(),
  cyl = col_integer(),
  disp = col_double(),
  hp = col_integer(),
  drat = col_double(),
  wt = col_double(),
  qsec = col_double(),
  vs = col_integer(),
  am = col_integer(),
  gear = col_integer(),
  carb = col_integer()
)

Parsed with column specification:
cols(
  mpg = col_double(),
  cyl = col_integer(),
  disp = col_double(),
  hp = col_integer(),
  drat = col_double(),
  wt = col_double(),
  qsec = col_double(),
  vs = col_integer(),
  am = col_integer(),
  gear = col_integer(),
  carb = col_integer()
)

Parsed with column specification:
cols(
  mpg = col_double(),
  cyl = col_integer(),
  disp = col_double(),
  hp = col_integer(),
  drat = col_double(),
  wt = col_double(),
  qsec = col_double(),
  vs = col_integer(),
  am = col_integer(),
  gear = col_integer(),
  carb = col_integer()
)

`curl` package not installed, falling back to using `url()`
Parsed with column specification:
cols(
  mpg = col_double(),
  cyl = col_integer(),
  disp = col_double(),
  hp = col_integer(),
  drat = col_double(),
  wt = col_double(),
  qsec = col_double(),
  vs = col_integer(),
  am = col_integer(),
  gear = col_integer(),
  carb = col_integer()
)

# Or directly from a string (must contain a newline)

# Inline data: a header line "x,y" followed by two data rows.
read_csv(file = "x,y\n1,2\n3,4")

# Column types --------------------------------------------------------------
# By default, readr guesses the column types, looking at the first 1000 rows.
# You can override with a compact specification:

# Compact spec "dc": x is read as double, y as character.
read_csv(file = "x,y\n1,2\n3,4", col_types = "dc")

# The same types given as a list of col_*() specifications, one per column:

read_csv(
  file = "x,y\n1,2\n3,4",
  col_types = list(col_double(), col_character())
)

# Parsing problems raise a warning; problems() returns the details.
# Here the third value, "b", cannot be parsed as a double.

y <- read_csv(file = "x\n1\n2\nb", col_types = list(col_double()))
y
problems(y)

# Output:
# Warning message in rbind(names(probs), probs_f):
# “number of columns of result is not a multiple of vector length (arg 2)”
# Warning message:
# “1 parsing failure.
# # A tibble: 1 x 5
#     row col   expected actual file
#   <int> <chr> <chr>    <chr>  <chr>
# 1     3 x     a double b      literal data
# ”

Standing	Team
1	Spain
2	Netherlands
3	Germany
4	Uruguay
5	Argentina
6	Brazil
7	Ghana
8	Paraguay
9	Japan
10	Chile
11	Portugal
12	USA
13	England
14	Mexico
15	South Korea
16	Slovakia
17	Ivory Coast
18	Slovenia
19	Switzerland
20	South Africa
21	Australia
22	New Zealand
23	Serbia
24	Denmark
25	Greece
26	Italy
27	Nigeria
28	Algeria
29	France
30	Honduras
31	Cameroon
32	North Korea

Standing	Team
1	Spain
2	Netherlands
3	Germany
4	Uruguay
5	Argentina
6	Brazil
7	Ghana
8	Paraguay
9	Japan
10	Chile
11	Portugal
12	USA
13	England
14	Mexico
15	South Korea
16	Slovakia
17	Ivory Coast
18	Slovenia
19	Switzerland
20	South Africa
21	Australia
22	New Zealand
23	Serbia
24	Denmark
25	Greece
26	Italy
27	Nigeria
28	Algeria
29	France
30	Honduras
31	Cameroon
32	North Korea

date	time	size	r_version	r_arch	r_os	package	version	country	ip_id
2016-07-19	22:00:00	1887881	3.3.0	x86_64	mingw32	data.table	1.9.6	US	1
2016-07-19	22:00:05	45436	3.3.1	x86_64	mingw32	assertthat	0.1	US	2
2016-07-19	22:00:03	14259016	3.3.1	x86_64	mingw32	stringi	1.1.1	DE	3
2016-07-19	22:00:05	1887881	3.3.1	x86_64	mingw32	data.table	1.9.6	US	4
2016-07-19	22:00:06	389615	3.3.1	x86_64	mingw32	foreach	1.4.3	US	4
2016-07-19	22:00:08	48842	3.3.1	x86_64	linux-gnu	tree	1.0-37	CO	5
2016-07-19	22:00:12	525	3.3.1	x86_64	darwin13.4.0	survival	2.39-5	US	6
2016-07-19	22:00:08	3225980	3.3.1	x86_64	mingw32	Rcpp	0.12.5	US	2
2016-07-19	22:00:09	556091	3.3.1	x86_64	mingw32	tibble	1.1	US	2
2016-07-19	22:00:10	151527	3.3.1	x86_64	mingw32	magrittr	1.5	US	2

date	time	size	r_version	r_arch	r_os	package	version	country	ip_id
2016-07-19	22:00:00	1887881	3.3.0	x86_64	mingw32	data.table	1.9.6	US	1
2016-07-19	22:00:05	45436	3.3.1	x86_64	mingw32	assertthat	0.1	US	2
2016-07-19	22:00:03	14259016	3.3.1	x86_64	mingw32	stringi	1.1.1	DE	3
2016-07-19	22:00:05	1887881	3.3.1	x86_64	mingw32	data.table	1.9.6	US	4
2016-07-19	22:00:06	389615	3.3.1	x86_64	mingw32	foreach	1.4.3	US	4
2016-07-19	22:00:08	48842	3.3.1	x86_64	linux-gnu	tree	1.0-37	CO	5
2016-07-19	22:00:12	525	3.3.1	x86_64	darwin13.4.0	survival	2.39-5	US	6
2016-07-19	22:00:08	3225980	3.3.1	x86_64	mingw32	Rcpp	0.12.5	US	2
2016-07-19	22:00:09	556091	3.3.1	x86_64	mingw32	tibble	1.1	US	2
2016-07-19	22:00:10	151527	3.3.1	x86_64	mingw32	magrittr	1.5	US	2

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
21.0	6	160.0	110	3.90	2.620	16.46	0	1	4	4
21.0	6	160.0	110	3.90	2.875	17.02	0	1	4	4
22.8	4	108.0	93	3.85	2.320	18.61	1	1	4	1
21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
14.3	8	360.0	245	3.21	3.570	15.84	0	0	3	4
24.4	4	146.7	62	3.69	3.190	20.00	1	0	4	2
22.8	4	140.8	95	3.92	3.150	22.90	1	0	4	2
19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4
17.8	6	167.6	123	3.92	3.440	18.90	1	0	4	4
16.4	8	275.8	180	3.07	4.070	17.40	0	0	3	3
17.3	8	275.8	180	3.07	3.730	17.60	0	0	3	3
15.2	8	275.8	180	3.07	3.780	18.00	0	0	3	3
10.4	8	472.0	205	2.93	5.250	17.98	0	0	3	4
10.4	8	460.0	215	3.00	5.424	17.82	0	0	3	4
14.7	8	440.0	230	3.23	5.345	17.42	0	0	3	4
32.4	4	78.7	66	4.08	2.200	19.47	1	1	4	1
30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
33.9	4	71.1	65	4.22	1.835	19.90	1	1	4	1
21.5	4	120.1	97	3.70	2.465	20.01	1	0	3	1
15.5	8	318.0	150	2.76	3.520	16.87	0	0	3	2
15.2	8	304.0	150	3.15	3.435	17.30	0	0	3	2
13.3	8	350.0	245	3.73	3.840	15.41	0	0	3	4
19.2	8	400.0	175	3.08	3.845	17.05	0	0	3	2
27.3	4	79.0	66	4.08	1.935	18.90	1	1	4	1
26.0	4	120.3	91	4.43	2.140	16.70	0	1	5	2
30.4	4	95.1	113	3.77	1.513	16.90	1	1	5	2
15.8	8	351.0	264	4.22	3.170	14.50	0	1	5	4
19.7	6	145.0	175	3.62	2.770	15.50	0	1	5	6
15.0	8	301.0	335	3.54	3.570	14.60	0	1	5	8
21.4	4	121.0	109	4.11	2.780	18.60	1	1	4	2

R Programming & Statistics Notes

Chapter 2.2.3.1 Exercises