## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, #eval = FALSE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(tidyfst) set.seed(1L) ## Create a data table DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10], V2 = 1:9, V3 = c(0.5, 1.0, 1.5), V4 = rep(LETTERS[1:3], 3)) copy(DF) -> DT class(DF) DF ## ----------------------------------------------------------------------------- ### Filter rows using indices slice_dt(DF, 3:4) ### Discard rows using negative indices slice_dt(DF, -(3:7)) ### Filter rows using a logical expression filter_dt(DF, V2 > 5) filter_dt(DF, V4 %in% c("A", "C")) filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character ### Filter rows using multiple conditions filter_dt(DF, V1 == 1, V4 == "A") # equals to filter_dt(DF, V1 == 1 & V4 == "A") ### Filter unique rows distinct_dt(DF) # unique(DF) distinct_dt(DF, V1,V4) ### Discard rows with missing values drop_na_dt(DF) # na.omit(DF) ### Other filters sample_n_dt(DF, 3) # n random rows sample_frac_dt(DF, 0.5) # fraction of random rows slice_max_dt(DF, V1,1) # top n entries (includes equals) filter_dt(DT,V4 %like% "^B") filter_dt(DT,V2 %between% c(3, 5)) filter_dt(DT,between(V2, 3, 5, incbounds = FALSE)) filter_dt(DT,V2 %inrange% list(-1:1, 1:3)) # see also ?inrange ## ----------------------------------------------------------------------------- ### Sort rows by column arrange_dt(DF, V3) ### Sort rows in decreasing order arrange_dt(DF, -V3) ### Sort rows based on several columns arrange_dt(DF, V1, -V2) ## ----------------------------------------------------------------------------- ### Select one column using an index (not recommended) pull_dt(DT,3) # returns a vector select_dt(DT,3) # returns a data.table ### Select one column using column name select_dt(DF, V2) # returns a data.table pull_dt(DF, V2) # returns a vector ### Select several columns select_dt(DF, V2, V3, V4) select_dt(DF, V2:V4) # select columns between V2 and V4 ### Exclude columns select_dt(DF, -V2, -V3) ### Select/Exclude columns using a character vector cols <- c("V2", "V3") select_dt(DF,cols = cols) select_dt(DF,cols = cols,negate = TRUE) ### Other selections select_dt(DF, cols = paste0("V", 1:2)) relocate_dt(DF, V4) # reorder columns select_dt(DF, "V") select_dt(DF, "3$") select_dt(DF, ".2") select_dt(DF, "V1") select_dt(DF, -"^V2") # remove variables using "-" prior to function ## ----------------------------------------------------------------------------- ### Summarise one column summarise_dt(DF, sum(V1)) # returns a data.table summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table ### Summarise several columns summarise_dt(DF, sum(V1), sd(V3)) ### Summarise several columns and assign column names DF %>% summarise_dt(sumv1 = sum(V1), sdv3 = sd(V3)) ### Summarise a subset of rows DT[1:4, sum(V1)] DF %>% slice_dt(1:4) %>% summarise_dt(sum(V1)) ### Misc summarise_dt(DF, nth(V3,1)) summarise_dt(DF, nth(V3,-1)) summarise_dt(DF, nth(V3, 5)) summarise_dt(DF, uniqueN(V4)) uniqueN(DF) ## ----------------------------------------------------------------------------- ### By group # not recommended DF %>% group_dt( by = V4, summarise_dt(sumV2 = sum(V2)) ) # recommended DF %>% summarise_dt(sumV2 = sum(V2),by = V4) ### By several groups DF %>% summarise_dt(sumV2 = sum(V2),by = .(V1,V4)) ### Calling function in by DF %>% summarise_dt(sumV2 = sum(V2),by = tolower(V4)) ### Assigning column name in by DF %>% summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4))) ### Using a condition in by DF %>% summarise_dt(sumV2 = sum(V2),by = V4 == "A") ### By on a subset of rows DF %>% slice_dt(1:5) %>% summarise_dt(sumV1 = sum(V1),by = V4) ### Count number of observations for each group count_dt(DF, V4) ### Add a column with number of observations for each group add_count_dt(DF, V1) ### Retrieve the first/last/nth observation for each group DF %>% summarise_dt(by = V4,nth(V2,1)) DF %>% summarise_dt(by = V4,nth(V2,-1)) DF %>% summarise_dt(by = V4,nth(V2,2)) ## ----------------------------------------------------------------------------- ### Summarise all the columns summarise_vars(DT,.func = max) ### Summarise several columns summarise_vars(DT,c("V1", "V2"),mean) ### Summarise several columns by group DT %>% summarise_vars(c("V1", "V2"),mean,by = V4) ## using patterns (regex) DT %>% summarise_vars("V1|V2",mean,by = V4) ## Summarise with more than one function by group # when you can't find a way, you can always use `in_dt` to use data.table DT %>% in_dt(, by = V4, c(lapply(.SD, sum), lapply(.SD, mean))) ### Summarise using a condition summarise_vars(DF, is.numeric, mean) ### Modify all the columns mutate_vars(DF, .func = rev) ### Modify several columns (dropping the others) DF %>% select_dt(cols = c("V1", "V2")) %>% mutate_vars(.func = sqrt) DF %>% select_dt(-V4) %>% mutate_vars(.func = exp) ### Modify several columns (keeping the others) DF %>% mutate_vars(c("V1", "V2"), sqrt) DF %>% mutate_vars(-"V4", exp) ### Modify columns using a condition (dropping the others) select_dt(DT,is.numeric) ### Modify columns using a condition (keeping the others) mutate_vars(DT,is.numeric,as.integer) ### Use a complex expression DF %>% group_dt( by = V4, slice_dt(1:2) %>% transmute_dt(V1 = V1, V2 = "X") ) ### Use multiple expressions (with DT[,{j}]) DT %>% in_dt(,{ print(V1) # comments here! print(summary(V1)) x <- V1 + sum(V2) .(A = 1:.N, B = x) # last list returned as a data.table } ) ## ----------------------------------------------------------------------------- ### Select first/last/… row by group DT %>% group_dt( by = V4, head(1) ) DT %>% group_dt( by = V4, tail(2) ) DT %>% group_dt( by = V4, slice_dt(1,.N) ) ### Select rows using a nested query DF %>% group_dt( by = V4, arrange_dt(V2) %>% slice_dt(1) ) ### Add a group counter column DT %>% mutate_dt(Grp = .GRP,by = .(V4, V1)) ### Get row number of first (and last) observation by group DT %>% summarise_dt(I = .I,by = V4) DT %>% summarise_dt(I = .I[1],by = V4) DT %>% summarise_dt(I = .I[c(1,.N)],by = V4) ### Handle list-columns by group DT %>% select_dt(V1,V4) %>% chop_dt(V1) # return V1 as a list DT %>% nest_dt(V4) # subsets of the data ### Grouping sets (multiple by at once) # use data.table directly, tidyfst does not provide new methods for it yet data.table::rollup(DT, .(SumV2 = sum(V2)), by = c("V1", "V4")) data.table::rollup(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), id = TRUE) data.table::cube(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), id = TRUE) data.table::groupingsets(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), sets = list("V1", c("V1", "V4")), id = TRUE) ## ----eval=FALSE--------------------------------------------------------------- # ### Write data to a csv file # fwrite(DT, "DT.csv") # # ### Write data to a tab-delimited file # fwrite(DT, "DT.txt", sep = "\t") # # ### Write list-column data to a csv file # fwrite(setDT(list(0, list(1:5))), "DT2.csv") # # # ### Read a csv / tab-delimited file # fread("DT.csv") # # fread("DT.csv", verbose = TRUE) # full details # fread("DT.txt", sep = "\t") # # ### Read a csv file selecting / droping columns # fread("DT.csv", select = c("V1", "V4")) # fread("DT.csv", drop = "V4") # # NA # ### Read and rbind several files # rbindlist(lapply(c("DT.csv", "DT.csv"), fread)) # # c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist # ## ----------------------------------------------------------------------------- ### Melt data (from wide to long) fsetequal(DT,DF) mDT = DT %>% longer_dt(V3,V4) mDF = DF %>% longer_dt(-"V1|V2") fsetequal(mDT,mDF) mDT ### Cast data (from long to wide) mDT %>% wider_dt(V4,name = "name",value = "value") # below is a special case and could only be done in tidyfst mDT %>% wider_dt(V4,name = "name",value = "value",fun = list) mDT %>% wider_dt(V4,name = "name",value = "value",fun = sum) ### Split split(DT, by = "V4") ## ----------------------------------------------------------------------------- ### Lead/Lag lag_dt(1:10,n = 1) lag_dt(1:10,n = 1:2) lead_dt(1:10,n = 1) ## ----------------------------------------------------------------------------- x <- data.table(Id = c("A", "B", "C", "C"), X1 = c(1L, 3L, 5L, 7L), XY = c("x2", "x4", "x6", "x8"), key = "Id") y <- data.table(Id = c("A", "B", "B", "D"), Y1 = c(1L, 3L, 5L, 7L), XY = c("y1", "y3", "y5", "y7"), key = "Id") ### left join left_join_dt(x, y, by = "Id") ### right join right_join_dt(x, y, by = "Id") ### inner join inner_join_dt(x, y, by = "Id") ### full join full_join_dt(x, y, by = "Id") ### semi join semi_join_dt(x, y, by = "Id") ### anti join anti_join_dt(x, y, by = "Id") ## ----------------------------------------------------------------------------- x <- data.table(1:3) y <- data.table(4:6) z <- data.table(7:9, 0L) ### Bind rows rbind(x, y) rbind(x, z, fill = TRUE) ### Bind rows using a list rbindlist(list(x, y), idcol = TRUE) ### Bind columns cbind(x, y) ## ----------------------------------------------------------------------------- x <- data.table(c(1, 2, 2, 3, 3)) y <- data.table(c(2, 2, 3, 4, 4)) ### Intersection fintersect(x, y) fintersect(x, y, all = TRUE) ### Difference fsetdiff(x, y) fsetdiff(x, y, all = TRUE) ### Union funion(x, y) funion(x, y, all = TRUE) ### Equality fsetequal(x, x[order(-V1),]) all.equal(x, x) # S3 method setequal(x, x[order(-V1),])