############################################# # by Odenwald, Illmer, Sommerauer 2022 ############################################# library(dplyr) library(nycflights13) ### Piping demonstration flights %>% filter(tailnum=="N14228") %>% arrange(desc(air_time)) %>% select(tailnum, air_time, origin, dest) %>% slice_head(n=4) ### Observation Extraction ## filter # select planes with less than 10 seats filter(planes, seats < 10) planes %>% filter(seats < 10) # planes with 9-11 seats planes %>% filter(between(seats, 9, 11)) planes %>% filter(near(seats, 10, 2)) # planes from the year 1998 or 2004 planes %>% filter(year == 1998 | year == 2004) # find planes from EMBRAER where the year is missing planes %>% filter(is.na(year) & manufacturer == "EMBRAER") planes %>% filter(is.na(year), manufacturer == "EMBRAER") ## distinct # get distinct plane manufacturers planes['manufacturer'] %>% distinct() planes %>% distinct(manufacturer) # get number of distinct manufacturer-model pairs planes %>% distinct(manufacturer, model) %>% nrow() ## slice # select first, third and fifth airline airlines %>% slice(c(1,3,5)) # select everything except the first, third and fifth airline airlines %>% slice(-c(1,3,5)) # select rows 4 to 10 airlines %>% slice(4:10) # select 3rd airline from the back airlines %>% slice(n()-2) # select first/last 5 rows airlines %>% slice_head(n=5) airlines %>% slice_tail(n=5) # select first/last 20% of rows airlines %>% slice_head(prop=0.2) airlines %>% slice_tail(prop=0.2) # select at least 5 planes with min/max number of seats planes %>% slice_min(seats, n=5) planes %>% slice_max(seats, n=5) # select exactly 5 planes with min/max number of seats (output first in case of ties) planes %>% slice_min(seats, n=5, with_ties = F) planes %>% slice_max(seats, n=5, with_ties = F) # uniformly sample 5 airlines airlines %>% slice_sample(n=5) # uniformly sample 5 airlines (allow duplicates) airlines %>% slice_sample(n=5, replace = T) # sample weather entries with more weight given to entries with high temperature weather %>% filter(!is.na(temp)) %>% slice_sample(n = 5, weight_by = temp) ### Variable extraction ## select # select name, latitude and longitude from all airports airports %>% select(name, lat, lon) # select columns year - model from planes planes %>% select(year:model) # select wind information from weather entries [wind_dir, wind_speed, wind_gust] weather %>% select(starts_with("wind_")) weather %>% select(contains("wind_")) # select time information which is not departure related # [arr_time, air_time, ...] flights %>% select(ends_with("_time") & !contains("dep_")) # select columns from weather which contain 'wind' or 'pre' using RegEx weather %>% select(matches("wind|pre")) # select numeric columns from planes planes %>% select(where(is.numeric)) ## pull # extract altitudes of airports as a vector airports %>% pull(alt) # we can also use numbers for indexing airports %>% pull(1) # first col airports %>% pull(-1) # last col # pull named vector of years with model column as names planes %>% pull(model, year) ### Table manipulation ## arrange # sort planes first by year and model (higher priority for year) planes %>% arrange(year, model) # sort planes descending by Nr. of seats planes %>% arrange(desc(seats)) ## relocate # move model & manufacturer to the beginning of planes planes %>% relocate(model, manufacturer) # move year to the end planes %>% relocate(year, .after=last_col()) # renames are also possible # move speed before the engines column and rename to velocity planes %>% relocate(velocity=speed, .before=engines) # move numeric variables to the beginning planes %>% relocate(where(is.numeric), .before=where(is.character)) ## rename # rename lat -> latitude; lon -> longitude; alt -> altitude airports %>% rename(latitude=lat, longitude=lon, altitude=alt) # rename_with to use arbitrary functions # convert all columns to uppercase flights %>% rename_with(toupper) # column conditions # convert columns ending with '_time' to uppercase flights %>% rename_with(toupper, ends_with("_time")) # formulas can be used for functions with multiple variables # replace '_' with ':' in variable names flights %>% rename_with( ~gsub("_", ":", ., fixed = TRUE) ) ## mutate # create column speed per engine planes %>% filter(!is.na(speed)) %>% mutate(speed_per_engine=speed/engines) # remove type and manufacturer column planes %>% mutate(type=NULL, manufacturer=NULL) # convert carrier and origin columns to factor flights %>% mutate(carrier=as.factor(carrier), origin=as.factor(origin)) # collect origin, destination, air time and distance and # calculate avg speed of each flight flights %>% transmute(origin, dest, air_time, distance, avg_speed=distance/air_time*60) ################################# lubridate ###################################################################### library(lubridate) # Parsing date times ymd("20110604") mdy("06-04-2011") dmy("04/06/2011") arrive <- ymd_hms("2020-02-06 12:00:00", tz = "Australia/Melbourne") arrive leave <- ymd_hms("2020-03-06 14:00:00", tz = "Australia/Melbourne") leave as.numeric(ymd("19700103")) as.numeric(ymd_hms("19700101 00:01:30")) # Setting and extracting information hour(arrive) hour(arrive) <- 16 arrive wday(arrive) wday(arrive, label = TRUE) # time zones OlsonNames() grep("Vienna", OlsonNames(), value=TRUE) call <- ymd_hms("20200227 18:00:00", tz="Europe/Vienna") with_tz(call, tzone = "Australia/Melbourne") wrong <- force_tz(call, tzone="Australia/Melbourne") with_tz(wrong, tzone = "Europe/Vienna") # time intervals melbourne <- interval(arrive, leave) melbourne dad <- interval(ymd("20200303", tz="Australia/Melbourne"), ymd("20200311", tz="Australia/Melbourne")) dad int_overlaps(melbourne, dad) # arithmetic as.period(dad) as.duration(dad) melbourne / ddays(1) melbourne %/% days(1) arrive + months(1) arrive + dmonths(1) arrive2 <- update(arrive, year=2021) arrive2 + months(1) arrive2 + dmonths(1)