#--------------------------------------------------------------------------------------------------- #-------------------------- package preparation ---------------------------------------------------- #--------------------------------------------------------------------------------------------------- ##install packages ggplot2, gridExtr, plotly, dplyr, readxl, openxlsx, lubridate (no need to understand this R-Code) x <- c("ggplot2","gridExtra","plotly","lubridate","readxl","openxlsx","dplyr") ipak <- function(pkg){ new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] print(new.pkg) if (length(new.pkg)>0){ install.packages(new.pkg, dependencies = TRUE) sapply(pkg, require, character.only = TRUE) } } ipak(pkg=x) lapply(x, require, character.only = TRUE) Sys.setlocale("LC_TIME","English") #set R internal time to English #------------------------------------------------------------ #--------------Step I: data loading and first look -------- #------------------------------------------------------------ #Download ATM.txt and import via RStudio or use direct link A<-read.table("http://www.trutschnig.net/ATM.txt",head=TRUE,stringsAsFactors = FALSE) #Alternative: read_delim summary(A) head(A) str(A) A$ymd<-as.Date(A$ymd) #dplyr alternative: A <- mutate(A,ymd=as.Date(ymd)) plot(A$ymd,A$sum_out,type="l") #save the data as xlsx File - select your own (!!) directory first setwd("C:/Users/Trutschnig/Documents/Miktex_docs/Forschung und Uni/Skripten/Data Science Intro/") options("openxlsx.dateFormat" = "yyyy-mm-dd") write.xlsx(A,file="ATM.xlsx",sheetName = "Blatt1",row.names = FALSE) #------------------------------------------------------------ #--------------Step 2: plot the timeseries ---------------- #------------------------------------------------------------ # a look at the first year A$year<-year(A$ymd) A7<-subset(A,A$year==2007) #dplyr equivalent: A7 <- A %>% filter(year==2007) p <- ggplot(data=A7,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + theme_bw() p p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + ggtitle("Timeseries and smoother per year") p <- p + theme_bw() p #weekdays in different colours (numerical scale) p <- ggplot(data=A,aes(x=ymd,y=sum_out,color=nr_weekday,size=nr_weekday)) p <- p + geom_line(colour="black") p <- p + geom_point(size=3) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + ggtitle("Timeseries and smoother per year") p <- p + theme_bw() p #weekdays in different colours (discrete scale) p <- ggplot(data=A,aes(x=ymd,y=sum_out,color=as.factor(nr_weekday))) p <- p + geom_line(colour="black") p <- p + geom_point(size=3) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + ggtitle("Timeseries and smoother per year") p <- p + theme_bw() p #show more capabilities of ggplot2 (no need to understand all of it right now) http://docs.ggplot2.org/current/ #improvement: better grid, holidays in colour, smoother A$monthday<-day(A$ymd) #day of month a<-max(A$sum_out,na.rm=TRUE) B<-subset(A,A$monthday==1) #first days per month A$month<-month(A$ymd) H<-subset(A,A$holiday==1) #holidays V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + geom_point(data=H,colour="red",size=3) p <- p + geom_point(data=V,colour="blue",size=3) p <- p + scale_x_date(breaks=B$ymd) p <- p + stat_smooth(se=FALSE,colour="gray") p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p #---------------------------------------------------------------------------------------------- #Exercise: change the last code block in such a way that the smoother is "below" the timeseries #---------------------------------------------------------------------------------------------- #------------------------------------------------------------ #--------------Step 3: yearly histo and boxplots ----------- #------------------------------------------------------------ #histo p <- ggplot(data=A,aes(x=sum_out)) p <- p + geom_histogram(aes(fill = ..count..)) p <- p + facet_wrap(~year,nrow=3) p <- p + theme_bw() p #boxplots p <- ggplot(data=A,aes(x=factor(year),y=sum_out)) #why factor(year)? p <- p + geom_boxplot(fill="lightblue") p <- p + geom_jitter() p <- p + theme_bw() p #violin plot p <- ggplot(data=A,aes(x=factor(year),y=sum_out)) p <- p + geom_violin(adjust=0.5,fill="lightblue") p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median) p <- p + theme_bw() p #------------------------------------------------------------ #--------------Step 4-5: monthly and daily boxplots -------- #------------------------------------------------------------ #monthly boxplot p <- ggplot(data=A,aes(x=factor(month),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + theme_bw() p #reorder p <- ggplot(data=A,aes(x=factor(month),y=sum_out)) p <- p + geom_boxplot(aes(fill=factor(year))) p <- p + theme_bw() p #weekday p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out)) p <- p + geom_boxplot(aes(fill=factor(year))) p <- p + theme_bw() p #weekday reordered p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_boxplot() p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p #weekday reordered with holiday info p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_boxplot(outlier.shape = NA) p <- p + geom_jitter() p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p <- p + scale_fill_discrete(name = "weekday") p p01 <- p #---------------------------------------------------------------------------------------------- #Exercise: produce a violin plot per weekday and year #---------------------------------------------------------------------------------------------- #------------------------------------------------------------ #--------------Step 6: daily heatmap ---------------------- #------------------------------------------------------------ #heatmap for weekdays A$yw<-format(A$ymd,"%Y-%W") farbe<-rainbow(100,start=.40,end=.17) p <- ggplot(A, aes(x=yw,y=nr_weekday)) p <- p + geom_tile(aes(fill = sum_out),colour = "white") p <- p + scale_fill_gradientn(colours=farbe) p <- p + theme_bw() p <- p + theme(axis.text.x=element_text(angle=90)) p <- p + labs(title ="Sum_out per day and week") p <- p + facet_wrap(~year,nrow=3,scales="free") p p02 <- p A$ym<-substr(A$ymd,1,7) p <- ggplot(A, aes(ym,monthday)) p <- p + geom_tile(aes(fill = sum_out),colour = "white") p <- p + scale_fill_gradientn(colours=farbe) p <- p + theme_bw() p <- p + theme(axis.text.x=element_text(angle=90)) p <- p + labs(title = "Sum_out per day and week") p #------------------------------------------------------------ #--------------Step 7: save p01 and p02 in one plot---------- #------------------------------------------------------------ pdf(file="plots.pdf",width=20,height=12) grid.arrange(p01, p02, nrow=1) print(p02) dev.off() #--------------------------------------------------------------------------------------------------- #-------------------------- summarizing/aggregating data-------------------------------------------- #--------------------------------------------------------------------------------------------------- #calculate the mean withdrawn amount per weekday and year. #Option 1: G1 <- expand.grid(2007:2009,1:7) names(G1) <- c("year","weekday") G1$mean.sum_out <- 0 for(i in 1:nrow(G1)){ B <- subset(A,A$year==G1$year[i] & A$nr_weekday==G1$weekday[i]) G1$mean.sum_out[i] <- mean(B$sum_out,na.rm = TRUE) } #Option 2: aggregate G2 <- aggregate(data = A, sum_out~year+nr_weekday, FUN = "mean") #Option 3: using dplyr (best option) G3 <- A %>% #aggregate group_by(year, nr_weekday) %>% summarise(mean = mean(sum_out, na.rm = TRUE),count=n()) G3 <- as.data.frame(G3) #sort G3 <- arrange(G3,desc(nr_weekday),desc(year)) #-------------------------------------------------------------------------------------------------- #Exercise: Calculate the mean withdrawn amount per month and year; sort by month #-------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #-------------------------- merging data -------------------------------------------- #--------------------------------------------------------------------------------------------------- W <- data.frame(weekday=A$weekday[1:7],Wochentag=c("Montag","Dienstag","Mittwoch","Donnerstag","Freitag","Samstag","Sonntag")) A1 <- merge(A,W) #Alternative: inner_join from dplyr package head(A1) A1 <- subset(A1,select = c(names(A),"Wochentag")) A1 <- arrange(A1,ymd) head(A1) #---------------------------------------------------------------------------------------------------- #-------------------------------- Exercises RTR data ------------------------------------------------ #---------------------------------------------------------------------------------------------------- #load dataset dir <- url("http://www.trutschnig.net/RTR2015.RData") load(dir) RTR <- RTR2015 summary(RTR) p <- ggplot(data=RTR,aes(x=longitude,y=latitude)) p <- p + geom_point(size=0.3) p <- p + theme_bw() p #Ex1: #Ex2: #yearly boxplot #Ex3: #monthly boxplot #Ex4: #monthly boxplot with other colours cols <- c("green","magenta","gray") #Ex5: ecdf per month and op p <- ggplot(data=RTR,aes(x=rtr_speed_dl,color=op_name)) p <- p + stat_ecdf(geom = "step") p <- p + facet_wrap(~mym) p #Ex6: Merge region info to data and aggregate dir <- url("http://www.trutschnig.net/iso12.RData") load(dir) head(iso12) summary(iso12) #Ex7: Boxplots speed per operator and region, 2014 vs. 2015 #Ex8: Heatmap depicting the monthly development of downlink speed per region #Ex9: same as Ex8 but distinguishing op #Ex10: Fastest measurment per device #Ex11: Downlink vs uplink #Ex12: Number measurements per rectangle #Ex13: Baselining 2014 vs 2015 #---------------------------------------------------------------------------------------------------- #-------------------------------- Interactive graphics ---------------------------------------------- #---------------------------------------------------------------------------------------------------- library(plotly) library(htmlwidgets) dir <- url("http://www.trutschnig.net/RTR2015.RData") load(dir) RTR <- RTR2015 dir <- url("http://www.trutschnig.net/iso12.RData") load(dir) head(iso12) summary(iso12) R2 <- merge(RTR,iso12) R2 <- R2[order(R2$id),] head(R2) R2$my <- year(R2$mymd) Z <- R2 %>% group_by(my, district,region,iso_adm2) %>% summarise(speed = median(rtr_speed_dl, na.rm = TRUE),count=n()) Z <- as.data.frame(Z) Z <- arrange(Z,iso_adm2) Z14 <- subset(Z,Z$my==2014,select = -my) names(Z14)[4:5] <- paste(names(Z14)[4:5],"14",sep="") Z15 <- subset(Z,Z$my==2015,,select = -my) names(Z15)[4:5] <- paste(names(Z15)[4:5],"15",sep="") Z <- merge(Z14,Z15) p <- ggplot(data=Z,aes(x=speed14,y=speed15,size=count15,colour=region)) p <- p + geom_abline(intercept = 0,slope = 1) p <- p + geom_point() p <- p + theme_bw() p <- p + ggtitle("Median download speed 2014 versus 2015 per district") p #make it interactive ggplotly(p) #include more info: p <- ggplot(data=Z,aes(x=speed14,y=speed15,size=count15,colour=region)) p <- p + geom_abline(intercept = 0,slope = 1) p <- p + geom_point() p <- p + geom_point(aes(text = paste('Bezirk: ', as.character(district), '
Bundesland: ', region, '
Speed 2014: ', speed14, '
Speed 2015: ', speed15, '
Measurements 2014: ', count14, '
Measurements 2015: ', count15,sep=""))) p <- p + theme_bw() p <- p + ggtitle("Median download speed 2014 versus 2015 per district") ggplotly(p,tooltip = c("text")) #save the file htmlwidgets::saveWidget(ggplotly(p,width=1650,height=730,tooltip = c("text")), file="baselining.html") ###################################################################################################### ##### Additional useful things ####################################################################### ###################################################################################################### # Clean memory: Use rm in combination with gc to free up memory #----------------------------------------------------------------------------------------------------- #---------- Use loop to readin/write several files -------------------------------------------------- #----------------------------------------------------------------------------------------------------- # read-in 20 files and export them as big xlsx file with several sheets #Step 1: Readin: file.names <- paste("http://www.trutschnig.net/school_results/","school-",LETTERS[1:20],".txt",sep="") LL <- vector("list",length=length(file.names)) for(i in 1:length(LL)){ print(i) LL[[i]] <- read.table(file.names[i],head=TRUE) } #Step 2: Write out in one joint Excel file write.xlsx(LL,file="school_results.xlsx")