#---------------------------------------------------------------------------------------------------
#-------------------------- package preparation ----------------------------------------------------
#---------------------------------------------------------------------------------------------------
##install packages ggplot2, gridExtr, plotly, dplyr, readxl, openxlsx, lubridate (no need to understand this R-Code)
x <- c("ggplot2","gridExtra","plotly","lubridate","readxl","openxlsx","dplyr")
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
print(new.pkg)
if (length(new.pkg)>0){
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
}
ipak(pkg=x)
lapply(x, require, character.only = TRUE)
Sys.setlocale("LC_TIME","English") #set R internal time to English
#------------------------------------------------------------
#--------------Step I: data loading and first look --------
#------------------------------------------------------------
#Download ATM.txt and import via RStudio or use direct link
A<-read.table("http://www.trutschnig.net/ATM.txt",head=TRUE,stringsAsFactors = FALSE) #Alternative: read_delim
summary(A)
head(A)
str(A)
A$ymd<-as.Date(A$ymd) #dplyr alternative: A <- mutate(A,ymd=as.Date(ymd))
plot(A$ymd,A$sum_out,type="l")
#save the data as xlsx File - select your own (!!) directory first
setwd("C:/Users/Trutschnig/Documents/Miktex_docs/Forschung und Uni/Skripten/Data Science Intro/")
options("openxlsx.dateFormat" = "yyyy-mm-dd")
write.xlsx(A,file="ATM.xlsx",sheetName = "Blatt1",row.names = FALSE)
#------------------------------------------------------------
#--------------Step 2: plot the timeseries ----------------
#------------------------------------------------------------
# a look at the first year
A$year<-year(A$ymd)
A7<-subset(A,A$year==2007) #dplyr equivalent: A7 <- A %>% filter(year==2007)
p <- ggplot(data=A7,aes(x=ymd,y=sum_out))
p <- p + geom_line()
p <- p + geom_point()
p <- p + theme_bw()
p
p <- ggplot(data=A,aes(x=ymd,y=sum_out))
p <- p + geom_line()
p <- p + geom_point()
p <- p + facet_wrap(~year,scales = "free_x",nrow=3)
p <- p + ggtitle("Timeseries and smoother per year")
p <- p + theme_bw()
p
#weekdays in different colours (numerical scale)
p <- ggplot(data=A,aes(x=ymd,y=sum_out,color=nr_weekday,size=nr_weekday))
p <- p + geom_line(colour="black")
p <- p + geom_point(size=3)
p <- p + facet_wrap(~year,scales = "free_x",nrow=3)
p <- p + ggtitle("Timeseries and smoother per year")
p <- p + theme_bw()
p
#weekdays in different colours (discrete scale)
p <- ggplot(data=A,aes(x=ymd,y=sum_out,color=as.factor(nr_weekday)))
p <- p + geom_line(colour="black")
p <- p + geom_point(size=3)
p <- p + facet_wrap(~year,scales = "free_x",nrow=3)
p <- p + ggtitle("Timeseries and smoother per year")
p <- p + theme_bw()
p
#show more capabilities of ggplot2 (no need to understand all of it right now) http://docs.ggplot2.org/current/
#improvement: better grid, holidays in colour, smoother
A$monthday<-day(A$ymd) #day of month
a<-max(A$sum_out,na.rm=TRUE)
B<-subset(A,A$monthday==1) #first days per month
A$month<-month(A$ymd)
H<-subset(A,A$holiday==1) #holidays
V<-subset(A,A$holiday==0.5)
p <- ggplot(data=A,aes(x=ymd,y=sum_out))
p <- p + geom_line()
p <- p + geom_point()
p <- p + geom_point(data=H,colour="red",size=3)
p <- p + geom_point(data=V,colour="blue",size=3)
p <- p + scale_x_date(breaks=B$ymd)
p <- p + stat_smooth(se=FALSE,colour="gray")
p <- p + facet_wrap(~year,scales = "free_x",nrow=3)
p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn")
p <- p + theme(panel.grid.minor.x = element_blank())
p <- p + theme_bw()
p
#----------------------------------------------------------------------------------------------
#Exercise: change the last code block in such a way that the smoother is "below" the timeseries
#----------------------------------------------------------------------------------------------
#------------------------------------------------------------
#--------------Step 3: yearly histo and boxplots -----------
#------------------------------------------------------------
#histo
p <- ggplot(data=A,aes(x=sum_out))
p <- p + geom_histogram(aes(fill = ..count..))
p <- p + facet_wrap(~year,nrow=3)
p <- p + theme_bw()
p
#boxplots
p <- ggplot(data=A,aes(x=factor(year),y=sum_out)) #why factor(year)?
p <- p + geom_boxplot(fill="lightblue")
p <- p + geom_jitter()
p <- p + theme_bw()
p
#violin plot
p <- ggplot(data=A,aes(x=factor(year),y=sum_out))
p <- p + geom_violin(adjust=0.5,fill="lightblue")
p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median)
p <- p + theme_bw()
p
#------------------------------------------------------------
#--------------Step 4-5: monthly and daily boxplots --------
#------------------------------------------------------------
#monthly boxplot
p <- ggplot(data=A,aes(x=factor(month),y=sum_out))
p <- p + geom_boxplot(fill="green")
p <- p + facet_wrap(~year)
p <- p + theme_bw()
p
#reorder
p <- ggplot(data=A,aes(x=factor(month),y=sum_out))
p <- p + geom_boxplot(aes(fill=factor(year)))
p <- p + theme_bw()
p
#weekday
p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out))
p <- p + geom_boxplot(aes(fill=factor(year)))
p <- p + theme_bw()
p
#weekday reordered
p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday)))
p <- p + geom_boxplot()
p <- p + facet_wrap(~year)
p <- p + scale_y_continuous(limits=c(0,a))
p <- p + theme_bw()
p
#weekday reordered with holiday info
p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday)))
p <- p + geom_boxplot(outlier.shape = NA)
p <- p + geom_jitter()
p <- p + facet_wrap(~year)
p <- p + scale_y_continuous(limits=c(0,a))
p <- p + theme_bw()
p <- p + scale_fill_discrete(name = "weekday")
p
p01 <- p
#----------------------------------------------------------------------------------------------
#Exercise: produce a violin plot per weekday and year
#----------------------------------------------------------------------------------------------
#------------------------------------------------------------
#--------------Step 6: daily heatmap ----------------------
#------------------------------------------------------------
#heatmap for weekdays
A$yw<-format(A$ymd,"%Y-%W")
farbe<-rainbow(100,start=.40,end=.17)
p <- ggplot(A, aes(x=yw,y=nr_weekday))
p <- p + geom_tile(aes(fill = sum_out),colour = "white")
p <- p + scale_fill_gradientn(colours=farbe)
p <- p + theme_bw()
p <- p + theme(axis.text.x=element_text(angle=90))
p <- p + labs(title ="Sum_out per day and week")
p <- p + facet_wrap(~year,nrow=3,scales="free")
p
p02 <- p
A$ym<-substr(A$ymd,1,7)
p <- ggplot(A, aes(ym,monthday))
p <- p + geom_tile(aes(fill = sum_out),colour = "white")
p <- p + scale_fill_gradientn(colours=farbe)
p <- p + theme_bw()
p <- p + theme(axis.text.x=element_text(angle=90))
p <- p + labs(title = "Sum_out per day and week")
p
#------------------------------------------------------------
#--------------Step 7: save p01 and p02 in one plot----------
#------------------------------------------------------------
pdf(file="plots.pdf",width=20,height=12)
grid.arrange(p01, p02, nrow=1)
print(p02)
dev.off()
#---------------------------------------------------------------------------------------------------
#-------------------------- summarizing/aggregating data--------------------------------------------
#---------------------------------------------------------------------------------------------------
#calculate the mean withdrawn amount per weekday and year.
#Option 1:
G1 <- expand.grid(2007:2009,1:7)
names(G1) <- c("year","weekday")
G1$mean.sum_out <- 0
for(i in 1:nrow(G1)){
B <- subset(A,A$year==G1$year[i] & A$nr_weekday==G1$weekday[i])
G1$mean.sum_out[i] <- mean(B$sum_out,na.rm = TRUE)
}
#Option 2: aggregate
G2 <- aggregate(data = A, sum_out~year+nr_weekday, FUN = "mean")
#Option 3: using dplyr (best option)
G3 <- A %>% #aggregate
group_by(year, nr_weekday) %>%
summarise(mean = mean(sum_out, na.rm = TRUE),count=n())
G3 <- as.data.frame(G3)
#sort
G3 <- arrange(G3,desc(nr_weekday),desc(year))
#--------------------------------------------------------------------------------------------------
#Exercise: Calculate the mean withdrawn amount per month and year; sort by month
#--------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#-------------------------- merging data --------------------------------------------
#---------------------------------------------------------------------------------------------------
W <- data.frame(weekday=A$weekday[1:7],Wochentag=c("Montag","Dienstag","Mittwoch","Donnerstag","Freitag","Samstag","Sonntag"))
A1 <- merge(A,W) #Alternative: inner_join from dplyr package
head(A1)
A1 <- subset(A1,select = c(names(A),"Wochentag"))
A1 <- arrange(A1,ymd)
head(A1)
#----------------------------------------------------------------------------------------------------
#-------------------------------- Exercises RTR data ------------------------------------------------
#----------------------------------------------------------------------------------------------------
#load dataset
dir <- url("http://www.trutschnig.net/RTR2015.RData")
load(dir)
RTR <- RTR2015
summary(RTR)
p <- ggplot(data=RTR,aes(x=longitude,y=latitude))
p <- p + geom_point(size=0.3)
p <- p + theme_bw()
p
#Ex1:
#Ex2:
#yearly boxplot
#Ex3:
#monthly boxplot
#Ex4:
#monthly boxplot with other colours
cols <- c("green","magenta","gray")
#Ex5: ecdf per month and op
p <- ggplot(data=RTR,aes(x=rtr_speed_dl,color=op_name))
p <- p + stat_ecdf(geom = "step")
p <- p + facet_wrap(~mym)
p
#Ex6: Merge region info to data and aggregate
dir <- url("http://www.trutschnig.net/iso12.RData")
load(dir)
head(iso12)
summary(iso12)
#Ex7: Boxplots speed per operator and region, 2014 vs. 2015
#Ex8: Heatmap depicting the monthly development of downlink speed per region
#Ex9: same as Ex8 but distinguishing op
#Ex10: Fastest measurment per device
#Ex11: Downlink vs uplink
#Ex12: Number measurements per rectangle
#Ex13: Baselining 2014 vs 2015
#----------------------------------------------------------------------------------------------------
#-------------------------------- Interactive graphics ----------------------------------------------
#----------------------------------------------------------------------------------------------------
library(plotly)
library(htmlwidgets)
dir <- url("http://www.trutschnig.net/RTR2015.RData")
load(dir)
RTR <- RTR2015
dir <- url("http://www.trutschnig.net/iso12.RData")
load(dir)
head(iso12)
summary(iso12)
R2 <- merge(RTR,iso12)
R2 <- R2[order(R2$id),]
head(R2)
R2$my <- year(R2$mymd)
Z <- R2 %>%
group_by(my, district,region,iso_adm2) %>%
summarise(speed = median(rtr_speed_dl, na.rm = TRUE),count=n())
Z <- as.data.frame(Z)
Z <- arrange(Z,iso_adm2)
Z14 <- subset(Z,Z$my==2014,select = -my)
names(Z14)[4:5] <- paste(names(Z14)[4:5],"14",sep="")
Z15 <- subset(Z,Z$my==2015,,select = -my)
names(Z15)[4:5] <- paste(names(Z15)[4:5],"15",sep="")
Z <- merge(Z14,Z15)
p <- ggplot(data=Z,aes(x=speed14,y=speed15,size=count15,colour=region))
p <- p + geom_abline(intercept = 0,slope = 1)
p <- p + geom_point()
p <- p + theme_bw()
p <- p + ggtitle("Median download speed 2014 versus 2015 per district")
p
#make it interactive
ggplotly(p)
#include more info:
p <- ggplot(data=Z,aes(x=speed14,y=speed15,size=count15,colour=region))
p <- p + geom_abline(intercept = 0,slope = 1)
p <- p + geom_point()
p <- p + geom_point(aes(text = paste('Bezirk: ', as.character(district),
'
Bundesland: ', region,
'
Speed 2014: ', speed14,
'
Speed 2015: ', speed15,
'
Measurements 2014: ', count14,
'
Measurements 2015: ', count15,sep="")))
p <- p + theme_bw()
p <- p + ggtitle("Median download speed 2014 versus 2015 per district")
ggplotly(p,tooltip = c("text"))
#save the file
htmlwidgets::saveWidget(ggplotly(p,width=1650,height=730,tooltip = c("text")), file="baselining.html")
######################################################################################################
##### Additional useful things #######################################################################
######################################################################################################
# Clean memory: Use rm in combination with gc to free up memory
#-----------------------------------------------------------------------------------------------------
#---------- Use loop to readin/write several files --------------------------------------------------
#-----------------------------------------------------------------------------------------------------
# read-in 20 files and export them as big xlsx file with several sheets
#Step 1: Readin:
file.names <- paste("http://www.trutschnig.net/school_results/","school-",LETTERS[1:20],".txt",sep="")
LL <- vector("list",length=length(file.names))
for(i in 1:length(LL)){
print(i)
LL[[i]] <- read.table(file.names[i],head=TRUE)
}
#Step 2: Write out in one joint Excel file
write.xlsx(LL,file="school_results.xlsx")