1 Libraries

library(dplyr)
# library(ggplot2)
library(ggcorrplot)
library(sf)
library(mgcv)
library(boot)
# library(varhandle)
# library(tidyr)
# library(ggplot2)
# library(hrbrthemes)
library(lubridate)

2 Data preparation

# Load the cleaned flight records and derive the columns used by the
# analysis below (calendar parts, a numeric delay, and a delay indicator).
flights <- readRDS("~/Desktop/HackDays/twist_zrh_cleaned.RDS")
flights$airline_name <- as.factor(flights$airline_name)
# Numeric copy of the delay. The cut-off below is applied to this column so
# the comparison is numeric regardless of how `diff_in_secs` is stored.
flights$diff_in_sec <- as.numeric(flights$diff_in_secs)
flights$precip <- as.numeric(flights$precip)
flights$month <- month(flights$date)
flights$hour <- hour(flights$planed_time)
flights$day <- day(flights$date)

# Cut-off = 30 min: a flight counts as delayed when its delay exceeds this.
delay_cutoff_secs <- 30 * 60
# Build the indicator in one vectorised step with explicit levels. Unlike a
# subscripted assignment (`flights[cond, ]$delayed <- 1`), this neither errors
# on missing delays nor when no flight exceeds the cut-off; flights with a
# missing delay get `NA` instead of being silently labelled "no".
flights$delayed <- factor(
  ifelse(flights$diff_in_sec > delay_cutoff_secs, "yes", "no"),
  levels = c("no", "yes")
)
# Constant 1 per row, usable as a flight counter when aggregating.
flights$no_flights <- 1

2.1 Correlation matrix: Weather data

# Pairwise correlations between the numeric weather covariates, drawn as a
# correlation heat map.
flights$precip <- as.numeric(flights$precip)

# Weather columns entering the correlation matrix (defined once; the column
# selection was previously written out twice).
weather_vars <- c(
  "temp_avg", "temp_min", "temp_max",
  "sunshine_dur_min", "global_rad_avg_h", "precip",
  "winddir_h", "windspeed_avg_h", "windspeed_peak_h",
  "airpres", "rel_humid", "lightnings_hour_n", "lightnings_hour_f"
)
num_cov <- flights[, weather_vars]
# nums <- unlist(lapply(num_cov, is.numeric))  # check for numeric cov.

# "complete.obs" drops every row that has a missing value in any of the
# selected columns before the correlations are computed.
corr <- round(cor(num_cov, use = "complete.obs"), 2)
ggcorrplot(corr) +
  ggtitle("Weather data") +
  theme(plot.title = element_text(hjust = 0.5))

# Subsetting

# Keep rows whose `start_landing` flag is "S"
# (presumably departures -- TODO confirm against the data dictionary).
flights_s <- subset(flights, start_landing == "S")

# Previously the airline filter was computed and then immediately overwritten
# by a second pipeline that restarted from `flights_s`, so it never took
# effect. Both steps now live in one pipeline:
#   1. count the flights per date and scheduled hour over ALL rows of
#      `flights_s`, so `flights_per_h` keeps the values it had before;
#   2. keep only airlines with more than 1000 flights;
#   3. drop the grouping so later steps operate on an ungrouped table.
flights_sub <- flights_s %>%
  mutate(h = lubridate::hour(planed_time)) %>%
  group_by(date, h) %>%
  mutate(flights_per_h = n()) %>%
  group_by(airline_name) %>%
  filter(n() > 1000) %>%
  ungroup()

# 2 x 2 panel of spine plots: delay indicator against four weather covariates.
layout(matrix(1:4, nrow = 2))
spineplot(delayed ~ temp_avg, data = flights_sub)
spineplot(delayed ~ windspeed_avg_h, data = flights_sub)
spineplot(delayed ~ precip, data = flights_sub)
spineplot(delayed ~ rel_humid, data = flights_sub)
# Restore the single-panel layout so later plots are not squeezed into the grid.
layout(1)

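The spine plots show the partial association between each discretised numeric explanatory covariate and the delay indicator: the bar widths give the empirical relative frequency of each covariate bin, and the bar heights give the conditional relative frequency of delayed flights within that bin.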

3 Gam-model

# Restrict the data to a single airline. The comparison must use `==`: with a
# single `=` the expression is passed to subset() as a named argument, which
# is ignored, so no rows are filtered and the model is fitted on all airlines.
edelweiss <- subset(flights_sub, airline_name == "Edelweiss Air AG")

# Binomial GAM for the delay indicator: smooth terms for the weather
# covariates, parametric terms for distance, continent and hourly traffic.
fit_bin <- gam(
  delayed ~
    # s(airline_name, bs = "re")
    s(temp_avg)
    + s(precip)
    + s(airpres)
    + s(windspeed_avg_h)
    + s(rel_humid)
    + distance_km
    + continent
    + flights_per_h,
  data = edelweiss,
  family = binomial()
)

# Inverse of the log link, i.e. exp(). Applied to the smooths, which are
# estimated on the linear-predictor scale (logit is the default link of
# binomial()), this draws them as multiplicative effects on the odds.
trans <- function(x) gaussian(link = log)$linkinv(x)
plot(fit_bin, shade = TRUE, ylim = c(0, 3), pages = 1, trans = trans)