# STAT 29000
# Project 3 Solutions
# by Mark Daniel Ward
# Load the Data Expo 2009 flight data.
# read_years() reads the yearly CSV files for the given years from the
# shared data directory and row-binds them into one data frame, replacing
# 22 copy-pasted read.csv calls with a single loop over the file paths.
read_years <- function(years) {
  paths <- file.path("/data/public/dataexpo2009", paste0(years, ".csv"))
  do.call(rbind, lapply(paths, read.csv))
}
# Three recent years (2006-2008), used throughout the questions below.
bigDF <- read_years(2006:2008)
# The complete data set (1987-2008).
verybigDF <- read_years(1987:2008)
#1a is.na() flags the missing departure times. The mean of a logical
# vector is the proportion of TRUE values (TRUE counts as 1, FALSE as 0),
# so this gives the fraction of DepTime entries that are NA,
# namely 0.01939045.
mean(is.na(bigDF$DepTime))
#1b Inspect the distinct non-missing departure times; several are not
# valid clock times, as the listing shows:
depTimes <- bigDF$DepTime[!is.na(bigDF$DepTime)]
levels(as.factor(depTimes))
# Whether to accept 0000 and/or 2400 (i.e., how midnight is encoded) is a
# judgment call; here 0000 is allowed but 2400 is not. Other choices are
# also reasonable.
# For the times below 2400, the minutes portion -- the remainder after
# dividing by 100 -- should be below 60. No time below 2400 violates this:
sum(depTimes[depTimes < 2400] %% 100 >= 60)
# So the only invalid values are those at or above 2400:
sum(depTimes >= 2400)
# There are 2695 such values.
#2a Bin each flight's DepTime into one of four six-hour windows, then use
# tapply to average the ArrDelay values within each window.
sixHourBins <- cut(bigDF$DepTime, breaks = c(0, 600, 1200, 1800, 2400),
                   include.lowest = TRUE)
tapply(bigDF$ArrDelay, sixHourBins, mean, na.rm = TRUE)
# By this measure the best window to depart in is 6 AM to 12 noon.
#2b The same grouping with var instead of mean: arrival delays are most
# variable for flights departing between 12 midnight and 6 AM.
tapply(bigDF$ArrDelay, sixHourBins, var, na.rm = TRUE)
#2c Repeat the time-of-day analysis with a second grouping dimension:
# the carrier. tapply accepts a list of factors and produces a
# window-by-carrier matrix of means (and then of variances).
windowAndCarrier <- list(
  cut(bigDF$DepTime, breaks = c(0, 600, 1200, 1800, 2400), include.lowest = TRUE),
  bigDF$UniqueCarrier
)
tapply(bigDF$ArrDelay, windowAndCarrier, mean, na.rm = TRUE)
tapply(bigDF$ArrDelay, windowAndCarrier, var, na.rm = TRUE)
#3a The 10 airports with the most departures. table() tallies the
# occurrences of each code directly -- it is the idiomatic equivalent of
# tapply(x, x, length):
# ATL ORD DFW DEN LAX PHX IAH LAS DTW EWR
sort(table(bigDF$Origin), decreasing = TRUE)[1:10]
#3b The 10 airports with the most arrivals (the same ten, same order):
# ATL ORD DFW DEN LAX PHX IAH LAS DTW EWR
sort(table(bigDF$Dest), decreasing = TRUE)[1:10]
#3c Year-by-year top-10 lists, one column per year, reading left-to-right
# in increasing year order. The answers are pretty consistent from year
# to year.
# top10_by_year() builds the airport-by-year contingency table ONCE and
# then ranks each year's column; the original sapply recomputed the full
# cross-tabulation for every year, and hard-coded the number of years (3).
# apply() over the columns adapts automatically to however many distinct
# years bigDF contains.
top10_by_year <- function(codes, years) {
  counts <- table(codes, years)
  apply(counts, 2, function(col) names(sort(col, decreasing = TRUE))[1:10])
}
# For most departures:
top10_by_year(bigDF$Origin, bigDF$Year)
# For most arrivals:
top10_by_year(bigDF$Dest, bigDF$Year)
#3d The 10 most heavily travelled origin/destination pairs:
# from OGG to HNL, from HNL to OGG, from LAX to LAS, from LAS to LAX,
# from SAN to LAX, from LAX to SAN, from BOS to LGA, from LGA to BOS,
# from SFO to LAX, from HNL to LIH
routeLabels <- paste("from", bigDF$Origin, "to", bigDF$Dest)
sort(table(routeLabels), decreasing = TRUE)[1:10]
#4 Average delays by airport, in both directions: arrival delay grouped
# by destination, departure delay grouped by origin. Compute each set of
# per-airport means once and sort it both ways.
meanArrDelay <- tapply(bigDF$ArrDelay, bigDF$Dest, mean, na.rm = TRUE)
meanDepDelay <- tapply(bigDF$DepDelay, bigDF$Origin, mean, na.rm = TRUE)
#4a The 10 airports most likely to be on time for arrivals (on average):
# HVN ITH LIH EAU ITO HTS OGG KOA PIH CDC
sort(meanArrDelay)[1:10]
#4b The 10 airports most likely to be on time for departures (on average):
# GLH WYS PIH HVN ITO COD EKO CDC LIH IYK
sort(meanDepDelay)[1:10]
#4c The 10 airports most likely to be delayed for arrivals (on average):
# MQT OTH ACK SOP HHH ISO MCN EWR TTN CIC
sort(meanArrDelay, decreasing = TRUE)[1:10]
#4d The 10 airports most likely to be delayed for departures (on average):
# ACK PIR PUB SOP OTH CEC ADK LMT CKB AKN
sort(meanDepDelay, decreasing = TRUE)[1:10]
#5a Mean arrival delay for each day of the week: Saturday (DayOfWeek=6)
# is the best day to fly for minimizing delayed arrivals.
tapply(bigDF$ArrDelay, bigDF$DayOfWeek, mean, na.rm = TRUE)
#5b Fraction of all flights falling on each day
# (Monday is DayOfWeek=1; Sunday is DayOfWeek=7):
# 0.1479634 0.1453808 0.1467945 0.1473238 0.1478121 0.1245277 0.1401976
dow <- bigDF$DayOfWeek
tapply(dow, dow, length) / length(dow)
#5c Fraction of flights departing in each six-hour window, restricted to
# valid departure times (non-missing and below 2400, per #1b). The
# original repeated the filtered-DepTime subexpression five times; here
# the mask, the filtered vector, and its binning are each computed once.
depOK <- !is.na(bigDF$DepTime) & bigDF$DepTime < 2400
validDep <- bigDF$DepTime[depOK]
validDepBins <- cut(validDep, breaks = c(0, 600, 1200, 1800, 2400),
                    include.lowest = TRUE)
# 12 midnight to 6 AM: 0.02610195
# 6 AM to 12 noon:     0.38311298
# 12 noon to 6 PM:     0.37239489
# 6 PM to 12 midnight: 0.21839018
tapply(validDep, validDepBins, length) / length(validDep)
#5d The same proportions, further broken down by day of the week
# (entries sum to 1 across the whole table):
tapply(validDep, list(validDepBins, bigDF$DayOfWeek[depOK]), length) /
  length(validDep)
#6 Per-carrier proportion of flights with a positive arrival delay (6a)
# and with a nonpositive one (6b). Each numerator counts the matching
# flights per carrier; the shared denominator counts all flights per
# carrier. sort() is ascending, so the carriers of interest appear at the
# end of each printout.
# NOTE(review): rows with missing ArrDelay are counted in the denominator
# but in neither numerator, so the two proportions need not sum to 1 per
# carrier -- confirm this matches the intended methodology.
flightsPerCarrier <- tapply(bigDF$ArrDelay, bigDF$UniqueCarrier, length)
#6a The most likely carriers to be delayed are: OH F9 EV NW TZ
late <- bigDF$ArrDelay > 0
sort(tapply(bigDF$ArrDelay[late], bigDF$UniqueCarrier[late], length) /
  flightsPerCarrier)
#6b The most likely carriers to be on time are: FL WN 9E AQ HA
onTime <- bigDF$ArrDelay <= 0
sort(tapply(bigDF$ArrDelay[onTime], bigDF$UniqueCarrier[onTime], length) /
  flightsPerCarrier)
#7a Fraction of flights cancelled, broken down by year (rows) and month
# (columns): cancelled count over total count in each cell.
cancelled <- bigDF$Cancelled == 1
tapply(bigDF$Cancelled[cancelled],
       list(bigDF$Year[cancelled], bigDF$Month[cancelled]), length) /
  tapply(bigDF$Cancelled, list(bigDF$Year, bigDF$Month), length)
#7b Collapsing across years, the three worst months for cancellations
# are Feb (worst of all!), Dec (2nd worst), and Jan (3rd worst):
sort(tapply(bigDF$Cancelled[cancelled], list(bigDF$Month[cancelled]), length) /
  tapply(bigDF$Cancelled, list(bigDF$Month), length), decreasing = TRUE)[1:3]
#8 Count the flights per year departing from O'Hare (ORD) and from
# Indianapolis (IND), then compare the two with a dotchart.
ChiDF <- subset(bigDF, Origin == "ORD")
ChiCounts <- tapply(ChiDF$Year, ChiDF$Year, length)
IndDF <- subset(bigDF, Origin == "IND")
IndCounts <- tapply(IndDF$Year, IndDF$Year, length)
# rbind labels the rows with the variable names, which dotchart then
# uses as the group labels in the plot:
dotchart(rbind(ChiCounts, IndCounts))
#9 Restrict to 2007 departures from ORD and IND, then compare, month by
# month, the fraction of flights delayed more than 30 minutes at
# departure.
Chi2007DF <- subset(bigDF, Origin == "ORD" & Year == 2007)
Ind2007DF <- subset(bigDF, Origin == "IND" & Year == 2007)
# For each airport: monthly counts of long delays and of all flights.
chiLate <- Chi2007DF$DepDelay > 30
Chi2007DelayCounts <- tapply(Chi2007DF$DepDelay[chiLate],
                             Chi2007DF$Month[chiLate], length)
Chi2007AllCounts <- tapply(Chi2007DF$DepDelay, Chi2007DF$Month, length)
indLate <- Ind2007DF$DepDelay > 30
Ind2007DelayCounts <- tapply(Ind2007DF$DepDelay[indLate],
                             Ind2007DF$Month[indLate], length)
Ind2007AllCounts <- tapply(Ind2007DF$DepDelay, Ind2007DF$Month, length)
# Stack the two monthly proportion vectors into a 2-row matrix, label the
# rows by airport, and plot.
M <- rbind(Chi2007DelayCounts / Chi2007AllCounts,
           Ind2007DelayCounts / Ind2007AllCounts)
row.names(M) <- c("ORD", "IND")
dotchart(M)
#10 Transposing M swaps the roles of rows and columns (airports and
# months), so the dotchart from #9 is regrouped by month rather than by
# airport.
newM <- t(M)
# Plot the transposed data:
dotchart(newM)