# STAT 29000
# Project 6 Solutions
# by Mark Daniel Ward
# 1.
DF <- read.csv("http://llc.stat.purdue.edu/2014/29000/projects/saturn03.240.A.CT_2012_06_PD0.csv")
head(DF)
# 1a.
plot(DF$water_temperature[DF$water_temperature<500 & DF$water_electrical_conductivity<20],
DF$water_electrical_conductivity[DF$water_temperature<500 & DF$water_electrical_conductivity<20])
mylm <- lm( DF$water_electrical_conductivity[DF$water_temperature<500 & DF$water_electrical_conductivity<20]
~ DF$water_temperature[DF$water_temperature<500 & DF$water_electrical_conductivity<20])
abline(mylm)
# 1b.
plot(DF$water_temperature[DF$water_temperature<500], DF$water_salinity[DF$water_temperature<500])
mylm <- lm( DF$water_salinity[DF$water_temperature<500]
~ DF$water_temperature[DF$water_temperature<500])
abline(mylm)
# 1c.
plot(DF$water_salinity[DF$water_electrical_conductivity<20], DF$water_electrical_conductivity[DF$water_electrical_conductivity<20])
mylm <- lm( DF$water_electrical_conductivity[DF$water_electrical_conductivity<20]
~ DF$water_salinity[DF$water_electrical_conductivity<20])
abline(mylm)
# 1d. The last of these linear models seems most amenable to linear modeling,
# because salinity and electrical conductivity seem to be (perhaps) almost linearly related.
# We can also read the summaries of the 3 linear models in 1a, 1b, and 1c, to make a more precise comparison.
# 2a.
mylm <- lm( mtcars$mpg ~ mtcars$hp)
plot( mtcars$hp, mtcars$mpg)
abline(mylm)
# 2b.
mylm <- lm( mtcars$mpg ~ mtcars$hp + mtcars$disp)
# 2c.
summary(mylm)
# So we could estimate the mpg to be about 20.10484 if the car has 147 hp and 230 disp.
30.735904 - 0.024840*147 - 0.030346*230
# 3a. First we get the 1990 data.
DF <- read.csv("/data/public/dataexpo2009/1990.csv")
# 3b. Next, we take a subset with just the flights from June 1990.
juneDF <- subset(DF, Month==6)
# 3c. To predict the arrival delays from the departure delays, we build a linear model:
mylm <- lm( juneDF$ArrDelay ~ juneDF$DepDelay)
# 3d. Here is a plot of both types of delays:
plot( juneDF$DepDelay, juneDF$ArrDelay )
# 3e. Now we add the regression line
abline(mylm)
# 3f. Now we do these steps again, removing some of the outliers.
mylm <- lm( juneDF$ArrDelay[juneDF$DepDelay < 500 & juneDF$DepDelay > -50]
~ juneDF$DepDelay[juneDF$DepDelay < 500 & juneDF$DepDelay > -50])
plot( juneDF$DepDelay[juneDF$DepDelay < 500 & juneDF$DepDelay > -50],
juneDF$ArrDelay[juneDF$DepDelay < 500 & juneDF$DepDelay > -50] )
abline(mylm)
# 4a. Here are 100 (continuous) Uniform random numbers, uniformly distributed between 0 and 1:
U <- runif(100)
# 4b. Now we make the required transformation:
V <- -log(U)/3
# 4c. Here are 100 Exponential random numbers, with rate 3
X <- rexp(100, rate=3)
# 4d. Here is the desired qqplot; notice that it almost looks like a straight line.
qqplot( V, X)
# 4e. Now we re-do parts 4a through 4d with a million (instead of a hundred) numbers per variable:
U <- runif(1000000)
V <- -log(U)/3
X <- rexp(1000000, rate=3)
qqplot( V, X)
# 5a. Here is the desired matrix:
M <- matrix( runif(1000000), nrow=1000)
# 5b. Here are the row sums:
v <- apply(M, 1, sum)
# 5c. Here are the scaled-and-shifted values:
w <- (v-500)/sqrt(1000/12)
# 5d. Here is the desired qqplot
x <- rnorm(1000)
qqplot( w,x )
# 6a. Here is the desired matrix:
M <- matrix( rexp(100000000, rate=5), nrow=10000)
# 6b. Here are the row sums:
v <- apply(M, 1, sum)
# 6c. Here are the scaled-and-shifted values:
w <- (v-2000)/20
# 6d. Here is the desired qqplot
x <- rnorm(10000)
qqplot( w,x )
# 7a. Here is a multiple linear regression model
# for concentration, based on weight, dose, and time, in the Theoph data set
mylm <- lm( Theoph$conc ~ Theoph$Wt + Theoph$Dose + Theoph$Time)
# 7b.
summary(mylm)
# We could estimate the concentration to be about 4.22351 mg/L
# if the person weighs 66kg, receives a dose of 4 mg/kg,
# and it has been 6 hours since the dose was administered.
-4.45137 + 0.06832*66 + 1.16716*4 - 0.12572*4
# 8. The vector of departure delays that are longer than 30 has length 28022:
length(juneDF$DepDelay[juneDF$DepDelay > 30])
# but does not look Normal:
qqplot(juneDF$DepDelay[juneDF$DepDelay > 30], rnorm(28022))
# but does not look Uniform:
qqplot(juneDF$DepDelay[juneDF$DepDelay > 30], runif(28022))
# but it DOES look Exponential:
qqplot(juneDF$DepDelay[juneDF$DepDelay > 30], rexp(28022))
# We can estimate the rate by first taking the mean of these delays, which turns out to be: 70.65488
mean(juneDF$DepDelay[juneDF$DepDelay > 30], na.rm=T)
# and then we can take the rate to be the inverse of this mean:
qqplot(juneDF$DepDelay[juneDF$DepDelay > 30], rexp(28022, rate=1/70.65488))
# Indeed, if we explore a little more, we might find that the delays are roughly
# distributed like 30 + an Exponential, so we could do something more precise like this:
# We see that, on average, the delay is 40.65488 minutes beyond 30:
mean(juneDF$DepDelay[juneDF$DepDelay > 30] - 30, na.rm=T)
# So it makes more sense to compare the DepDelay - 30 to an Exponential:
qqplot(juneDF$DepDelay[juneDF$DepDelay > 30] - 30, rexp(28022, rate=1/40.65488))
# This fit is not exact, but it is reasonable for a very rough approximation.