# STAT 29000
# Project 2 Solutions
# by Mark Daniel Ward
# 1ab Fetch the 2.4m-depth data set from the course server and load it
# into a data frame:
DF <- read.csv(
  file = "http://llc.stat.purdue.edu/2014/29000/projects/saturn03.240.A.CT_2012_06_PD0.csv"
)
# 1c Parse the first column (the timestamps, written as
# year/month/day hour:minute:second) into date-times with strptime:
timevec <- strptime(x = DF[, 1], format = "%Y/%m/%d %H:%M:%S")
# 2a Tabulate the gaps between consecutive measurements, order the
# counts from largest to smallest, and keep the first entry.
# The most common gap is 3 seconds; a 3 second gap occurs 188681 times.
sort(table(diff(timevec)), decreasing = TRUE)[1L]
# 2b The mean gap between consecutive measurements is 12.7873 seconds.
# That is far above the typical 3 seconds: the mean is skewed upward
# by several very large gaps, e.g., when the machine collecting
# the data is broken for long periods.
mean(x = diff(timevec))
# 3a Count the gaps that exceed 15 (seconds): the machine gets stuck
# a total of 10688 times.
sum(diff(timevec) > 15)
# 3b Find the position of the largest gap. Either of the next two
# lines does this; the value from the second assignment is the one kept:
t <- which(diff(timevec) == max(diff(timevec)))
t <- which.max(diff(timevec))
# The longest outage lasted 128661 seconds:
diff(timevec)[t]
# or (equivalently) 1.489132 days, measured between its two endpoints:
difftime(timevec[t + 1], timevec[t])
# That outage runs from
# June 17, 2012, 5:24:00 PM, to June 19, 2012, 5:08:21 AM.
timevec[t]
timevec[t + 1]
# 3c The ten longest outages (in seconds), largest first, are:
# 128661 106344 62985 42282 33441 30930 26958 19416 12942 8613
sort(diff(timevec), decreasing = TRUE)[seq_len(10)]
# 4a Plotting the **electrical conductivity** readings shows one
# (or more) outliers, falsely reported to be about 25 or so:
plot(DF$water_electrical_conductivity)
# Every remaining reading lies between 11.051 and 17.845:
range(
  DF$water_electrical_conductivity[DF$water_electrical_conductivity < 25]
)
# Re-plotting without the high readings shows no apparent outliers
# on the lower side:
plot(
  DF$water_electrical_conductivity[DF$water_electrical_conductivity < 25]
)
# Listing the positions of the readings above 25 shows that there
# are actually TWO outliers:
which(DF$water_electrical_conductivity > 25)
# Both occur on June 26, 2012, at 2:36 PM and 3:33 PM
# (positions 168443 and 168742 in the data):
timevec[168443]
timevec[168742]
# 4b The first of these outliers coincides with the outlier in the
# temperature data, i.e., both are at time index 168443.
# 4c A plot of the salinity data shows (visually) no apparent outliers:
plot(DF$water_salinity)
############# start of Problem 5a #############
# Fetch the 8.2m-depth data set and load it into a data frame:
DF820 <- read.csv(
  file = "http://llc.stat.purdue.edu/2014/29000/projects/saturn03.820.A.CT_2012_06_PD0.csv"
)
# Parse the first column (the timestamps) into date-times with strptime:
timevec820 <- strptime(x = DF820[, 1], format = "%Y/%m/%d %H:%M:%S")
# Tabulate the gaps between consecutive measurements and keep the most
# frequent one: the most common gap is 3 seconds, and a 3 second gap
# occurs 119805 times at depth 8.2m.
sort(table(diff(timevec820)), decreasing = TRUE)[1L]
# The mean gap between consecutive measurements is 19.5147 seconds
# at depth 8.2m.
mean(x = diff(timevec820))
# Counting the gaps that exceed 15 (seconds): the machine gets stuck
# a total of 10447 times at depth 8.2m.
sum(diff(timevec820) > 15)
# Find the position of the largest gap at depth 8.2m. Either of the
# next two lines does this; the second assignment is the one kept:
t820 <- which(diff(timevec820) == max(diff(timevec820)))
t820 <- which.max(diff(timevec820))
# The longest outage (at depth 8.2m) lasted 128853 seconds:
diff(timevec820)[t820]
# or (equivalently) 1.491354 days, measured between its two endpoints:
difftime(timevec820[t820 + 1], timevec820[t820])
# That outage (when working at depth 8.2m) runs from
# June 17, 2012, 5:23:00 PM, to June 19, 2012, 5:10:33 AM.
timevec820[t820]
timevec820[t820 + 1]
# The ten longest outages (in seconds, at depth 8.2m), largest first:
# 128853 100416 84018 65861 34356 26799 26070 24663 14208 8262
sort(diff(timevec820), decreasing = TRUE)[seq_len(10)]
############# end of Problem 5a #############
#5b Yes, the outages happen at roughly the same times at both depths.
# To recap, the longest outage at depth 2.4m runs from
# June 17, 2012, 5:24:00 PM, to June 19, 2012, 5:08:21 AM.
timevec[t]
timevec[t + 1]
# and the longest outage at depth 8.2m runs from
# June 17, 2012, 5:23:00 PM, to June 19, 2012, 5:10:33 AM.
timevec820[t820]
timevec820[t820 + 1]
#5c Plot of the raw temperature data at depth 8.2m:
plot(DF820$water_temperature)
# Same plot after dropping the outlier with temperature above 3000:
plot(DF820$water_temperature[DF820$water_temperature < 3000])
# and after additionally dropping the outlier with temperature below 7:
plot(
  DF820$water_temperature[
    DF820$water_temperature < 3000 & DF820$water_temperature > 7
  ]
)
#6a Fetch the 13m-depth data set and load it into a data frame:
DF1300 <- read.csv(
  file = "http://llc.stat.purdue.edu/2014/29000/projects/saturn03.1300.R.CT_2012_06_PD0.csv"
)
#6b Average water temperature at depths 2.4m, 8.2m, and 13m
# (in that order). Readings of 500 or more at 2.4m, and of 3000 or
# more at 8.2m, are left out of the averages:
mean(x = DF$water_temperature[DF$water_temperature < 500])
mean(x = DF820$water_temperature[DF820$water_temperature < 3000])
mean(x = DF1300$water_temperature)
# So the average temperature is highest at depth 2.4m.
# This makes intuitive sense, since the water is warmer
# at shallower depths
# (and gets colder at deeper depths).
#7a The average salinity at the 3 depths 2.4m, 8.2m, 13m are
# (respectively) 2.894666, 6.327593, 12.18957
# (at depth 8.2m, salinity readings of 1000 or more are left out):
mean(DF$water_salinity)
mean(DF820$water_salinity[DF820$water_salinity < 1000])
mean(DF1300$water_salinity)
# The variance of the salinity at the 3 depths 2.4m, 8.2m, 13m are
# (respectively) 10.58944, 44.93308, 93.84552
# (the same filter on the 8.2m readings is applied here):
var(DF$water_salinity)
var(DF820$water_salinity[DF820$water_salinity < 1000])
var(DF1300$water_salinity)
#7b Parse the timestamps (first column) of the 13m-depth data:
timevec1300 <- strptime(x = DF1300[, 1], format = "%Y/%m/%d %H:%M:%S")
# and plot the salinity at depth 13m against time:
plot(timevec1300, DF1300$water_salinity)
#7c Parse the start and stop of the window of interest into
# date-times, so that they can be compared against timevec1300:
starttime <- strptime(x = "2012/06/06 00:00:00", format = "%Y/%m/%d %H:%M:%S")
stoptime <- strptime(x = "2012/06/12 23:59:59", format = "%Y/%m/%d %H:%M:%S")
# Plot only the salinity readings taken between starttime and stoptime
# (both endpoints included):
plot(
  timevec1300[timevec1300 >= starttime & timevec1300 <= stoptime],
  DF1300$water_salinity[timevec1300 >= starttime & timevec1300 <= stoptime]
)
# The salinity data appear to go through roughly 14 cycles,
# i.e., roughly two per day. This makes sense, since it seems to
# fit with the fact that the tide comes twice per day,
# which has a significant effect on the salinity.
#8 Keep only the temperature readings below 500, then sort them into
# the bins (10,12], (12,14], (14,16], (16,18].
valid_temp_240 <- DF$water_temperature[DF$water_temperature < 500]
temp_bin_240 <- cut(valid_temp_240, breaks = c(10, 12, 14, 16, 18))
# The counts of the temperature data in each bin are as follows:
tapply(valid_temp_240, temp_bin_240, length)
# and the percentages of the temperature in each category
# (each count divided by the number of retained readings) are:
tapply(valid_temp_240, temp_bin_240, length) / length(valid_temp_240)
#9 The average water temperatures, week by week, were:
# 13.80297, 14.80604, 15.38551, 16.06782
# Cut points for the weeks: time1 opens the first week, and
# time2 through time5 close weeks one through four.
time1 <- strptime("2012/06/01 00:00:00", "%Y/%m/%d %H:%M:%S")
time2 <- strptime("2012/06/07 23:59:59", "%Y/%m/%d %H:%M:%S")
time3 <- strptime("2012/06/14 23:59:59", "%Y/%m/%d %H:%M:%S")
time4 <- strptime("2012/06/21 23:59:59", "%Y/%m/%d %H:%M:%S")
time5 <- strptime("2012/06/28 23:59:59", "%Y/%m/%d %H:%M:%S")
# Drop the temperature readings of 500 or more, assign each remaining
# reading to its week using the cut points, and average within each week
# (na.rm = TRUE is passed through to mean):
tapply(
  DF$water_temperature[DF$water_temperature < 500],
  cut(
    timevec[DF$water_temperature < 500],
    breaks = c(time1, time2, time3, time4, time5)
  ),
  mean,
  na.rm = TRUE
)
# Note: The original problem statement had a small typographical error
# on some of the dates, which is corrected here.
#10 A single tapply over two sets of breaks (salinity split at 12,
# temperature split at 14) gives the following counts:
# salinity > 12 and temperature > 14 --- 387 points
# salinity > 12 and temperature <= 14 --- 4533 points
# salinity <= 12 and temperature > 14 --- 190603 points
# salinity <= 12 and temperature <= 14 --- 7176 points
# Only rows whose temperature reading is below 500 are counted.
keep_temp_240 <- DF$water_temperature < 500
tapply(
  DF$water_temperature[keep_temp_240],
  list(
    cut(DF$water_salinity[keep_temp_240], breaks = c(0, 12, 24)),
    cut(DF$water_temperature[keep_temp_240], breaks = c(0, 14, 18))
  ),
  length
)