library("ncdf4")
myfunction <- function( mystation, mylength, mymonth, myyear ) {
mystring <- paste("http://amb6400b.stccmop.org:8080/thredds/dodsC/preliminary_data/", sprintf("saturn%02d", mystation),"/", sprintf("saturn%02d", mystation), ".", mylength , ".A.CT/", myyear, sprintf("%02d",mymonth), ".nc", sep="")
mync <- nc_open(mystring)
tempDF <- as.data.frame( lapply(1:mync$nvars, function(j) {ncvar_get(mync, mync$var[[j]])} ))
names(tempDF) <- sapply(1:mync$nvars, function(j) mync$var[[j]]$name)
tempDF$time <- ncvar_get(mync, "time")
tempDF$length <- mylength
tempDF$year <- myyear
tempDF$month <- mymonth
tempDF$days <- as.POSIXlt(tempDF$time, tz="PST8PDT", origin = "1970-01-01")$mday
nc_close(mync)
return(tempDF)
}
#1a. Create a vector corresponding to the 84 months from Nov 2009 through Oct 2016, and create a second vector containing the corresponding years.
mymonths <- as.integer(format(seq(as.Date("2009/11/01"), by="month", length=84), "%m"))
myyears <- as.integer(format(seq(as.Date("2009/11/01"), by="month", length=84), "%Y"))
#1b. Use these vectors in the context of an mapply function, to obtain the 84 months of data about the water temperature, salinity, and electrical conductivity at the SATURN03 station at the depth 2.4m. The result should be a list that contains 84 data.frames.
mylist <- mapply(myfunction, 3, 240, mymonths, myyears, SIMPLIFY=FALSE)
#1c. Use the sapply function to verify that all 84 data.frames have the variable names in the same order.
sum(colnames(mylist[[1]]) != sapply(mylist, colnames))
#1d. Use the do.call function to rbind these 84 data.frames into one data.frame called bigDF24. Check that the resulting data.frame has a little more than 7 million observations.
bigDF24 <- do.call(rbind, mylist)
#1e. Repeat the steps above, to gather the data about these same 3 variables from depth 8.2m into one data.frame called bigDF82 (which will have a little less than 6 million observations).
mylist <- mapply(myfunction, 3, 820, mymonths, myyears, SIMPLIFY=FALSE)
sum(colnames(mylist[[1]]) != sapply(mylist, colnames))
bigDF82 <- do.call(rbind, mylist)
#2a. Restricting attention to the 2.4m data, what is the longest time period for which no data is available, i.e., what is the longest time period in which no data is collected?
mymax <- which.max(diff(as.POSIXct(bigDF24$time, "%Y/%m/%d %H:%M:%S", origin="1970-01-01")))
# The longest time period is 21.09657 days
as.POSIXct(bigDF24$time, "%Y/%m/%d %H:%M:%S", origin="1970-01-01")[mymax+1] - as.POSIXct(bigDF24$time, "%Y/%m/%d %H:%M:%S", origin="1970-01-01")[mymax]
#2b. This occurs from roughly Feb 4, 2014 to Feb 25, 2014
# We did not take the Pacific time zone into account yet;
# we will do that in the code in question 3.
as.POSIXct(bigDF24$time, "%Y/%m/%d %H:%M:%S", origin="1970-01-01")[mymax:(mymax+1)]
#3a. Find the daily mean values for water_temperature at depth 2.4m.
meantemp <- tapply(bigDF24$water_temperature, format(as.POSIXct(bigDF24$time-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m/%d"), mean)
#3b. Plot the resulting daily mean values for water_temperature at depth 2.4m.
plot(as.Date(names(meantemp)),meantemp)
#3c. Re-consider 3a and 3b for water_electrical_conductivity, and then also for water_salinity.
meanelec <- tapply(bigDF24$water_electrical_conductivity, format(as.POSIXct(bigDF24$time-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m/%d"), mean)
plot(as.Date(names(meanelec)),meanelec)
# The electrical conductivity was erroneous for
# much of the data in the previous plot, so we fix it here.
plot(as.Date(names(meanelec)[meanelec > -100]),meanelec[meanelec > -100])
meansal <- tapply(bigDF24$water_salinity, format(as.POSIXct(bigDF24$time-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m/%d"), mean)
plot(as.Date(names(meansal)),meansal)
#4a/4b. Decide what constitutes a false reading, i.e., data that is probably an outlier. What are your criteria for having a false reading?
# It looks like the temperature should always be between (roughly) 0 to 30 degrees,
# and there are four false readings for the water temperature.
sum( (bigDF24$water_temperature < 0) | (bigDF24$water_temperature > 30), na.rm=T )
baddates <- (bigDF24$water_temperature < 0) | (bigDF24$water_temperature > 30)
tapply(bigDF24$water_temperature[baddates], format(as.POSIXct(bigDF24$time[baddates]-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m"), length)
# It initially looks like the salinity should always be between (roughly) 0 to 25,
# and that would imply that there are 257579 false readings for the salinity
sum( (bigDF24$water_salinity < 0) | (bigDF24$water_salinity > 25), na.rm=T )
baddates <- (bigDF24$water_salinity < 0) | (bigDF24$water_salinity > 25)
tapply(bigDF24$water_salinity[baddates], format(as.POSIXct(bigDF24$time[baddates]-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m"), length)
# but then, if we consider the rest of the salinity values,
# they are all less than 32,
# so the rest of the salinity values are probably OK after all.
summary(bigDF24$water_salinity[baddates])
# Perhaps all of the electrical conductivity values below -100 are faulty.
# Indeed, we can check that the electrical conductivity values
# should be nonnegative.
# Probably the values larger than 30 are erroneous.
sum( (bigDF24$water_electrical_conductivity < 0) | (bigDF24$water_electrical_conductivity > 30), na.rm=T )
baddates <- (bigDF24$water_electrical_conductivity < 0) | (bigDF24$water_electrical_conductivity > 30)
tapply(bigDF24$water_electrical_conductivity[baddates], format(as.POSIXct(bigDF24$time[baddates]-60*60*8, "%Y/%m/%d", origin="1970-01-01"), tz="PST", "%Y/%m"), length)
#5.
v <- seq(as.Date("1958-08-09"), as.Date("2016-10-08"), by="week")
mycommands <- sapply(v, function(x)
paste("curl www.billboard.com/charts/hot-100/", x, " >", x, sep="") )
sapply(mycommands, system)
#6.
myyears <- c(rep(2009:2015,each=12),rep(2016,times=6))
mymonths <- c(rep(sprintf("%02d",1:12),times=7), sprintf("%02d",1:6))
myfilestodownload <- paste("curl s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_", myyears, "-", mymonths, ".csv >/scratch/mentors/mdw/", myyears, "-", mymonths, ".csv", sep="")
sapply(myfilestodownload, system, ignore.stderr=TRUE)
#7a. On which day did the most taxi cab rides occur?
#If a ride goes past midnight, use the start of the ride for the date of the ride.
# Solution: 849414 rides occurred on 2010-09-19
system("cat /scratch/mentors/mdw/20*.csv | grep -v ickup | awk -F [,\\ ] '{a[$2] += 1} END {for (i in a) print a[i], i}' | sort -n >/scratch/mentors/mdw/resultfile7a.txt")
myDFquestion7a <- read.table("/scratch/mentors/mdw/resultfile7a.txt", sep=' ')
dim(myDFquestion7a)
tail(myDFquestion7a)
#7b. For each day, determine the distribution of the number of passengers.
#Your output should allow you to answer questions like the following:
#On January 1, 2016, how many rides had 1 passenger? 2 passengers? 3 passengers? Etc.?
# Solution:
# 219590 rides on 2016-01-01 had 1 passenger
# 63213 rides on 2016-01-01 had 2 passengers
# 19363 rides on 2016-01-01 had 3 passengers
# etc...
system("cat /scratch/mentors/mdw/20*.csv | grep -v ickup | awk -F [,\\ ] '{a[$2\" \"$6] += 1} END {for (i in a) print a[i], i}' | sort -n >/scratch/mentors/mdw/resultfile7b.txt")
myDFquestion7b <- read.table("/scratch/mentors/mdw/resultfile7b.txt", sep=' ')
dim(myDFquestion7b)
tail(myDFquestion7b)
myDFquestion7b[myDFquestion7b$V2 == "2016-01-01", ]
#8a. For each day, determine the average distance of a taxi cab ride.
system("cat /scratch/mentors/mdw/20*.csv | grep -v ickup | awk -F [,\\ ] '{a[$2] += 1; b[$2] += $7} END {for (i in a) print a[i], b[i], i}' | sort -n > /scratch/mentors/mdw/resultfile8a.txt")
myDFquestion8a <- read.table("/scratch/mentors/mdw/resultfile8a.txt", sep=' ')
dim(myDFquestion8a)
v <- myDFquestion8a$V2/myDFquestion8a$V1
names(v) <- myDFquestion8a$V3
v
#8b. For each day, determine the average number of passengers.
system("cat /scratch/mentors/mdw/20*.csv | grep -v ickup | awk -F [,\\ ] '{a[$2] += 1; b[$2] += $6} END {for (i in a) print a[i], b[i], i}' | sort -n > /scratch/mentors/mdw/resultfile8b.txt")
myDFquestion8b <- read.table("/scratch/mentors/mdw/resultfile8b.txt", sep=' ')
dim(myDFquestion8b)
w <- myDFquestion8b$V2/myDFquestion8b$V1
names(w) <- myDFquestion8b$V3
w
#9a. On which type of day (Sun, Mon, ..., Sat) is the average distance of a ride the longest?
# Solution: On Friday, the average is 6.130944
myDFquestion8a$V4 <- c("Sunday", weekdays(as.Date(myDFquestion8a$V3[-1])))
sort(tapply( myDFquestion8a$V2, myDFquestion8a$V4, sum ) / tapply( myDFquestion8a$V1, myDFquestion8a$V4, sum ))
#9b. On which type of day (Sun, Mon, ..., Sat) is the average number of passengers in a car the largest?
# Solution: On Saturday, the average is 1.769277
myDFquestion8b$V4 <- c("Sunday", weekdays(as.Date(myDFquestion8b$V3[-1])))
sort(tapply( myDFquestion8b$V2, myDFquestion8b$V4, sum ) / tapply( myDFquestion8b$V1, myDFquestion8b$V4, sum ))