# Load the MySQL client interface for R.
library("RMySQL")
# Open a connection to the course database; all queries below go through
# this handle.  NOTE(review): host/username/dbname are hard-coded and no
# password is supplied, so authentication presumably relies on the server's
# host-based configuration -- confirm before reusing elsewhere.
myconnection <- dbConnect(dbDriver("MySQL"), host="mydb.itap.purdue.edu", username="mdw", dbname="mdw")
# 1a. One possibility is to load all pitchers, with any number of
# strikeouts.  There are a lot of these players, so we fetch with n=-1,
# which retrieves every remaining row (a fixed cap like n=10000 would
# silently truncate the result set if it ever grew past the cap).
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.SO) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID;")
mydata <- fetch(myresult, n=-1)
dbClearResult(myresult)  # release the server-side result set
names(mydata) <- c("first", "last", "strikeouts")
# Top 10 career strikeout totals: last 10 rows after an ascending sort.
# na.last=FALSE keeps any NA sums (SUM over an all-NULL group returns
# NULL -> NA) at the front so they cannot crowd out real leaders, matching
# the NA handling used for wild pitches in 1b.
tail(mydata[ order(mydata$strikeouts, na.last=FALSE), ], n=10)
# A different method is to restrict the MySQL query itself: the HAVING
# clause keeps only pitchers with at least 2000 career strikeouts, so that
# we don't have to fetch so many rows into R.
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.SO) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID HAVING SUM(p.SO) >= 2000;")
mydata <- fetch(myresult)
names(mydata) <- c("first", "last", "strikeouts")
# Top 10 strikeout totals: the last 10 rows after an ascending sort.
tail(mydata[ order(mydata$strikeouts), ], n=10)
# 1b. Wild-pitch leaders.  First approach: pull the career wild-pitch sum
# for every pitcher and sort on the R side.
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.WP) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID;")
allwp <- fetch(myresult, n=10000)
names(allwp) <- c("first", "last", "wildpitches")
# Sort ascending and take the final 10 rows.  na.last=FALSE pushes the NA
# totals (pitchers whose WP column is entirely NULL) to the front of the
# ordering so they cannot appear in the top-10 tail.
wp_order <- order(allwp$wildpitches, na.last=FALSE)
tail(allwp[wp_order, ], n=10)
# Second approach: let MySQL do the filtering, keeping only pitchers with
# 150 or more career wild pitches, so far fewer rows reach R.
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.WP) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID HAVING SUM(p.WP) >= 150;")
topwp <- fetch(myresult)
names(topwp) <- c("first", "last", "wildpitches")
# Same top-10 extraction on the pre-filtered data.
tail(topwp[ order(topwp$wildpitches), ], n=10)
# 1c. Career IPOuts (outs recorded) per pitcher.  First approach: fetch
# every pitcher and sort in R.  n=-1 retrieves all rows rather than
# capping at an arbitrary 10000, which could silently truncate.
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.IPOuts) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID;")
mydata <- fetch(myresult, n=-1)
dbClearResult(myresult)  # release the server-side result set
names(mydata) <- c("first", "last", "IPOuts")
# As with wild pitches in 1b, SUM() over an all-NULL group yields NA, so
# na.last=FALSE keeps any NA totals out of the top-10 tail.
tail(mydata[ order(mydata$IPOuts, na.last=FALSE), ], n=10)
# Second approach: restrict the query itself to pitchers with 12000 or
# more career IPOuts, so only the leaders are fetched.
myresult <- dbSendQuery(myconnection, "SELECT m.nameFirst, m.nameLast, SUM(p.IPOuts) FROM Master m INNER JOIN Pitching p ON m.playerID = p.playerID GROUP BY m.playerID HAVING SUM(p.IPOuts) >= 12000;")
mydata <- fetch(myresult)
names(mydata) <- c("first", "last", "IPOuts")
tail(mydata[ order(mydata$IPOuts), ], n=10)
# 2a. The New York Yankees have the most home runs altogether.
# One row per team-season: team name, home runs that season, and year.
myresult <- dbSendQuery(myconnection, "SELECT t.name, SUM(b.HR), t.yearID FROM Batting b INNER JOIN Teams t ON b.yearID=t.yearID AND b.teamID=t.teamID GROUP BY t.name, t.yearID;")
mydata <- fetch(myresult, n=-1)  # n=-1 fetches all rows (10000 could truncate)
dbClearResult(myresult)  # release the server-side result set
names(mydata) <- c("team", "homeruns", "year")
# Sum across seasons per team, sort descending, and take the single
# largest.  TRUE is spelled out (T is an ordinary, reassignable variable)
# and [1] replaces the redundant [1:1].
sort(tapply(mydata$homeruns, mydata$team, sum), decreasing=TRUE)[1]
# 2b. We start with the same data from question 2a.
# Then we determine the number of active years for each team: range()
# gives each team's (first, last) season, diff() their difference, +1 makes
# the span inclusive.  NOTE(review): this counts the span including any
# inactive gap years -- presumably intended here.
years <- sapply(tapply(mydata$year, mydata$team, range), diff)+1
# and the number of home runs per team
homeruns <- tapply(mydata$homeruns, mydata$team, sum)
# and finally we divide, and then sort, and extract the largest rate:
sort(homeruns / years, decreasing=TRUE)[1]
# 3a. California is the birthplace of 2133 players, the most of any State.
# Count players grouped by birth state on the database side.
stateresult <- dbSendQuery(myconnection, "SELECT m.birthState, COUNT(*) FROM Master m GROUP BY m.birthState;")
births <- fetch(stateresult)
names(births) <- c("state", "peoplecount")
# Sorting ascending puts the most common birth state in the final row,
# which tail() then extracts.
sorted_births <- births[order(births$peoplecount), ]
tail(sorted_births, n=1)
# 3b. Altogether, 6.17 percent use both hands for batting; 25.87 percent
# use the left hand; 61.30 percent use the right hand.
myresult <- dbSendQuery(myconnection, "SELECT m.bats, COUNT(*) FROM Master m GROUP BY m.bats;")
bats <- fetch(myresult)
# Show the raw counts per batting hand,
bats
# then convert the counts in column 2 into proportions of all players.
bats[ ,2]/sum(bats[ ,2])
# Now we load the RCurl library
library(RCurl)
# 4a. Download each of the 11 pages of the national-universities ranking.
myURLdata <- sapply(1:11, function(x) getURL(paste0("http://colleges.usnews.rankingsandreviews.com/best-colleges/rankings/national-universities/data/page+", x)))
# Cache each page in its own local HTML file.  A plain for loop is used
# because this is purely a side effect.  writeLines() is given a file
# *path*, so it opens and closes the connection itself -- the original
# follow-up loop that built a fresh connection with file() just to close()
# it was a no-op and has been dropped.
for (x in 1:11) {
  writeLines(myURLdata[x], paste0("collegedata", x, ".html"))
}
# Now we load the XML library:
library(XML)
# We parse the HTML in each cached file into a document tree.
# (sapply over parse trees returns a list; indexed as mydocs[[x]] below.)
mydocs <- sapply(1:11, function(x) htmlParse( paste("collegedata", x, ".html", sep="")))
# Now we extract the table cells holding total enrollment.  The trailing
# space inside the class attribute is part of the page's markup, so it
# must stay in the XPath string.
myresults <- sapply(1:11, function(x) xpathSApply(mydocs[[x]], "//*/td[@class='column-even table-column-even total_all_students ']/child::text()", xmlValue))
# and we unlist the per-page results into one character vector:
v <- unlist(myresults)
# Then we remove the spaces, the new line characters, the commas, and change each "N/A" to NA
v <- gsub(" *", "", v)
v <- gsub("\n", "", v)
v <- gsub(",", "", v)
# gsub coerces the NA replacement to the string "NA", which as.numeric()
# then turns into a true NA in the numeric vector.
v <- gsub("N/A", NA, v)
v <- as.numeric(v)
v
# Finally, we only want the odd-numbered entries -- presumably each school
# contributes two text nodes per cell, so every other value is redundant.
# TODO(review): confirm against the saved pages.
enrollments <- v[seq(1,length(v),by=2)]
enrollments
# Extract the university names from each of the 11 parsed pages.
namechunks <- lapply(1:11, function(pg) xpathSApply(mydocs[[pg]], "//*/a[@class='school-name']/child::text()", xmlValue))
mynames <- unlist(namechunks)
# Label each enrollment figure with its university's name.
names(enrollments) <- mynames
# The labelled enrollments vector, ready to sort:
enrollments
# The 10 largest universities, according to enrollment, are:
sort(enrollments, decreasing=TRUE)[1:10]
# 4b. Extract each university's "City, ST" location string.
stateresults <- sapply(1:11, function(x) xpathSApply(mydocs[[x]], "//*/p[@class='location']/child::text()", xmlValue))
mystates <- unlist(stateresults)
# There are various ways to get the state abbreviations.
# One way is to delete everything before the final comma, the comma itself,
# and the following space: any run of alphabetic characters, dashes,
# spaces, or periods, followed by ", ", is replaced with nothing.
# The original pattern used an unescaped "." inside the alternation, which
# matches ANY character (not just a literal period); "\\." makes the
# pattern match its documented intent without changing the result on
# well-formed "City, ST" strings.
abbreviations <- gsub("([[:alpha:]]|-| |\\.)*, ", "", mystates)
# The number of ranked universities per state is then found with a table:
table(abbreviations)
# 4c. Now we extract the tuition data from the odd table column:
tuitionresults <- sapply(1:11, function(x) xpathSApply(mydocs[[x]], "//*/td[@class='column-odd table-column-odd search_tuition_display ']/child::text()", xmlValue))
# and we unlist the per-page results into one character vector
v <- unlist(tuitionresults)
# then we remove spaces and new line characters and commas
v <- gsub(" ", "", v)
v <- gsub("\n", "", v)
v <- gsub(",", "", v)
# Some cells apparently carry extra text before the final dollar figure;
# this removes every letter, digit, dash, dollar sign, or colon up to and
# including the LAST "$" (greedy match), leaving only the trailing amount.
v <- gsub("([[:alpha:]]|[0-9]|-|\\$|:)*\\$", "", v)
# gsub coerces the NA replacement to the string "NA", which as.numeric()
# then turns into a true NA.
v <- gsub("N/A", NA, v)
v <- as.numeric(v)
v
# We only want the odd-numbered entries (each school presumably yields two
# text nodes, as with the enrollment column -- TODO confirm):
tuition <- v[seq(1,length(v),by=2)]
# We use the names of the universities as the names of the tuition vector
names(tuition) <- mynames
# Now we have a tuition vector that we are ready to sort:
tuition
# The 10 most expensive universities, according to tuition, are:
sort(tuition, decreasing=T)[1:10]
# 5a. Parse the shared iTunes library and flatten the text of every node
# nested three <dict> levels deep (the per-track key/value entries).
iTunesDoc <- xmlParse("/data/public/iTunes/iTunesMusicLibrary.xml")
iTunesvec <- xpathSApply(iTunesDoc, "//*/dict/dict/dict/child::*/child::text()", xmlValue)
# In this flattened plist layout a key name is immediately followed by its
# value, so the element after each "Artist" entry is that track's artist.
# Tabulate those values and keep the 10 most frequent artists.
# (decreasing=TRUE is spelled out: T is an ordinary, reassignable variable.)
sort(table(iTunesvec[which(iTunesvec == "Artist") + 1]), decreasing=TRUE)[1:10]
# 5b. The same key-then-value trick, looking for the genre instead of the
# artist, keeping the 3 most common genres.
sort(table(iTunesvec[which(iTunesvec == "Genre") + 1]), decreasing=TRUE)[1:3]
# 6. 2012 presidential results scraped from politico.com, one page per
# state.  Build the URL slugs: lowercase state names, dashes for spaces.
statenames <- gsub(" ", "-", tolower(row.names(state.x77)))
mydocs <- sapply(statenames, function(x) htmlParse(paste("http://www.politico.com/2012-election/results/president/", x, sep="")))
# Candidate names for each state.  Stored in "candidates" rather than
# "names" so the base::names() function is not shadowed by a data variable.
candidates <- sapply(1:50, function(x) xpathSApply(mydocs[[x]], "//*/th[@scope='row' and @class='results-candidate']/child::text()", xmlValue))
# Popular-vote counts from every row (summary and county) of every state.
votes <- sapply(1:50, function(x) xpathSApply(mydocs[[x]], "//*/td[@class='results-popular']/child::text()", xmlValue))
# Clean the candidate names: strip leading and trailing spaces, then drop
# any trailing incumbent marker such as " (i)".
candidates <- sapply(1:50, function(x) gsub("^ *", "", candidates[[x]]))
candidates <- sapply(1:50, function(x) gsub(" *$", "", candidates[[x]]))
candidates <- sapply(1:50, function(x) gsub(" *\\(i*\\)", "", candidates[[x]]))
candidates
# Similarly, remove the spaces and thousands-separator commas from the
# votes, and convert them to numeric values:
votes <- sapply(1:50, function(x) gsub(" *", "", votes[[x]]))
votes <- sapply(1:50, function(x) gsub(",", "", votes[[x]]))
votes <- sapply(1:50, function(x) as.numeric(votes[[x]])))
votes
# Number of distinct candidates in each state.  nlevels(factor(x)) is
# equivalent to length(levels(as.factor(x))).
candidatecounts <- sapply(1:50, function(x) nlevels(factor(candidates[[x]])))
# So the number of counties in each state, plus the summary row, is:
sapply(1:50, function(x) length(votes[[x]]))/candidatecounts
# Each page lists statewide summary rows first (one per candidate), so we
# drop the first candidatecounts[x] entries of both vectors to avoid
# double-counting the summaries alongside the county data.
# (In Alaska this removes every row, because Alaska only publishes the
# summary.)
countycandidatenames <- sapply(1:50, function(x) candidates[[x]][-(1:candidatecounts[x])])
countyvotes <- sapply(1:50, function(x) votes[[x]][-(1:candidatecounts[x])])
# Finally, sum the county votes by candidate within each state:
bigresults <- sapply(1:50, function(x) tapply(countyvotes[[x]], countycandidatenames[[x]], sum))
names(bigresults) <- statenames
bigresults