#1.
# Build `v`: the weekly chart dates as "YYYY-MM-DD" strings, one every seven
# days from 1958-08-09 through 2015-11-14 inclusive.
# Date arithmetic takes care of month lengths and leap years, so there is no
# hand-built calendar, no hard-coded slice offsets, and no global variable
# called `months` masking base::months().
first_week <- as.Date("1958-08-09")
last_week <- as.Date("2015-11-14")
v <- format(seq(first_week, last_week, by = "week"), "%Y-%m-%d")
#2.
# One wget shell command per chart week; each element is named by its week.
mycommands <- vapply(
  v,
  function(week) paste0("wget www.billboard.com/charts/hot-100/", week),
  character(1)
)
# Run every command through the shell; the values returned by system() are
# collected (and printed when run at top level).
sapply(X = mycommands, FUN = system)
#3.
# Install the XML package only when it is not already available; an
# unconditional install.packages() would re-download it on every run of the
# script. Then attach it so htmlParse(), xpathSApply() and xmlValue() are
# in scope.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)
# Trial run on one chart page: parse "1958-08-09" (presumably the file saved
# by the wget step) and pull out its song titles and artist names.
mydoc <- htmlParse("1958-08-09")
# Song titles are the <h2> elements under each div of class "row-title".
# They are shown once as extracted and once with surrounding whitespace removed.
mysongs <- xpathSApply(doc = mydoc, path = "//*/article/*/div[@class='row-title']/h2", fun = xmlValue)
mysongs
mysongs <- sub(pattern = "\\s+$", replacement = "", x = sub(pattern = "^\\s+", replacement = "", x = mysongs))
mysongs
# Artist names are the <h3> elements under the same divs; same treatment.
myartists <- xpathSApply(doc = mydoc, path = "//*/article/*/div[@class='row-title']/h3", fun = xmlValue)
myartists
myartists <- sub(pattern = "\\s+$", replacement = "", x = sub(pattern = "^\\s+", replacement = "", x = myartists))
myartists
#4.
songfunction <- function(x) {
  # Parse the chart page `x` and return its song titles (the <h2> element of
  # every div with class "row-title"), stripped of leading and trailing
  # whitespace.
  titles <- xpathSApply(htmlParse(x), "//*/article/*/div[@class='row-title']/h2", xmlValue)
  left_trimmed <- sub("^\\s+", "", titles)
  sub("\\s+$", "", left_trimmed)
}
artistfunction <- function(x) {
  # Parse the chart page `x` and return its artist names (the <h3> element of
  # every div with class "row-title"), stripped of leading and trailing
  # whitespace.
  artists <- xpathSApply(htmlParse(x), "//*/article/*/div[@class='row-title']/h3", xmlValue)
  left_trimmed <- sub("^\\s+", "", artists)
  sub("\\s+$", "", left_trimmed)
}
#5.
# Extract the songs and the artists for every week in v.
# simplify = FALSE makes sapply() always return a list with one element per
# week, named by the week. Without it, sapply() collapses the result into a
# matrix whenever every week yields the same number of entries, and the
# per-week length bookkeeping below would then be wrong.
mysonglist <- sapply(v, songfunction, simplify = FALSE)
myartistlist <- sapply(v, artistfunction, simplify = FALSE)
# Note: Some of the songs and artists are missing from the Billboard charts.
# There is nothing we can do about that!
# We can check to see which are missing!
class(mysonglist)
length(mysonglist)
class(myartistlist)
length(myartistlist)
# Number of entries found in each week (named by week), computed once per list:
mylengths <- vapply(myartistlist, length, integer(1))
mysonglengths <- vapply(mysonglist, length, integer(1))
# These are the weeks with missing data (anything other than 100 entries):
mylengths[mylengths != 100]
mysonglengths[mysonglengths != 100]
# Repeat each week's label once per artist entry found in that week, so the
# labels line up element-by-element with the flattened artist vector.
myweeks <- rep(names(mylengths), times = mylengths)
length(myweeks)
alltheartists <- unlist(myartistlist)
length(alltheartists)
allthesongs <- unlist(mysonglist)
length(allthesongs)
# Because some of the weeks are missing some songs, it is helpful to record
# each song's chart position as well.
# Here is how to do that for one week (which happens to be missing a song):
mydoc <- htmlParse("1961-09-16")
# The position is the text of each <span> with class "this-week".
mypositions <- xpathSApply(doc = mydoc, path = "//*/span[@class='this-week']", fun = xmlValue)
as.integer(mypositions)
# Chart positions for a given week: parse the page `x` and return, as
# integers, the text of every <span> with class "this-week".
positionfunction <- function(x) {
  position_text <- xpathSApply(htmlParse(x), "//*/span[@class='this-week']", xmlValue)
  as.integer(position_text)
}
# Here are all of the song positions across all of the weeks.
# simplify = FALSE keeps the result a list with one element per week (named
# by the week); plain sapply() would return a matrix if every week had the
# same number of positions, and unlist() would then leave that matrix as is.
mypositionlist <- sapply(v, positionfunction, simplify = FALSE)
allthepositions <- unlist(mypositionlist)
# Assemble everything into one data frame, one row per chart entry, with the
# columns artist, song, week and position.
myBB <- data.frame(
  artist = alltheartists,
  song = allthesongs,
  week = myweeks,
  position = allthepositions
)
# Show the length of each column.
length(myBB[["artist"]])
length(myBB[["song"]])
length(myBB[["week"]])
length(myBB[["position"]])
# The remaining questions are answered from the myBB data frame.
#6a.
# A first attempt is to count how often each song title appears,
head(sort(table(myBB[["song"]]), decreasing = TRUE))
# but some common song titles were sung by more than one person,
# so it is better to take the artist name into account too.
head(sort(table(with(myBB, paste(song, "by", artist))), decreasing = TRUE))
#6b.
# The same count, restricted to the entries at position #1.
head(sort(table(with(myBB, paste(song, "by", artist)[position == 1])), decreasing = TRUE), n = 8)
#6c.
# The same count, restricted to the entries at position <= 10.
head(sort(table(with(myBB, paste(song, "by", artist)[position <= 10])), decreasing = TRUE))
# All the answers from question 6 agree with those in Wikipedia:
# https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_chart_achievements_and_milestones#Most_total_weeks_on_the_Hot_100
#7a.
# Number of distinct songs per artist.
# The results depend on exact matches of the artist names: if an artist is
# listed differently, or together with another artist, those entries are
# counted under a separate name. So our results here are slightly different
# from the Wikipedia page mentioned above.
head(sort(with(myBB, tapply(song, artist, function(s) length(unique(s)))), decreasing = TRUE))
#7b.
# Number of distinct #1 songs per artist.
# The same kinds of differences with Wikipedia apply here.
# For instance, Wikipedia shows 20 songs at #1 for the Beatles,
# but we only have 19 here. The one we are missing is "Get Back",
# because it is listed as having artist "The Beatles With Billy Preston".
head(sort(with(myBB, tapply(song[position == 1], artist[position == 1], function(s) length(unique(s)))), decreasing = TRUE))
#7c.
# The answer depends on whether an artist may be counted two or more times in
# the same week, e.g., with different songs.
# If every appearance counts, including repetitions for multiple songs in one
# week, then this is easy:
head(sort(table(myBB[["artist"]]), decreasing = TRUE))
# If an artist should be counted at most once per week, we work a little
# harder and count distinct weeks instead.
# This has, for instance, a big impact on Taylor Swift, who has been in the
# chart a lot lately and has multiple songs in the chart at once, but has not
# been on the charts for as long as other artists.
# So she will not be represented as strongly with this method.
# Each count here is at most the corresponding count from the previous
# method, because each week is counted at most once per artist.
head(sort(with(myBB, tapply(week, artist, function(wk) length(unique(wk)))), decreasing = TRUE))
#8.
# For each title that reached #1, count how many distinct artists took it there.
# Without more research we cannot be sure that entries sharing a title are
# actually the same song.
head(sort(with(myBB, tapply(artist[position == 1], song[position == 1], function(a) length(unique(a)))), decreasing = TRUE), n = 26)
#9.
# When working with a vector of years, the challenging thing is to find the
# longest consecutive string of years.
# First we get the years: split each week "YYYY-MM-DD" on "-" and keep the
# first field.
M <- matrix(unlist(strsplit(as.character(myBB$week), "-")), ncol = 3, byrow = TRUE)
allyears <- M[, 1]
length(allyears)
# Longest run of consecutive years in `years`, measured as the number of
# year-to-year steps (a streak of k + 1 calendar years gives k; no streak
# gives 0). rle() on the gaps between the sorted distinct years finds the
# runs; only runs whose gap is exactly 1 are consecutive years. Taking the
# longest run of ANY repeated gap would, for example, count 1964, 1966, 1968
# as a streak. The 0 guards against max() of an empty vector.
longest_consecutive_run <- function(years) {
  gaps <- rle(diff(sort(unique(as.numeric(years)))))
  max(c(0, gaps$lengths[gaps$values == 1]))
}
# As an example, here are the years in which the Beatles were on the charts,
w <- as.numeric(unique(allyears[myBB$artist == "The Beatles"]))
# and their longest run of consecutive years.
longest_consecutive_run(w)
# Now we apply this function to the years in which each artist had a number 1
# hit, grouping by the artist.
head(sort(tapply(as.numeric(allyears)[myBB$position == 1], myBB$artist[myBB$position == 1],
                 longest_consecutive_run), decreasing = TRUE))
#10.
# Distinct #1 songs per artist per year, with groups labelled "<artist> in <year>".
# The Beatles managed to get 6 songs at #1 in 1964, and also 5 songs at #1 in 1965. Remarkable!
head(
  sort(
    tapply(
      myBB$song[myBB$position == 1],
      paste(myBB$artist[myBB$position == 1], "in", allyears[myBB$position == 1]),
      function(titles) length(unique(titles))
    ),
    decreasing = TRUE
  ),
  n = 14
)