Rcode: Bike Sharing Demand


Useful Packages

lubridate can be used to deal with date and time; dplyr can be used to manipulate large data frame. Suggest to take a look of an online tutorial or youtube video.

library(lubridate) 
library(ggplot2)
library(dplyr)        

Load the Data into R

train = read.csv("train.csv")
test = read.csv("test.csv")

dim(train)
dim(test)

names(train)
names(test)

summary(train)
summary(test)

## The lubridate package can help us to extract useful 
## information from "datetime". 
train[1,1]
year(train[1,1])
month(train[1,1])
wday(train[1,1])      ## Start with Sunday and end with Monday

Time-Range for Training and Test

The training set consists of data from each month day 1 to 19. The total number of obs for each month (19 days) should be 19×24=456. So apparently, some hours are missing.

for (i_year in unique(year(train$datetime))){
  for (i_month in unique(month(train$datetime))){
    cat("Year: ", i_year, "\tMonth: ", i_month, "\n")
    mySubset = train[year(train$datetime)==i_year & month(train$datetime)==i_month, ]
    cat("Days: ", range(day(mySubset$datetime)), "\t total obs: ", dim(mySubset)[1], "\n")
  }    
}

The test set consists of data from each month day 20 to the end of the month.

for (i_year in unique(year(test$datetime))){
  for (i_month in unique(month(test$datetime))){
    cat("Year: ", i_year, "\tMonth: ", i_month, "\n")
    mySubset = test[year(test$datetime)==i_year & month(test$datetime)==i_month, ]
    cat("Days: ", range(day(mySubset$datetime)), "\t total obs: ", dim(mySubset)[1], "\n")
  }    
}

Exploratory Data Analysis (EDA)

In addition to histograms/scatter plots, which provide graphical displays of Y vs X1 (one predictor), we can also provide graphical displays of Y vs X1 and X2.

library(scales)

## Add some new features, such as hour of the day, time, day
train$hour  = hour(train$datetime)
train$times = as.POSIXct(strftime(ymd_hms(train$datetime), 
             format="%H:%M:%S"), format="%H:%M:%S")
train$jitter_times = train$times + minutes(round(runif(nrow(train),min=0,max=59)))

train$day = wday(train$datetime, label=TRUE)

## In lubridate, the week starts with Sunday. 
## Using the next command, we have the week to begin on 
## Monday (coded as 1) and end on Sunday (coded as 7)

train$day = ifelse(wday(train$datetime)==1, 7, wday(train$datetime)-1)
train$day = as.factor(train$day)
levels(train$day)=c("Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun")

x_axis = "jitter_times"
y_axis = "count"
color  = "temp" 

ggplot(train[train$workingday==1,], 
  aes_string(x=x_axis, y=y_axis, color=color)) +
  geom_point(position=position_jitter(w=0.0, h=0.4)) +
  theme_light(base_size=20) + xlab("Hour of the Day") +
  scale_x_datetime(breaks = date_breaks("4 hours"), 
      labels=date_format("%I:%M %p")) + 
  ylab("Number of Bike Rentals") +
  scale_colour_gradientn("Temp (°C)", 
       colours=c("#5e4fa2", "#3288bd", "#66c2a5", 
                 "#abdda4", "#e6f598", "#fee08b", 
                 "#fdae61", "#f46d43", "#d53e4f", 
                 "#9e0142")) +
  ggtitle("Bike Rentals on Workdays") +
  theme(plot.title=element_text(size=18))

ggplot(train[train$workingday==0,], 
  aes_string(x=x_axis, y=y_axis, color=color)) +
  geom_point(position=position_jitter(w=0.0, h=0.4)) +
  theme_light(base_size=20) +
  xlab("Hour of the Day") +
  scale_x_datetime(breaks = date_breaks("4 hours"), 
       labels=date_format("%I:%M %p")) + 
  ylab("Number of Bike Rentals") +
  scale_colour_gradientn("Temp (°C)", 
        colours=c("#5e4fa2", "#3288bd", "#66c2a5", 
                  "#abdda4", "#e6f598", "#fee08b", 
                  "#fdae61", "#f46d43", "#d53e4f", 
                  "#9e0142")) +
  ggtitle("Bike Rentals on Weekends/Holidays") +
  theme(plot.title=element_text(size=18))

You can change “color” to other numerical variables

color  = "windspeed" 
ggplot(train[train$workingday==0,], 
  aes_string(x=x_axis, y=y_axis, color=color)) +
  geom_point(position=position_jitter(w=0.0, h=0.4)) +
  theme_light(base_size=20) +
  xlab("Hour of the Day") +
  scale_x_datetime(breaks = date_breaks("4 hours"), 
                 labels=date_format("%I:%M %p")) + 
  ylab("Number of Bike Rentals") +
  scale_colour_gradientn("Windspeed", 
            colours=c("#5e4fa2", "#3288bd", "#66c2a5", 
                      "#abdda4", "#e6f598", "#fee08b", 
                      "#fdae61", "#f46d43", "#d53e4f", 
                      "#9e0142")) +
  ggtitle("Bike Rentals on Workdays") +
  theme(plot.title=element_text(size=18))

Display the rental for X1-by-X2, where X1: day (of the week) and X2: hour. You can replace X1 and/or X2 by other categorical variables such as month, year, season, etc. First calculate the average count for each day/time, store in a dataframe and then draw a heatmap using ggplot

hourly = group_by(train, day, hour)
day_hour_counts = summarise(hourly, count=mean(count))
day_hour_counts[1:5,]

ggplot(day_hour_counts, aes(x = hour, y = day)) + 
  geom_tile(aes(fill = count)) + 
  scale_fill_gradient(name="Average Counts", low="white", high="green") + 
  theme(axis.title.y = element_blank())

Display the rental for X1-by-X2, where X1: season and X2: hour.

hourly = group_by(train, season, hour)
day_hour_counts = summarise(hourly, count=mean(train$count))

ggplot(day_hour_counts, aes(x = hour, y = season)) + 
  geom_tile(aes(fill = count)) + 
  scale_fill_gradient(name="Average Counts", low="white", high="green") +
  scale_y_continuous(breaks=c(1, 2, 3, 4),
            labels=c("Spring", "Summer", "Fall", "Winter")) + 
  theme(axis.title.y = element_blank())

Grouped Bar Plot.

ggplot(day_hour_counts, 
  aes(x=hour, y=count)) +
  geom_bar(aes(fill = as.factor(season)), 
            position = "dodge", stat="identity")

hourly = group_by(train, workingday, hour)
workingday_hour_counts = summarise(hourly, count=mean(train$count))
ggplot(workingday_hour_counts, aes(x=hour, y=count)) +
  geom_bar(aes(fill = as.factor(workingday)), position = "dodge", stat="identity")

Create Some New Variables

DataFeature = function(data) {
  old.features = c("holiday",
                   "season",
                   "workingday",
                   "weather",
                   "temp",
                   "atemp",
                   "humidity",
                   "windspeed");
  newdata = data[, old.features];
  newdata$weather = as.factor(newdata$weather)
  newdata$season = as.factor(newdata$season)
  levels(newdata$season)=c("Spring", "Summer", "Fall", "Winter")
  
  newdata$hour= as.factor(hour(data$datetime))
  newdata$year = as.factor(year(data$datetime))
  newdata$month = as.factor(month(data$datetime))
 
  newdata$wday = ifelse(wday(data$datetime)==1, 7, wday(data$datetime)-1)
  newdata$wday = as.factor(newdata$wday)
  levels(newdata$wday)=c("Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun")
  return(newdata)
}


train = read.csv("train.csv")
test = read.csv("test.csv")

Newtrain = DataFeature(train)
Newtrain$lcount = log(train$count +1)
Newtest = DataFeature(test)

A Simple Model

Predict by the average counts from the same month, the same hour of the day, and the same day of the week.

submission = data.frame(datetime=test$datetime, 
                        hour=Newtest$hour, 
                        wday=Newtest$wday)

  
for (i_year in unique(Newtest$year)){
  for (i_month in unique(Newtest$month)) {
    cat("Year: ", i_year, "\tMonth: ", i_month, "\n")
    trainSubset = filter(Newtrain, year==i_year, month==i_month)
    by_wday_hour = group_by(trainSubset, wday, hour)
    wday_hour_lcounts = summarise(by_wday_hour, lcounts=mean(lcount))
    
    testLocs = Newtest$year ==i_year & Newtest$month == i_month
    tmp = submission[testLocs, ]
    tmp = inner_join(tmp, wday_hour_lcounts)
    submission[testLocs, "count"] = exp(tmp$lcounts)-1
  }
}
submission=submission[, c(1, 4)]
write.csv(submission, file = "Submission1_simple_model.csv", row.names=FALSE)

A Simple Linear Regression Model

Use only past data to make predictions on the test set, that is, train a different model for each test batch.

The code below show you how to estimate a linear model based on “temp”, “atemp”, “humidity”, “windspeed” and “workingday”.

submission = data.frame(datetime=test$datetime, 
                        hour=Newtest$hour, 
                        wday=Newtest$wday)

for (i_year in unique(year(ymd_hms(test$datetime)))) {
  for (i_month in unique(month(ymd_hms(test$datetime)))) {
    cat("Year: ", i_year, "\tMonth: ", i_month, "\n")
    testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month
    trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs]))

    myfit = lm(lcount ~ temp + atemp + humidity + windspeed + workingday, data=Newtrain[trainLocs,])
    mypred.count = exp(predict(myfit, Newtest[testLocs,]))-1
    mypred.count[mypred.count < 0] = 0; 
    submission[testLocs, "count"] = mypred.count
  }
}
submission=submission[, c(1, 4)]
write.csv(submission, file = "Submission1_simple_LS_model.csv", row.names=FALSE)

In the code above, I didn’t use all the features. Here is why: let’s fit a regression model using all available features in Newtrain to predict bike rentals for 2011, Jan 20 – end of Jan

i_year = 2011; 
i_month = 1;
testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month
trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs]))
trainSubset = Newtrain[trainLocs,]
myfit = lm(lcount ~ ., data=trainSubset)

You’ll receive an error message which indicates something is wrong with the factor variables. This is because we do not have all the levels for some categorical variables, e.g., “year” has two levels, but we only observe one in the data from Jan 2011; “weather” has four levels but we only observe three in this particular trainSubset table(trainSubset$weather).

You can remove those categorical variables.

i_year = 2011; 
i_month = 1;
testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month
trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs]))
trainSubset = Newtrain[trainLocs,]
drop.vars = c("year", "month", "weather", "season", "wday")
myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars])
junk=predict(myfit, Newtest[testLocs,])

The variables you need to remove will be different for different test batches. To avoid adding “if” lines to decide which variables to move for which test batch, which could be a little messy, here is what I would do (this may not be optimal; please let me know if you come with some better ideas): code the dummy variables at the beginning by myself; the new columns corresponding to missing levels would be collinear with the intercept, therefore have NA coefficients, which will be removed automatically by R.

myDummyCols = function(data){
  if (is.data.frame(data)==FALSE) stop("Data should be a dataframe.")
  var.names = names(data)
   p=length(var.names)
  newdata = NULL;
  remove.var = p+1
  data$junk=rep(0, dim(data)[1])
  for(j in 1:p){
    if (is.factor(data[,j])){
      level.names = levels(data[,j])
      tmp=lm(paste("junk ~ ", var.names[j], "-1"), data)
      newdata = cbind(newdata, model.matrix(tmp))
      remove.var = c(remove.var, j)
    }
  }
  return(as.data.frame(cbind(data[, -remove.var], newdata)))
}

# Form a new data matrix
Newtrain2 = myDummyCols(Newtrain)
Newtest2 = myDummyCols(Newtest)

i_year = 2011; 
i_month = 1;
testLocs = year(ymd_hms(test$datetime))==i_year &amp; month(ymd_hms(test$datetime))==i_month
trainLocs = ymd_hms(train$datetime) &lt;= min(ymd_hms(test$datetime[testLocs]))
trainSubset = Newtrain2[trainLocs,]
myfit = lm(lcount ~ ., data=trainSubset);
mycoef = coef(myfit)
drop.vars = names(which(is.na(mycoef)))
myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars])
junk=predict(myfit, Newtest2[testLocs,])


## Put the code into a loop
submission = data.frame(datetime=test$datetime, 
                        hour=Newtest$hour, 
                        wday=Newtest$wday)

for (i_year in unique(year(ymd_hms(test$datetime)))) {
  for (i_month in unique(month(ymd_hms(test$datetime)))) {
    cat("Year: ", i_year, "\tMonth: ", i_month, "\n")
    testLocs = year(ymd_hms(test$datetime))==i_year &amp; month(ymd_hms(test$datetime))==i_month
    trainLocs = ymd_hms(train$datetime) &lt;= min(ymd_hms(test$datetime[testLocs]))
    trainSubset = Newtrain2[trainLocs,]
    
    myfit = lm(lcount ~ ., data=trainSubset);
    mycoef = coef(myfit)
    drop.vars = names(which(is.na(mycoef)))
    myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars])
    mypred.count = exp(predict(myfit, Newtest2[testLocs,]))-1
    mypred.count[mypred.count < 0] = 0; 
    submission[testLocs, "count"] = mypred.count
  }
}
submission=submission[, c(1, 4)]
write.csv(submission, file ="Submission2_simple_LS_model.csv", row.names=FALSE)