Useful Packages
lubridate can be used to deal with date and time; dplyr can be used to manipulate large data frame. Suggest to take a look of an online tutorial or youtube video.
library(lubridate) library(ggplot2) library(dplyr)
Load the Data into R
train = read.csv("train.csv") test = read.csv("test.csv") dim(train) dim(test) names(train) names(test) summary(train) summary(test) ## The lubridate package can help us to extract useful ## information from "datetime". train[1,1] year(train[1,1]) month(train[1,1]) wday(train[1,1]) ## Start with Sunday and end with Monday
Time-Range for Training and Test
The training set consists of data from each month day 1 to 19. The total number of obs for each month (19 days) should be 19×24=456. So apparently, some hours are missing.
for (i_year in unique(year(train$datetime))){ for (i_month in unique(month(train$datetime))){ cat("Year: ", i_year, "\tMonth: ", i_month, "\n") mySubset = train[year(train$datetime)==i_year & month(train$datetime)==i_month, ] cat("Days: ", range(day(mySubset$datetime)), "\t total obs: ", dim(mySubset)[1], "\n") } }
The test set consists of data from each month day 20 to the end of the month.
for (i_year in unique(year(test$datetime))){ for (i_month in unique(month(test$datetime))){ cat("Year: ", i_year, "\tMonth: ", i_month, "\n") mySubset = test[year(test$datetime)==i_year & month(test$datetime)==i_month, ] cat("Days: ", range(day(mySubset$datetime)), "\t total obs: ", dim(mySubset)[1], "\n") } }
Exploratory Data Analysis (EDA)
In addition to histograms/scatter plots, which provide graphical displays of Y vs X1 (one predictor), we can also provide graphical displays of Y vs X1 and X2.
library(scales) ## Add some new features, such as hour of the day, time, day train$hour = hour(train$datetime) train$times = as.POSIXct(strftime(ymd_hms(train$datetime), format="%H:%M:%S"), format="%H:%M:%S") train$jitter_times = train$times + minutes(round(runif(nrow(train),min=0,max=59))) train$day = wday(train$datetime, label=TRUE) ## In lubridate, the week starts with Sunday. ## Using the next command, we have the week to begin on ## Monday (coded as 1) and end on Sunday (coded as 7) train$day = ifelse(wday(train$datetime)==1, 7, wday(train$datetime)-1) train$day = as.factor(train$day) levels(train$day)=c("Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun") x_axis = "jitter_times" y_axis = "count" color = "temp" ggplot(train[train$workingday==1,], aes_string(x=x_axis, y=y_axis, color=color)) + geom_point(position=position_jitter(w=0.0, h=0.4)) + theme_light(base_size=20) + xlab("Hour of the Day") + scale_x_datetime(breaks = date_breaks("4 hours"), labels=date_format("%I:%M %p")) + ylab("Number of Bike Rentals") + scale_colour_gradientn("Temp (°C)", colours=c("#5e4fa2", "#3288bd", "#66c2a5", "#abdda4", "#e6f598", "#fee08b", "#fdae61", "#f46d43", "#d53e4f", "#9e0142")) + ggtitle("Bike Rentals on Workdays") + theme(plot.title=element_text(size=18)) ggplot(train[train$workingday==0,], aes_string(x=x_axis, y=y_axis, color=color)) + geom_point(position=position_jitter(w=0.0, h=0.4)) + theme_light(base_size=20) + xlab("Hour of the Day") + scale_x_datetime(breaks = date_breaks("4 hours"), labels=date_format("%I:%M %p")) + ylab("Number of Bike Rentals") + scale_colour_gradientn("Temp (°C)", colours=c("#5e4fa2", "#3288bd", "#66c2a5", "#abdda4", "#e6f598", "#fee08b", "#fdae61", "#f46d43", "#d53e4f", "#9e0142")) + ggtitle("Bike Rentals on Weekends/Holidays") + theme(plot.title=element_text(size=18))
You can change “color” to other numerical variables
color = "windspeed" ggplot(train[train$workingday==0,], aes_string(x=x_axis, y=y_axis, color=color)) + geom_point(position=position_jitter(w=0.0, h=0.4)) + theme_light(base_size=20) + xlab("Hour of the Day") + scale_x_datetime(breaks = date_breaks("4 hours"), labels=date_format("%I:%M %p")) + ylab("Number of Bike Rentals") + scale_colour_gradientn("Windspeed", colours=c("#5e4fa2", "#3288bd", "#66c2a5", "#abdda4", "#e6f598", "#fee08b", "#fdae61", "#f46d43", "#d53e4f", "#9e0142")) + ggtitle("Bike Rentals on Workdays") + theme(plot.title=element_text(size=18))
Display the rental for X1-by-X2, where X1: day (of the week) and X2: hour. You can replace X1 and/or X2 by other categorical variables such as month, year, season, etc. First calculate the average count for each day/time, store in a dataframe and then draw a heatmap using ggplot
hourly = group_by(train, day, hour) day_hour_counts = summarise(hourly, count=mean(count)) day_hour_counts[1:5,] ggplot(day_hour_counts, aes(x = hour, y = day)) + geom_tile(aes(fill = count)) + scale_fill_gradient(name="Average Counts", low="white", high="green") + theme(axis.title.y = element_blank())
Display the rental for X1-by-X2, where X1: season and X2: hour.
hourly = group_by(train, season, hour) day_hour_counts = summarise(hourly, count=mean(train$count)) ggplot(day_hour_counts, aes(x = hour, y = season)) + geom_tile(aes(fill = count)) + scale_fill_gradient(name="Average Counts", low="white", high="green") + scale_y_continuous(breaks=c(1, 2, 3, 4), labels=c("Spring", "Summer", "Fall", "Winter")) + theme(axis.title.y = element_blank())
Grouped Bar Plot.
ggplot(day_hour_counts, aes(x=hour, y=count)) + geom_bar(aes(fill = as.factor(season)), position = "dodge", stat="identity") hourly = group_by(train, workingday, hour) workingday_hour_counts = summarise(hourly, count=mean(train$count)) ggplot(workingday_hour_counts, aes(x=hour, y=count)) + geom_bar(aes(fill = as.factor(workingday)), position = "dodge", stat="identity")
Create Some New Variables
DataFeature = function(data) { old.features = c("holiday", "season", "workingday", "weather", "temp", "atemp", "humidity", "windspeed"); newdata = data[, old.features]; newdata$weather = as.factor(newdata$weather) newdata$season = as.factor(newdata$season) levels(newdata$season)=c("Spring", "Summer", "Fall", "Winter") newdata$hour= as.factor(hour(data$datetime)) newdata$year = as.factor(year(data$datetime)) newdata$month = as.factor(month(data$datetime)) newdata$wday = ifelse(wday(data$datetime)==1, 7, wday(data$datetime)-1) newdata$wday = as.factor(newdata$wday) levels(newdata$wday)=c("Mon", "Tue", "Wed", "Thurs", "Fri", "Sat", "Sun") return(newdata) } train = read.csv("train.csv") test = read.csv("test.csv") Newtrain = DataFeature(train) Newtrain$lcount = log(train$count +1) Newtest = DataFeature(test)
A Simple Model
Predict by the average counts from the same month, the same hour of the day, and the same day of the week.
submission = data.frame(datetime=test$datetime, hour=Newtest$hour, wday=Newtest$wday) for (i_year in unique(Newtest$year)){ for (i_month in unique(Newtest$month)) { cat("Year: ", i_year, "\tMonth: ", i_month, "\n") trainSubset = filter(Newtrain, year==i_year, month==i_month) by_wday_hour = group_by(trainSubset, wday, hour) wday_hour_lcounts = summarise(by_wday_hour, lcounts=mean(lcount)) testLocs = Newtest$year ==i_year & Newtest$month == i_month tmp = submission[testLocs, ] tmp = inner_join(tmp, wday_hour_lcounts) submission[testLocs, "count"] = exp(tmp$lcounts)-1 } } submission=submission[, c(1, 4)] write.csv(submission, file = "Submission1_simple_model.csv", row.names=FALSE)
A Simple Linear Regression Model
Use only past data to make predictions on the test set, that is, train a different model for each test batch.
The code below show you how to estimate a linear model based on “temp”, “atemp”, “humidity”, “windspeed” and “workingday”.
submission = data.frame(datetime=test$datetime, hour=Newtest$hour, wday=Newtest$wday) for (i_year in unique(year(ymd_hms(test$datetime)))) { for (i_month in unique(month(ymd_hms(test$datetime)))) { cat("Year: ", i_year, "\tMonth: ", i_month, "\n") testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs])) myfit = lm(lcount ~ temp + atemp + humidity + windspeed + workingday, data=Newtrain[trainLocs,]) mypred.count = exp(predict(myfit, Newtest[testLocs,]))-1 mypred.count[mypred.count < 0] = 0; submission[testLocs, "count"] = mypred.count } } submission=submission[, c(1, 4)] write.csv(submission, file = "Submission1_simple_LS_model.csv", row.names=FALSE)
In the code above, I didn’t use all the features. Here is why: let’s fit a regression model using all available features in Newtrain to predict bike rentals for 2011, Jan 20 – end of Jan
i_year = 2011; i_month = 1; testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs])) trainSubset = Newtrain[trainLocs,] myfit = lm(lcount ~ ., data=trainSubset)
You’ll receive an error message which indicates something is wrong with the factor variables. This is because we do not have all the levels for some categorical variables, e.g., “year” has two levels, but we only observe one in the data from Jan 2011; “weather” has four levels but we only observe three in this particular trainSubset table(trainSubset$weather)
.
You can remove those categorical variables.
i_year = 2011; i_month = 1; testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs])) trainSubset = Newtrain[trainLocs,] drop.vars = c("year", "month", "weather", "season", "wday") myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars]) junk=predict(myfit, Newtest[testLocs,])
The variables you need to remove will be different for different test batches. To avoid adding “if” lines to decide which variables to move for which test batch, which could be a little messy, here is what I would do (this may not be optimal; please let me know if you come with some better ideas): code the dummy variables at the beginning by myself; the new columns corresponding to missing levels would be collinear with the intercept, therefore have NA coefficients, which will be removed automatically by R.
myDummyCols = function(data){ if (is.data.frame(data)==FALSE) stop("Data should be a dataframe.") var.names = names(data) p=length(var.names) newdata = NULL; remove.var = p+1 data$junk=rep(0, dim(data)[1]) for(j in 1:p){ if (is.factor(data[,j])){ level.names = levels(data[,j]) tmp=lm(paste("junk ~ ", var.names[j], "-1"), data) newdata = cbind(newdata, model.matrix(tmp)) remove.var = c(remove.var, j) } } return(as.data.frame(cbind(data[, -remove.var], newdata))) } # Form a new data matrix Newtrain2 = myDummyCols(Newtrain) Newtest2 = myDummyCols(Newtest) i_year = 2011; i_month = 1; testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs])) trainSubset = Newtrain2[trainLocs,] myfit = lm(lcount ~ ., data=trainSubset); mycoef = coef(myfit) drop.vars = names(which(is.na(mycoef))) myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars]) junk=predict(myfit, Newtest2[testLocs,]) ## Put the code into a loop submission = data.frame(datetime=test$datetime, hour=Newtest$hour, wday=Newtest$wday) for (i_year in unique(year(ymd_hms(test$datetime)))) { for (i_month in unique(month(ymd_hms(test$datetime)))) { cat("Year: ", i_year, "\tMonth: ", i_month, "\n") testLocs = year(ymd_hms(test$datetime))==i_year & month(ymd_hms(test$datetime))==i_month trainLocs = ymd_hms(train$datetime) <= min(ymd_hms(test$datetime[testLocs])) trainSubset = Newtrain2[trainLocs,] myfit = lm(lcount ~ ., data=trainSubset); mycoef = coef(myfit) drop.vars = names(which(is.na(mycoef))) myfit = lm(lcount ~ ., data=trainSubset[, ! names(trainSubset) %in% drop.vars]) mypred.count = exp(predict(myfit, Newtest2[testLocs,]))-1 mypred.count[mypred.count < 0] = 0; submission[testLocs, "count"] = mypred.count } } submission=submission[, c(1, 4)] write.csv(submission, file ="Submission2_simple_LS_model.csv", row.names=FALSE)