17 Time Series Forecasting
Time series forecasting is widely used across the sciences to predict future values from present and past data. Machine learning techniques of this kind have been applied to weather forecasting, glacier-melt outlooks, and carbon dioxide concentration predictions. In this tutorial, we will use the Willow Creek weather datasets to build an air temperature forecast with a linear regression model. The goal of this exercise is to train a linear regression model on existing training data and see how accurately it predicts existing testing data. This is an isolated tutorial that does not account for other variables (precipitation, shortwave radiation, etc.) that may influence air temperature. While it may not be the best predictive model for air temperature, it serves as a first step in linear regression modeling and machine learning.
17.1 Load the Packages and Data
Let’s use the read.wcr
function that we previously created to load in our dataset.
library(ncdf4)
read.wcr = function(fname) {
  # extract the dataset name and year from the filename
  fullname = strsplit(fname, "/")
  dataset_str = fullname[[1]][length(fullname[[1]])]
  datname = strsplit(dataset_str, "_")[[1]][1]
  data.year = substr(dataset_str, nchar(dataset_str)-6, nchar(dataset_str)-3)
  # build an hourly date sequence for the year in case the file lacks a date variable
  data.date = seq(from = as.POSIXct(paste0(data.year, "-1-1 0:00"), tz = "UTC"),
                  to = as.POSIXct(paste0(data.year, "-12-31 23:00"), tz = "UTC"),
                  by = "hour")
  vars.info <- data.frame(CF.name = c("date", "air_temperature", "precipitation_flux",
                                      "surface_downwelling_shortwave_flux_in_air",
                                      "specific_humidity", "surface_downwelling_longwave_flux_in_air",
                                      "air_pressure", "eastward_wind", "northward_wind", "wind_speed"))
  df <- list()
  tem <- ncdf4::nc_open(fname)
  # pull each expected variable from the netCDF file, filling with NA if it's absent
  for (j in seq_along(vars.info$CF.name)) {
    if (exists(as.character(vars.info$CF.name[j]), tem$var)) {
      df[[j]] <- ncdf4::ncvar_get(tem, as.character(vars.info$CF.name[j]))
    } else {
      df[[j]] = NA
    }
  }
  names(df) <- vars.info$CF.name
  df <- data.frame(df)
  nc_close(tem)
  # if the file had no usable date variable, use the hourly sequence built above
  if (all(is.na(df$date))) {
    df$date = data.date
  }
  return(df)
}
Now, let’s throw the read.wcr
function into a loop so we can read in the data for each year.
# create a sequence of years for the years we have data
yearSeq = seq(2010, 2012)

# begin the for loop to open each year of data
for (y in yearSeq){
  # use our read.wcr() function and paste in the filename that changes for each year of data
  tem_df = read.wcr(fname = paste0("~/Documents/Github/geog473-673/datasets/WCr_1hr.", y, ".nc"))
  # != means DOES NOT EQUAL. If y DOES NOT EQUAL the first year in yearSeq, rbind the new year onto wcr_df;
  # otherwise y is the first year, so initialize wcr_df before tem_df is overwritten on the next pass
  if (y != yearSeq[1]){
    wcr_df = rbind(wcr_df, tem_df)
  } else {
    wcr_df = tem_df
  }
}
length(wcr_df$date)
## [1] 26304
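That length checks out: three years of hourly observations, with 2012 being a leap year.

24 * (365 + 365 + 366)

## [1] 26304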
head(wcr_df)
## date air_temperature precipitation_flux
## 1 2010-01-01 00:00:00 260.1135 0
## 2 2010-01-01 01:00:00 260.0315 0
## 3 2010-01-01 02:00:00 259.9805 0
## 4 2010-01-01 03:00:00 259.9005 0
## 5 2010-01-01 04:00:00 259.7585 0
## 6 2010-01-01 05:00:00 259.6825 0
## surface_downwelling_shortwave_flux_in_air specific_humidity
## 1 0 0.001176792
## 2 0 0.001170627
## 3 0 0.001167514
## 4 0 0.001160727
## 5 0 0.001142952
## 6 0 0.001133232
## surface_downwelling_longwave_flux_in_air air_pressure eastward_wind
## 1 237.6100 95704.5 1.522976
## 2 241.6900 95735.5 1.624952
## 3 240.3500 95765.5 1.696362
## 4 240.9325 95795.5 1.493329
## 5 247.3875 95824.5 1.921457
## 6 246.5075 95853.0 1.963118
## northward_wind wind_speed
## 1 -1.462524 NA
## 2 -1.563508 NA
## 3 -1.553177 NA
## 4 -1.702995 NA
## 5 -1.460475 NA
## 6 -1.667144 NA
Our data has been read in. Let’s convert the air temperature from Kelvin to degrees Fahrenheit. Then, let’s plot our time series.
library(ggplot2)
library(lubridate)
# convert the data to Fahrenheit
wcr_df$air_temperature = (wcr_df$air_temperature - 273.15) * (9/5) + 32
ggplot(data = wcr_df, aes(x = date, y = air_temperature)) +
geom_rect(xmin = as.POSIXct("2010-01-01"),
xmax = as.POSIXct("2012-06-01"),
ymin = -10, ymax = 100,
fill = "lightblue", alpha = 0.1) +
annotate("text", x = as.POSIXct("2011-01-01") , y = 80,
color = "blue", label = "Train Region") +
annotate("text", x = as.POSIXct("2012-12-01"), y = 80,
color = "coral", label = "Test Region") +
geom_point(size=0.5, alpha = 0.5, color = "black") +
labs(title = "Willow Creek Hourly Air Temperature - 2010-2013",y = "Air Temperature (F)", x = "")
17.2 Aggregate Data
Let’s aggregate our dataset from an hourly resolution to a daily resolution. This can be done with the xts time series library, which has a convenient function called apply.daily() that averages the data down to a daily resolution. This will speed up computation but also reduce accuracy, since we are discarding data. In real practice, with proper computing resources, we would rather stick with our original hourly resolution.
library(xts)
library(zoo)
# create time series data frame based on our datetime
xt = xts(wcr_df, order.by = wcr_df$date)
# aggregate to daily resolution
daily = xts::apply.daily(xt, mean)
# recreate the date variable, which became the index of the daily dataframe - it's easier to work with as a column
daily = data.frame(date = index(daily), coredata(daily))
# remove artifact of daily aggregation
daily$date.1 = NULL
# plot the new daily air temperature
ggplot(data = daily, aes(x = date, y = air_temperature)) +
geom_rect(xmin = as.POSIXct("2010-01-01"),
xmax = as.POSIXct("2012-06-01"),
ymin = -10, ymax = 85,
fill = "lightblue", alpha = 0.1) +
annotate("text", x = as.POSIXct("2011-01-01") , y = 70,
color = "blue", label = "Train Region") +
annotate("text", x = as.POSIXct("2012-12-01"), y = 70,
color = "coral", label = "Test Region") +
geom_point(alpha = 0.5, color = "black") +
labs(title = "Willow Creek Daily Air Temperature - 2010-2013",y = "Air Temperature (F)", x = "")
17.3 Prepping the Model
Now that we have our data at daily resolution, we can split it in two: a training dataset and a testing dataset. When we build a linear regression model for a time series, we select a training period and a testing period. In our case, we’ll train the model on data from 2010-01-01 to 2012-05-31 - roughly the first 80% of the record. Then we’ll use the remainder (2012-06-01 to 2013-01-01) to test how well the model predicts the temperature using linear regression.
train = daily[1:which(daily$date == as.POSIXct("2012-05-31 23:00:00 EDT")),]
test = daily[which(daily$date == as.POSIXct("2012-06-01 23:00:00 EDT")):length(daily$date),]

# keep just the date and air temperature columns
train = data.frame(date = train$date, air_temperature = train$air_temperature)
test = data.frame(date = test$date, air_temperature = test$air_temperature)
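As a quick sanity check, the row counts confirm the split: the training set covers 882 of the 1096 days, roughly the first 80% of the record.

nrow(train) / (nrow(train) + nrow(test))

## [1] 0.8047445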
# print out the results of our split
head(train)
## date air_temperature
## 1 2010-01-01 23:00:00 4.352076
## 2 2010-01-02 23:00:00 -4.916312
## 3 2010-01-03 23:00:00 4.264553
## 4 2010-01-04 23:00:00 9.920745
## 5 2010-01-05 23:00:00 11.206665
## 6 2010-01-06 23:00:00 12.708645
head(test)
## date air_temperature
## 1 2012-06-01 23:00:00 54.73873
## 2 2012-06-02 23:00:00 59.44044
## 3 2012-06-03 23:00:00 63.61205
## 4 2012-06-04 23:00:00 64.61727
## 5 2012-06-05 23:00:00 61.49611
## 6 2012-06-06 23:00:00 62.09221
We have our train and test datasets, but now we must prepare them for the linear regression model. Remember, we are trying to train the model on the train dataset and then use that model to accurately predict the temperature values of the test dataset. We’ll need to provide the model with some information about the variables that go into it.
library(timetk)
library(recipes)
# create a recipe for the air temperature variable
rec_train = recipe(air_temperature ~ ., data = train)
recipe_ts = step_timeseries_signature(recipe = rec_train, "date")

recipe_ts
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Operations:
##
## Timeseries signature features from "date"
We have 1 predictor (the date) and 1 outcome variable (air temperature). When the recipe is applied, the time series signature step expands the date into extra columns (second, minute, hour, day of year, week of year, weekday, etc.) in the training dataset. Remember, we’re creating a recipe that holds instructions for the actual model, and we can add instructions on whether or not to evaluate the interaction of these columns with air temperature. We’ll remove the second, minute, and hour features since we are at a daily resolution. These instructions are applied to the training dataframe while the model is being trained but do not alter the dataframe itself.
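If you’d like to see these generated columns for yourself, you can prep (train) and bake (apply) the signature recipe directly - an optional check, not part of the modeling workflow:

# prep() estimates the recipe from the training data; bake() applies it
baked = bake(prep(recipe_ts, training = train), new_data = train)
# the baked dataframe now holds the signature columns alongside air_temperature
names(baked)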
recipe_spec = recipe_ts
# the model works from the signature features rather than the raw date - so remove the date column when the recipe is applied - note this does not change the dataframe
recipe_spec = step_rm(recipe_spec, date)
# some other variables will be added in automatically as well, so let's remove the ones that aren't important
recipe_spec = step_rm(recipe_spec, contains("iso"),
                      contains("second"), contains("minute"), contains("hour"),
                      contains("am.pm"), contains("xts"))
# create new columns based on the interaction of these variables
recipe_spec = step_interact(recipe_spec, ~ date_month.lbl * date_day)
recipe_spec = step_interact(recipe_spec, ~ date_month.lbl * date_mweek)
recipe_spec = step_interact(recipe_spec, ~ date_month.lbl * date_wday.lbl * date_yday)
recipe_spec_final = step_dummy(recipe_spec, contains("lbl"), one_hot = TRUE)

recipe_spec_final
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Operations:
##
## Timeseries signature features from "date"
## Delete terms date
## Delete terms contains("iso"), contains("second"), ...
## Interactions with date_month.lbl * date_day
## Interactions with date_month.lbl * date_mweek
## Interactions with date_month.lbl * date_wday.lbl * date_yday
## Dummy variables from contains("lbl")
Our instructions are specified, so now let’s use the linear_reg function from the parsnip package to create the model specification. We’ll need to set an engine to run the model (there are multiple regression model engines, which are beyond the scope of this tutorial), and that engine will be glmnet. We’ll initiate the model, create a workflow, add our prewritten recipe from above, and then add our model to the workflow. Once it’s ready, we can fit the model and its workflow to the training data using the fit function. From there, we can use the trained model to predict the test air temperature from the test dates.
17.4 Running the Model
library(workflows)
library(parsnip)
library(recipes)
library(yardstick)
library(glmnet)
library(tidyverse)
# initiate the model with a penalty of 2 and no mixture - these parameters can be changed for more variability
model_spec_glmnet <- parsnip::linear_reg(mode = "regression", penalty = 2, mixture = 0)
model_spec_glmnet <- set_engine(model_spec_glmnet, "glmnet")
model_spec_glmnet
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 2
## mixture = 0
##
## Computational engine: glmnet
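For reference, penalty controls the amount of regularization and mixture blends the ridge (0) and lasso (1) penalties. A pure-lasso specification would look like the sketch below; the penalty value here is purely illustrative.

# hypothetical alternative: lasso regression (mixture = 1) with a lighter penalty
model_spec_lasso <- parsnip::linear_reg(mode = "regression", penalty = 0.1, mixture = 1)
model_spec_lasso <- set_engine(model_spec_lasso, "glmnet")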
# add our recipe to the workflow, then add the model to the workflow
workflow_glmnet = add_recipe(workflow(), recipe_spec_final)
workflow_glmnet = add_model(workflow_glmnet, model_spec_glmnet)
workflow_glmnet
## ══ Workflow ════════════════════════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## ● step_timeseries_signature()
## ● step_rm()
## ● step_rm()
## ● step_interact()
## ● step_interact()
## ● step_interact()
## ● step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 2
## mixture = 0
# fit the model to the training data
workflow_trained = fit(workflow_glmnet, data = train)
# predict the test air temperature based on our model
prediction_tbl = predict(workflow_trained, test)
prediction_tbl = bind_cols(prediction_tbl, test)
prediction_tbl
## # A tibble: 214 x 3
## .pred date air_temperature
## <dbl> <dttm> <dbl>
## 1 64.5 2012-06-01 23:00:00 54.7
## 2 65.8 2012-06-02 23:00:00 59.4
## 3 67.1 2012-06-03 23:00:00 63.6
## 4 72.4 2012-06-04 23:00:00 64.6
## 5 77.6 2012-06-05 23:00:00 61.5
## 6 69.0 2012-06-06 23:00:00 62.1
## 7 64.6 2012-06-07 23:00:00 66.3
## 8 69.1 2012-06-08 23:00:00 68.3
## 9 69.6 2012-06-09 23:00:00 74.9
## 10 65.3 2012-06-10 23:00:00 73.8
## # … with 204 more rows
# plot our modeled test air temperature against the actual test air temperature
ggplot(data = daily, aes(x = date, y = air_temperature)) +
geom_rect(xmin = as.POSIXct("2010-01-01"),
xmax = as.POSIXct("2012-06-01"),
ymin = -10, ymax = 85,
fill = "lightblue", alpha = 0.01) +
annotate("text", x = as.POSIXct("2011-01-01") , y = 75,
color = "blue", label = "Train Region") +
annotate("text", x = as.POSIXct("2012-12-01"), y = 75,
color = "coral", label = "Test Region") +
geom_point(alpha = 0.5, color = "black") +
geom_point(aes(x = date, y = .pred), data = prediction_tbl,
alpha = 0.5, color = "darkblue") +
labs(title = "GLM: Out-Of-Sample Forecast")
17.5 Validation and Accuracy
The plot suggests that the model predicted the test air temperature fairly well, but let’s evaluate the metrics to see how it does statistically. Then, let’s plot the residuals - the difference between the actual test air temperature and the predicted air temperature.
# calculate the metrics of test air temperature vs. predicted air temperature
metrics(prediction_tbl, air_temperature, .pred)
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.2
## 2 rsq standard 0.778
## 3 mae standard 8.89
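As a sanity check, the RMSE can be computed by hand from the prediction table; it should match yardstick’s estimate above:

# root mean squared error of the out-of-sample predictions
sqrt(mean((prediction_tbl$air_temperature - prediction_tbl$.pred)^2))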
# plot the residuals
ggplot(data = prediction_tbl, aes(x = date, y = air_temperature - .pred)) +
geom_hline(yintercept = 0, color = "black") +
geom_point(color = "blue", alpha = 0.5) +
geom_smooth(span = 0.05, color = "red") +
geom_smooth(span = 1.00, se = FALSE) +
labs(title = "GLM Model Residuals, Out-of-Sample", y = "Air Temperature: Actual - Predicted", x = "") +
scale_y_continuous(limits = c(-5, 5))
Remember, we don’t want to overfit our model to the training data! An overfit model reproduces past temperatures with unrealistic regularity and generalizes poorly to data it hasn’t seen. Temperatures are variable!
17.6 Future Forecasting
The residuals show the model performed fairly well. Now, can we forecast the temperature into the future? We’ll follow the same procedure as before, but this time we’ll create extra time series values and fill them in using the model we’ve built.
# Extract daily index
idx <- tk_index(daily)
# Get time series summary from index
daily_summary <- tk_get_timeseries_summary(idx)

# extend the index 180 days into the future
idx_future <- tk_make_future_timeseries(idx, n_future = 180)
# create a tibble table from the future index
future_tbl <- tibble(date = idx_future)
head(future_tbl)
## # A tibble: 6 x 1
## date
## <dttm>
## 1 2013-01-01 00:00:00
## 2 2013-01-02 00:00:00
## 3 2013-01-03 00:00:00
## 4 2013-01-04 00:00:00
## 5 2013-01-05 00:00:00
## 6 2013-01-06 00:00:00
# run the model using the entire dataset, predict the temperature for the dates of future_tbl, and bind the columns together.
workflow_fit_full = fit(workflow_glmnet, data = daily)
future_predictions_tbl = predict(workflow_fit_full, future_tbl)
future_predictions_tbl = bind_cols(future_predictions_tbl, future_tbl)
# plot the future forecasted temperature
ggplot(data = daily, aes(x = date, y = air_temperature)) +
geom_rect(xmin = as.POSIXct("2010-01-01"),
xmax = as.POSIXct("2012-06-01"),
ymin = -10, ymax = 85,
fill = "lightblue", alpha = 0.01) +
annotate("text", x = as.POSIXct("2011-01-01") , y = 75,
color = "blue", label = "Train Region") +
annotate("text", x = as.POSIXct("2012-12-01"), y = 75,
color = "coral", label = "Test Region") +
geom_point(alpha = 0.5, color = "black") +
geom_point(aes(x = as.POSIXct(date), y = .pred), data = future_predictions_tbl,
alpha = 0.5, color = "dodgerblue1") +
geom_smooth(aes(x = as.POSIXct(date), y = .pred), data = future_predictions_tbl,
method = 'loess', color = "dodgerblue4") +
labs(title = "Willow Creek Air Temperature: 6-Month Forecast", y = "Air Temperature (F)", x = "")
Let’s add in some error regions. We’ll calculate the spread of the 95% and 80% confidence intervals. About 95% of the area under a normal curve lies within roughly 1.96 standard deviations of the mean, and thanks to the central limit theorem this number is used to construct approximate 95% confidence intervals. For 80%, the area under the normal curve lies within roughly 1.28 standard deviations. We’ll use these values to create our prediction error thresholds.
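These z-values come straight from the standard normal quantile function, so there’s no need to memorize them:

qnorm(0.975) # two-tailed 95% interval
## [1] 1.959964
qnorm(0.90)  # two-tailed 80% interval
## [1] 1.281552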
# calculate the standard deviation of the test residuals (actual minus predicted temperature)
test_resid_sd <- summarize(prediction_tbl, stdev = sd(air_temperature - .pred))
# create lo.95 and the other bounds for the error areas
future_predictions_tbl <- mutate(future_predictions_tbl,
                                 lo.95 = .pred - 1.96 * test_resid_sd$stdev,
                                 lo.80 = .pred - 1.28 * test_resid_sd$stdev,
                                 hi.80 = .pred + 1.28 * test_resid_sd$stdev,
                                 hi.95 = .pred + 1.96 * test_resid_sd$stdev)
# plot the error areas
ggplot(data = daily, aes(x = date, y = air_temperature)) +
geom_point(alpha = 0.5, color = "black") +
geom_ribbon(aes(y = .pred, x = as.POSIXct(date),ymin = lo.95, ymax = hi.95),
data = future_predictions_tbl,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(y = .pred, x = as.POSIXct(date), ymin = lo.80, ymax = hi.80),
            data = future_predictions_tbl,
            fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = as.POSIXct(date), y = .pred), data = future_predictions_tbl,
alpha = 0.5, color = "red") +
geom_smooth(aes(x = as.POSIXct(date), y = .pred), data = future_predictions_tbl,
method = 'loess', color = "white")+
labs(title = "Willow Creek Daily Air Temperature: 6 Month Forecast",y = "Air Temperature (F)", x = "")
Remember, no forecast is ever perfect!
17.7 Assignment
Perform the same analysis for the Willow Creek air temperature (2010, 2011, and 2012 datasets), but this time aggregate to a weekly resolution using the apply.weekly() function from xts. Proceed with the same workflow and create the same plots. Add the plots to a document (Word, Google Docs, or RMarkdown file, your choice) and answer the following questions (a starter sketch for the weekly aggregation follows the list):
- How does the weekly resolution model compare to the daily resolution model?
- Why do you think it performs better/worse?
- How could we enhance the accuracy of the model?
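If you want a starting point, the weekly aggregation is a small change to the daily workflow. Here is a minimal sketch, assuming the hourly xts object xt built earlier in this tutorial:

# aggregate the hourly xts object to weekly means instead of daily means
weekly = xts::apply.weekly(xt, mean)
weekly = data.frame(date = index(weekly), coredata(weekly))
# remove the artifact of aggregation, as before
weekly$date.1 = NULL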
Upload the file (Word doc, PDF, whatever works) to UD Canvas.