Time Series Analysis and Demand Forecasting
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pylab as pl
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
%matplotlib inline
import math
Upload and explore the data
For the exercises in this lab, we will use the oil price dataset from Kaggle. We will consider that you are using Google Colab so upload your data files to the directory sample_data.
Read the data
= pd.read_csv('sample_data/oil.csv', header = 0,
df_oil ='"',sep=",",
quotechar= ['na', '-', '.', ''], low_memory=False) na_values
Explore the data
We take a look at the data by plotting the whole data
# Use scatter plot to plot the 'Oil Price' against the 'Date'
'font.size': 16})
plt.rcParams.update({= plt.subplots()
fig, ax # Plot the data as scatter plot
= 0
start = len(df_oil)
limit = start + limit - 1
end '-')
ax.plot(df_oil.date[start:end], df_oil.dcoilwtico[start:end], # Changing the labels on the x-axis
= [df_oil.date.iloc[start], df_oil.date.iloc[start + math.floor(limit / 4)], \
x + math.floor(limit / 2)], df_oil.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil.date.iloc[start
df_oil.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation # put the label for the x-axis 'Date' and for the y-axis 'Oil Price'
set(xlabel='Date', ylabel='Oil Price')
ax."oilData.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
Look at the details
We cannot see a lot of details, let’s consider a smaller period
'font.size': 16})
plt.rcParams.update({= plt.subplots()
fig, ax = 300
start = 300
limit = start + limit - 1
end '-')
ax.plot(df_oil.date[start:end], df_oil.dcoilwtico[start:end], = [df_oil.date.iloc[start], df_oil.date.iloc[start + math.floor(limit / 4)], \
x + math.floor(limit / 2)], df_oil.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil.date.iloc[start
df_oil.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation set(xlabel='Date', ylabel='Oil Price')
ax."oilData_300.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
Depicting the trend of the Oil Price
A trend exists when there is a long-term increase or decrease in the data. It does not have to be linear. Researchers also refer to a trend as “changing direction”, when it might go from an increasing trend to a decreasing trend and vice versa.
1- Using semi-average
# TODO: use the semi-average technique from the slides to summarize the time series trend
= df_oil.dcoilwtico[0:math.floor(len(df_oil)/2)].mean()
m1 = df_oil.dcoilwtico[math.floor(len(df_oil)/2)+1:len(df_oil)].mean()
m2
= 0
start = math.floor(len(df_oil)/4)
first = 3 * math.floor(len(df_oil)/4)
second = len(df_oil)
limit = start + limit - 1
end = (m2 - m1) / (second - first) * (start - first) + m1
y_start = (m2 - m1) / (second - first) * (end - first) + m1
y_end = [y_start, m1, m2, y_end]
semi_avg = [df_oil.date.iloc[start], df_oil.date.iloc[first], df_oil.date.iloc[second], df_oil.date.iloc[end]]
x = plt.subplots()
fig, ax '.')
ax.plot(df_oil.date[start:end], df_oil.dcoilwtico[start:end], 'r', '-')
ax.plot(x, semi_avg,
ax.set_xticks(x)= 45)
plt.xticks(rotation set(xlabel='Date', ylabel='Oil Price')
ax."oilData_SemiAverage.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
2- Using moving-average
Use the rolling API for depicting the trend using the moving average method.
First: use rolling(window = 50)
Second: use rolling(window = 50, min_periods = 1)
Comment on the differences
# TODO: use pd.DataFrame.rolling(window = ma_win_size), comment on the output
= 50
ma_win_size = df_oil.copy()
df_oil_ma 'ma_dcoilwtico'] = df_oil_ma['dcoilwtico'].rolling(window = ma_win_size).mean()
df_oil_ma[
= ma_win_size
start = len(df_oil_ma) - ma_win_size
limit = start + limit - 1
end
= plt.subplots()
fig, ax
'-')
ax.plot(df_oil_ma.date[start:end], df_oil_ma.ma_dcoilwtico[start:end], = [df_oil_ma.date.iloc[start], df_oil_ma.date.iloc[start + math.floor(limit / 4)], \
x + math.floor(limit / 2)], df_oil_ma.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_ma.date.iloc[start
df_oil_ma.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation set(xlabel='Date', ylabel='Oil Price')
ax.
"oilData_MovingAverageNA.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
# TODO: use pd.DataFrame.rolling(window = ma_win_size, min_periods = 1), comment on the output
= 50
ma_win_size 'ma_dcoilwtico'] = df_oil['dcoilwtico'].rolling(window = ma_win_size, min_periods = 1).mean()
df_oil_ma[
= ma_win_size
start = len(df_oil_ma) - ma_win_size
limit = start + limit - 1
end
= plt.subplots()
fig, ax
'-')
ax.plot(df_oil_ma.date[start:end], df_oil_ma.ma_dcoilwtico[start:end], = [df_oil_ma.date.iloc[start], df_oil_ma.date.iloc[start + math.floor(limit / 4)], \
x + math.floor(limit / 2)], df_oil_ma.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_ma.date.iloc[start
df_oil_ma.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation set(xlabel='Date', ylabel='Oil Price')
ax.
"oilData_MovingAverage.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
There is a problem because of the missing values. Check the documentation for min_periods at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
Optional: It might be useful sometime to show part of the data next to the figure, we can do that as follows:
# Display part of the data next to figure
= df_oil_ma
df_oil_new = ma_win_size
start = len(df_oil_new) - ma_win_size - 1
limit = start + limit
end
= plt.subplots()
fig, ax
# TODO: Plot the trend of the data that was generated using moving average technique
# in the previous cell
'-')
ax.plot(df_oil_new.date[start:end], df_oil_new.ma_dcoilwtico[start:end], = [df_oil_new.date.iloc[start], df_oil_new.date.iloc[start + math.floor(limit / 4)], \
x + math.floor(limit / 2)], df_oil_new.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_new.date.iloc[start
df_oil_new.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation
# Now, we plot a snippet of the data next to the figure
= df_oil_new.date[start:start + 10]
a = [round(num, 2) for num in df_oil_new.dcoilwtico[start:start + 10]]
b = [list(a), list(b)]
data = np.array(data)
numpy_array = numpy_array.T.tolist()
data = plt.table(cellText = data,
tbl =['Date', 'Oil Price'],
colLabels='left',
loc=[-0.7, 0, 0.5, 1])
bbox
set(xlabel='Date', ylabel='Oil Price')
ax."oilData_MovingAverageWData.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
3- Using Linear Regression
We will start by testing the method of computing the coefficients of the linear regression as discussed in the lecture and compare the found coefficients with the coefficients that are computed using the built-in functions.
'''
Our function for computing the linear regression coefficients.
This function accepts only one dimensional data
'''
def mySimpleLinearRegression(x, y):
= np.mean(x)
x_mean = np.mean(y)
y_mean
= x - x_mean
x_diff = y - y_mean
y_diff
= sum(x_diff * y_diff) / sum(x_diff ** 2)
coef = y_mean - coef * x_mean
intercept return (intercept , coef)
'''
Preprocess the dataframe by removing the missing values and re-indexing the data
'''
= df_oil.dropna()
df_oil_nona =True, inplace=True)
df_oil_nona.reset_index(drop= math.floor(len(df_oil_nona) / 2)
offset 'x'] = df_oil_nona.index - offset
df_oil_nona['''
TODO: use the function in the previous cell to compute the linear regression parameters
'''
= np.array(df_oil_nona.x)
xc = np.array(df_oil_nona.dcoilwtico)
yc = mySimpleLinearRegression(xc, yc)
a0, a1 print('Using the formula from the slides: ')
print('intercept = ', a0, '\nCoef. = ', a1)
'''
TODO: use the scikit learn LinearRegression class to compute the linear regression parameters
'''
= LinearRegression()
myLR len(xc),1), yc.reshape(len(yc),1))
myLR.fit(xc.reshape(# Print the intercept and the coefficient of the model
print('\n\nUsing the scikit learn package:')
print( "Intercept:", myLR.intercept_[0])
print( "Coef.:", myLR.coef_[0][0])
'''
TODO: now Plot the data and the line that is generated by linear regression
'''
= 0
start = len(df_oil_nona) - 1
limit = start + limit
end
= plt.subplots()
fig, ax
'-')
ax.plot(df_oil_nona.x[start:end], df_oil_nona.dcoilwtico[start:end], = [df_oil_nona.x.iloc[start], \
x + math.floor(limit / 4)], \
df_oil_nona.x.iloc[start + math.floor(limit / 2)], \
df_oil_nona.x.iloc[start + math.floor(3 * limit / 4)], \
df_oil_nona.x.iloc[start
df_oil_nona.x.iloc[end]]= [df_oil_nona.date.iloc[start], \
xticks + math.floor(limit / 4)], \
df_oil_nona.date.iloc[start + math.floor(limit / 2)], \
df_oil_nona.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_nona.date.iloc[start
df_oil_nona.date.iloc[end]]
plt.xticks(x, xticks)= 45)
plt.xticks(rotation
= [a0 + a1 * x for x in df_oil_nona.x]
model 'r');
plt.plot(df_oil_nona.x, model,
set(xlabel='Date', ylabel='Oil Price') ax.
4- Using Exponential Smoothing
# TODO: Apply the exponential Smoothing on the data and display the resulting time series
# use different values for alpha and comment on the results
# TODO: Apply the exponential Smoothing on the data and display the resulting time series
# use different values for alpha and comment on the results
= df_oil.dropna()
df_oil_nona =True, inplace=True)
df_oil_nona.reset_index(drop
= 0.5
alpha = np.zeros(len(df_oil_nona) - 1)
exp_smoothing 0] = exp_smoothing[1] = np.NaN
exp_smoothing[2] = alpha * df_oil_nona.loc[0, 'dcoilwtico'] \
exp_smoothing[+ (1 - alpha) * df_oil_nona.loc[1, 'dcoilwtico']
for i in range(2, len(df_oil_nona) - 2):
+1] = alpha * df_oil_nona.loc[i, 'dcoilwtico'] \
exp_smoothing[i+ (1 - alpha) * exp_smoothing[i]
= 0
start = len(df_oil_nona) - 1
limit = start + limit
end
= plt.subplots()
fig, ax
'-')
ax.plot(df_oil_nona.date[start:end], exp_smoothing, = [df_oil_nona.date.iloc[start], \
x + math.floor(limit / 4)], \
df_oil_nona.date.iloc[start + math.floor(limit / 2)], \
df_oil_nona.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_nona.date.iloc[start
df_oil_nona.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation
set(xlabel='Date', ylabel='Oil Price')
ax."oilData_ExpSmoothing.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
# TODO: repeat the same smoothing as in the previous cell but with alpha = 0.001
# Comment on the results
# TODO: Apply the exponential Smoothing on the data and display the resulting time series
# use different values for alpha and comment on the results
= df_oil.dropna()
df_oil_nona =True, inplace=True)
df_oil_nona.reset_index(drop
= 0.05
alpha = np.zeros(len(df_oil_nona) - 1)
exp_smoothing 0] = exp_smoothing[1] = np.NaN
exp_smoothing[2] = alpha * df_oil_nona.loc[0, 'dcoilwtico'] \
exp_smoothing[+ (1 - alpha) * df_oil_nona.loc[1, 'dcoilwtico']
for i in range(2, len(df_oil_nona) - 2):
+1] = alpha * df_oil_nona.loc[i, 'dcoilwtico'] \
exp_smoothing[i+ (1 - alpha) * exp_smoothing[i]
= 0
start = len(df_oil_nona) - 1
limit = start + limit
end
= plt.subplots()
fig, ax
'-')
ax.plot(df_oil_nona.date[start:end], exp_smoothing, = [df_oil_nona.date.iloc[start], \
x + math.floor(limit / 4)], \
df_oil_nona.date.iloc[start + math.floor(limit / 2)], \
df_oil_nona.date.iloc[start + math.floor(3 * limit / 4)], \
df_oil_nona.date.iloc[start
df_oil_nona.date.iloc[end]]
ax.set_xticks(x)= 45)
plt.xticks(rotation
set(xlabel='Date', ylabel='Oil Price')
ax."oilData_ExpSmoothing.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
df_oil_nona
Seasonal Component
A seasonal pattern occurs when a time series is affected by seasonal factors such as the time of the year or the day of the week. Seasonality is always of a fixed and known frequency.
# Let's read the data again
= pd.read_csv('sample_data/oil.csv', header = 0,
df_oil_SC ='"',sep=",",
quotechar= ['na', '-', '.', ''], low_memory=False) na_values
from datetime import datetime
= [df_oil_SC.iloc[i,0] for i in range(len(df_oil_SC))]
date_str = '%Y-%m-%d'
date_format = [datetime.strptime(d_str, date_format) for d_str in date_str] date_obj
= pd.to_datetime(date_obj, format='%y-%m-%d %H:%M:%S')
df_oil_SC.index = df_oil_SC.groupby(by = [df_oil_SC.index.month, df_oil_SC.index.year]).mean(numeric_only=True)
df_G = df_G.groupby(level = 0).mean() # Monthly mean over the years
df_MM df_MM
= {'dcoilwtico':'Monthly mean'}) df_SMM.rename(columns
# Now, we compute the seasonal index by dividing the monthly mean over the mean of the means
= df_MM/ (df_MM.mean()) * 100
df_SI = {'dcoilwtico':'Seasonal Index'}) df_SI.rename(columns
Time series decomposition
There are three types of time series patterns: trend, seasonality and cycles. When decomposing a time series into components, we usually combine the trend and cycle into a single trend-cycle component (sometimes called the trend for simplicity). Thus, we think of a time series as comprising three components: a trend-cycle (or trend) component, a seasonal component, and a remainder component (containing anything else in the time series).
During the lecture, you learned about different techniques (‘Additive’, ‘Multiplicative’ and ‘STL’) for decomposing time series. Now, you need to apply them on the oil dataset and comment on your findings.
'''
TODO: Apply the Additive time series decomposition on the oil dataset.
'''
from pandas import Series
from matplotlib import pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
= list(df_oil.dcoilwtico.dropna())
series = seasonal_decompose(series, model='additive', period=100)
result
result.plot()"oil_price.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
'''
TODO: Apply the Multiplicative time series decomposition on the oil dataset.
'''
from pandas import Series
from matplotlib import pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
= list(df_oil.dcoilwtico.dropna())
series = seasonal_decompose(series, model='multiplicative', period=100)
result
result.plot()"oil_price.pdf", format="pdf", bbox_inches="tight")
plt.savefig( plt.show()
'''
TODO: Apply the STL time series decomposition on the oil dataset.
'''
from statsmodels.tsa.seasonal import STL
= df_oil.dropna()
df_oil_new = list(df_oil_new.dcoilwtico)
dcoilwtico = pd.Series(
oil_data =df_oil_new.date, name="OIL")
dcoilwtico, index= STL(oil_data, period = 100)
stl = stl.fit()
res = res.plot() fig
Demand Forecasting
We will also use the oil dataset. We will remove the values of the last two weeks from the data and use it to check the different forecasting models. We will test forecasting using moving average, linear regression, ARIMA and Exponential Smoothing from the STL package.
We will start by reading the data and splitting the data into train and test (the test will contain the readings of the last two week in the time series).
# Let's read the data again
= pd.read_csv('sample_data/oil.csv', header = 0,
df_oil_FC ='"',sep=",",
quotechar= ['na', '-', '.', ''], low_memory=False) na_values
remove the last two weeks of the data and store it as test data
'''
TODO: store the last two weeks of the data in a test dataframe
the data from all the weeks except the last two weeks in a dataframe train
'''
= df_oil_FC.dropna()
df_oil_FC =True, inplace=True)
df_oil_FC.reset_index(drop= df_oil_FC.index[df_oil_FC.date == '2017-08-16'].tolist()
idx = df_oil_FC.iloc[0:idx[0]]
train = df_oil_FC.iloc[idx[0]:len(df_oil_FC)]
test test
Perform demand forecasting using moving average
'''
TODO: predict the values of the last two weeks using the moving average with k = 5
'''
= 5
k = len(test)
t = len(train) - k
start = len(train)
end = train.dcoilwtico.iloc[start:end].tolist()
new_list
for item in test.date:
= round(np.mean(new_list[-k:]), 2)
windowMA
new_list.append(windowMA)-t:] new_list[
'''
Show the original test values
'''
test.dcoilwtico
'''
TODO: measure the performance of your predictor by computing the MSE
between the predicted and the actual values
'''
= len(test)
t = np.mean((new_list[-t:] - test.dcoilwtico)**2)
MSE MSE
Perform demand forecasting using Linear Regression
'''
TODO: predict the values of the last two weeks using the Linear Regression
'''
= train.dropna()
train =True, inplace=True)
train.reset_index(drop= math.floor(len(train) / 2)
offset 'x'] = train.index - offset
train[
= np.array(train.x)
xc = np.array(train.dcoilwtico)
yc
= LinearRegression()
myLR len(xc),1), yc.reshape(len(yc),1))
myLR.fit(xc.reshape(# Print the intercept and the coefficient of the model
= myLR.intercept_[0]
a0 = myLR.coef_[0][0]
a1 = list()
new_list = np.max(train.x) + 1
xi for item in test.date:
= a0 + a1 * xi
yi
new_list.append(yi)+= 1
xi new_list
'''
TODO: measure the performance of your predictor by computing the MSE
between the predicted and the actual values
'''
= np.mean((new_list - test.dcoilwtico)**2)
MSE MSE
'''
TODO: predict the values of the last two weeks using the Linear Regression
but use the last n data points (n of your choice)
'''
= train.dropna()
train = train[-11:]
train =True, inplace=True)
train.reset_index(drop= math.floor(len(train) / 2)
offset 'x'] = train.index - offset
train[
= np.array(train.x)
xc = np.array(train.dcoilwtico)
yc
= LinearRegression()
myLR len(xc),1), yc.reshape(len(yc),1))
myLR.fit(xc.reshape(# Print the intercept and the coefficient of the model
= myLR.intercept_[0]
a0 = myLR.coef_[0][0]
a1 = list()
new_list = np.max(train.x) + 1
xi for item in test.date:
= a0 + a1 * xi
yi
new_list.append(yi)+= 1
xi new_list
'''
TODO: measure the performance of your predictor by computing the MSE
between the predicted and the actual values
'''
= np.mean((new_list - test.dcoilwtico)**2)
MSE MSE
= train.dropna()
train train
Perform demand forecasting using STL forecasting
We will re-read the data to make sure that the changes that we made earlier are not going to affect the analysis. We will also split the data intio training and testing
'''
TODO: Read the data again and split it into training and testing
where the testing contains the data from the last two weeks
'''
= pd.read_csv('sample_data/oil.csv', header = 0,
df_oil_FC ='"',sep=",",
quotechar= ['na', '-', '.', ''], low_memory=False)
na_values = df_oil_FC.dropna()
df_oil_FC =True, inplace=True)
df_oil_FC.reset_index(drop= df_oil_FC.index[df_oil_FC.date == '2017-08-16'].tolist()
idx = df_oil_FC.iloc[0:idx[0]]
train = df_oil_FC.iloc[idx[0]:len(df_oil_FC)] test
'''
TODO: Use ARIMA from STL package for forecasting the oil prices for the last two weeks.
'''
from statsmodels.tsa.api import STLForecast
from statsmodels.tsa.arima.model import ARIMA
= train.fillna(method = 'pad')
oilData = [i for i in range(oilData.shape[0])]
oilData.index
= STLForecast(oilData.dcoilwtico, ARIMA, \
stlf = dict(order = (1, 1, 0), trend = "t"), period = 100)
model_kwargs = stlf.fit()
res = res.forecast(len(test))
arima_forecasts arima_forecasts
'''
TODO: Use exponential_smoothing from STL package for forecasting the oil prices for the last two weeks.
'''
from statsmodels.tsa.statespace import exponential_smoothing
= exponential_smoothing.ExponentialSmoothing
ES = {"trend": True}
config = STLForecast(oilData.dcoilwtico, ES, model_kwargs=config, period = 100)
stlf = stlf.fit()
res = res.forecast(len(test))
es_forecasts es_forecasts
'''
TODO: measure the performance of your predictor by computing the MSE
between the predicted and the actual values
'''
= np.mean((arima_forecasts - test.dcoilwtico)**2)
MSE_ARIMA = np.mean((es_forecasts - test.dcoilwtico)**2)
MSE_ES print(MSE_ARIMA, MSE_ES)