0. Data processing¶

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn import metrics

%matplotlib inline
from matplotlib.pylab import rcParams
import matplotlib.dates as mdates
import seaborn as sns

import warnings
import itertools
warnings.filterwarnings("ignore") # specify to ignore warning messages

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

Load input data.

data = pd.read_csv('/content/avocado.csv')

data.head(5)

data.shape

(18249, 14)

Some relevant variables from the dataset:

Date - The date of the observation
AveragePrice - the average price of a single avocado
type - conventional or organic
year - the year
Region - the city or region of the observation
Total Volume - Total number of avocados sold

data.isnull().sum()     # is there any NULL variable in the dataset?

Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64

data.describe().round(2)

# convert Date column's format;

data['Date'] =pd.to_datetime(data.Date)

data.sort_values(by=['Date'], inplace=True, ascending=True)

data.head()

# TIME SERIES ANALYSIS
df = data[['Date', 'AveragePrice']]
df = df.set_index('Date')

weekly_df = df.resample('W').mean()
w_df = weekly_df.reset_index().dropna()

w_df.sort_values(by=['Date'])
w_df.head()

# Plotting the weekly average prices by month;

fig = plt.figure(figsize = (27, 10))
ax = plt.axes()
#set ticks every month
ax.xaxis.set_major_locator(mdates.MonthLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.plot(w_df['Date'],w_df['AveragePrice'],color='b', linewidth=1)
plt.xlabel("Date")
plt.ylabel("Avocado Price USD")
plt.show()

Facebook Prophet¶

import logging
logging.getLogger('fbprophet').setLevel(logging.ERROR)
import fbprophet
Prophet = fbprophet.Prophet

Prophet requires the variable names in the time series to be:

y – Target
ds – Datetime

# Time Series Forecasts using Facebook's Prophet()
w_df1= w_df
w_df.columns = ['ds', 'y']
TEST_SIZE = 12
train, test = w_df.iloc[:-TEST_SIZE], w_df.iloc[-TEST_SIZE:]

model_P=Prophet(interval_width=0.95, yearly_seasonality=True, daily_seasonality=True, weekly_seasonality=False, changepoint_range=1) 
#interval_width sets the uncertainty interval to produce a confidence interval around the forecast

model_P.add_seasonality(name='monthly', period=30.5, fourier_order=5, prior_scale=0.02)

model_P.fit(train)

<fbprophet.forecaster.Prophet at 0x7f5d068a7850>

future = model_P.make_future_dataframe(freq='W', periods=12)
forecast = model_P.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

from fbprophet.plot import add_changepoints_to_plot
fig = model_P.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), model_P, forecast)

plt.rcParams.update({'font.size': 12})
f = model_P.plot_components(forecast)
plt.savefig('forecast.png')
plt.show()

strt='2017-12-31'
end='2018-03-25'

predic = forecast[(forecast['ds']>strt) & (forecast['ds']<=end)]
print('MAE:', metrics.mean_absolute_error(test['y'],abs(predic ['yhat'])))
print('MSE:', metrics.mean_squared_error(test['y'],abs(predic ['yhat'])))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test['y'],abs(predic ['yhat']))))

MAE: 0.12816040945292032
MSE: 0.019015796943692553
RMSE: 0.13789777715283358

XGBoost¶

import xgboost as xgb
from xgboost import plot_importance, plot_tree

TEST_SIZE = 12
train, test = w_df1.iloc[:-TEST_SIZE], w_df1.iloc[-TEST_SIZE:]

X_train = np.arange(train.shape[0]).reshape((-1,1))
y_train = train['y'].values.reshape((-1,1))
X_test = np.arange(test.shape[0]).reshape((-1,1))
y_test = test['y'].values.reshape((-1,1))
X_train.shape , y_test.shape

((157, 1), (12, 1))

reg = xgb.XGBRegressor(max_depth=4, n_estimators=100)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=20,
       verbose=False)

[10:56:36] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

Forecast on Test Set

pred= reg.predict(X_test)

# plot the results 
rcParams['figure.figsize']=7,5
plt.plot(y_test, color = 'blue',  label= 'real' )
plt.plot(pred, color = 'red', ls='-.', label = 'predictions')
plt.ylabel("Avocado Price USD")
plt.legend(loc=4)
plt.show()

print('MAE:', metrics.mean_absolute_error(y_test, pred))
print('MSE:', metrics.mean_squared_error(y_test, pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

MAE: 0.024610141545166147
MSE: 0.0009717965220466834
RMSE: 0.03117365108624082

SVR¶

for k in ['linear','poly','rbf','sigmoid']:
    clf = SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_train, y_train)
    print(k,confidence)

linear 0.22345923739280782
poly 0.33433323459615655
rbf 0.6142189264984235
sigmoid -3900.9893999837946

model_svr=SVR(kernel='rbf', C=1, gamma= 0.5)   # Parameter Tuning to get the best accuracy
model_svr.fit(X_train,y_train)
print(model_svr.score(X_train,y_train))

0.7715274615319683

Forecast on Test Set

pred1= model_svr.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_test, pred1))
print('MSE:', metrics.mean_squared_error(y_test, pred1))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred1)))

MAE: 0.058830839262631295
MSE: 0.004429573739030649
RMSE: 0.06655504292711897

# plot the results 
rcParams['figure.figsize']=7,5
plt.plot(y_test, color = 'blue',  label= 'real' )
plt.plot(pred1, color = 'red', ls='-.', label = 'predictions SVR')
plt.plot(pred, color = 'orange', ls='--', label = 'predictions XGBoost')
plt.ylabel("Avocado Price USD")
plt.legend(loc=4)
plt.show()

Comparing the RMSE values of the models¶

print('RMSE Facebook Prophet:', np.sqrt(metrics.mean_squared_error(test['y'],abs(predic ['yhat']))))
print('RMSE XGBoost:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('RMSE SVR:', np.sqrt(metrics.mean_squared_error(y_test, pred1)))

RMSE Facebook Prophet: 0.13789777715283358
RMSE XGBoost: 0.03117365108624082
RMSE SVR: 0.06655504292711897

	Unnamed: 0	Date	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	type	year	region
0	0	2015-12-27	1.33	64236.62	1036.74	54454.85	48.16	8696.87	8603.62	93.25	conventional	2015	Albany
1	1	2015-12-20	1.35	54876.98	674.28	44638.81	58.33	9505.56	9408.07	97.49	conventional	2015	Albany
2	2	2015-12-13	0.93	118220.22	794.70	109149.67	130.50	8145.35	8042.21	103.14	conventional	2015	Albany
3	3	2015-12-06	1.08	78992.15	1132.00	71976.41	72.58	5811.16	5677.40	133.76	conventional	2015	Albany
4	4	2015-11-29	1.28	51039.60	941.48	43838.39	75.78	6183.95	5986.26	197.69	conventional	2015	Albany

	Unnamed: 0	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year
count	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00	18249.00
mean	24.23	1.41	850644.01	293008.42	295154.57	22839.74	239639.20	182194.69	54338.09	3106.43	2016.15
std	15.48	0.40	3453545.36	1264989.08	1204120.40	107464.07	986242.40	746178.51	243965.96	17692.89	0.94
min	0.00	0.44	84.56	0.00	0.00	0.00	0.00	0.00	0.00	0.00	2015.00
25%	10.00	1.10	10838.58	854.07	3008.78	0.00	5088.64	2849.42	127.47	0.00	2015.00
50%	24.00	1.37	107376.76	8645.30	29061.02	184.99	39743.83	26362.82	2647.71	0.00	2016.00
75%	38.00	1.66	432962.29	111020.20	150206.86	6243.42	110783.37	83337.67	22029.25	132.50	2017.00
max	52.00	3.25	62505646.52	22743616.17	20470572.61	2546439.11	19373134.37	13384586.80	5719096.61	551693.65	2018.00

	Unnamed: 0	Date	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	type	year	region
11569	51	2015-01-04	1.75	27365.89	9307.34	3844.81	615.28	13598.46	13061.10	537.36	organic	2015	Southeast
9593	51	2015-01-04	1.49	17723.17	1189.35	15628.27	0.00	905.55	905.55	0.00	organic	2015	Chicago
10009	51	2015-01-04	1.68	2896.72	161.68	206.96	0.00	2528.08	2528.08	0.00	organic	2015	HarrisburgScranton
1819	51	2015-01-04	1.52	54956.80	3013.04	35456.88	1561.70	14925.18	11264.80	3660.38	conventional	2015	Pittsburgh
9333	51	2015-01-04	1.64	1505.12	1.27	1129.50	0.00	374.35	186.67	187.68	organic	2015	Boise

	Date	AveragePrice
0	2015-01-04	1.301296
1	2015-01-11	1.370648
2	2015-01-18	1.391111
3	2015-01-25	1.397130
4	2015-02-01	1.247037

	ds	yhat	yhat_lower	yhat_upper
164	2018-02-25	1.494845	1.364522	1.633604
165	2018-03-04	1.502636	1.377978	1.634513
166	2018-03-11	1.506324	1.369522	1.629194
167	2018-03-18	1.536801	1.396259	1.658263
168	2018-03-25	1.543474	1.412064	1.670589