0. Data processing

In [1]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [2]:
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn import metrics

%matplotlib inline
from matplotlib.pylab import rcParams
import matplotlib.dates as mdates
import seaborn as sns
In [3]:
import warnings
import itertools
warnings.filterwarnings("ignore") # specify to ignore warning messages
In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

Load input data.

In [5]:
data = pd.read_csv('/content/avocado.csv')

data.head(5)
Out[5]:
Unnamed: 0 Date AveragePrice Total Volume 4046 4225 4770 Total Bags Small Bags Large Bags XLarge Bags type year region
0 0 2015-12-27 1.33 64236.62 1036.74 54454.85 48.16 8696.87 8603.62 93.25 0.0 conventional 2015 Albany
1 1 2015-12-20 1.35 54876.98 674.28 44638.81 58.33 9505.56 9408.07 97.49 0.0 conventional 2015 Albany
2 2 2015-12-13 0.93 118220.22 794.70 109149.67 130.50 8145.35 8042.21 103.14 0.0 conventional 2015 Albany
3 3 2015-12-06 1.08 78992.15 1132.00 71976.41 72.58 5811.16 5677.40 133.76 0.0 conventional 2015 Albany
4 4 2015-11-29 1.28 51039.60 941.48 43838.39 75.78 6183.95 5986.26 197.69 0.0 conventional 2015 Albany
In [6]:
data.shape
Out[6]:
(18249, 14)

Some relevant variables from the dataset:

  • Date - The date of the observation

  • AveragePrice - the average price of a single avocado

  • type - conventional or organic

  • year - the year

  • Region - the city or region of the observation

  • Total Volume - Total number of avocados sold

In [7]:
data.isnull().sum()     # is there any NULL variable in the dataset?
Out[7]:
Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64
In [8]:
data.describe().round(2)
Out[8]:
Unnamed: 0 AveragePrice Total Volume 4046 4225 4770 Total Bags Small Bags Large Bags XLarge Bags year
count 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00 18249.00
mean 24.23 1.41 850644.01 293008.42 295154.57 22839.74 239639.20 182194.69 54338.09 3106.43 2016.15
std 15.48 0.40 3453545.36 1264989.08 1204120.40 107464.07 986242.40 746178.51 243965.96 17692.89 0.94
min 0.00 0.44 84.56 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2015.00
25% 10.00 1.10 10838.58 854.07 3008.78 0.00 5088.64 2849.42 127.47 0.00 2015.00
50% 24.00 1.37 107376.76 8645.30 29061.02 184.99 39743.83 26362.82 2647.71 0.00 2016.00
75% 38.00 1.66 432962.29 111020.20 150206.86 6243.42 110783.37 83337.67 22029.25 132.50 2017.00
max 52.00 3.25 62505646.52 22743616.17 20470572.61 2546439.11 19373134.37 13384586.80 5719096.61 551693.65 2018.00
In [9]:
# convert Date column's format;

data['Date'] =pd.to_datetime(data.Date)

data.sort_values(by=['Date'], inplace=True, ascending=True)

data.head()
Out[9]:
Unnamed: 0 Date AveragePrice Total Volume 4046 4225 4770 Total Bags Small Bags Large Bags XLarge Bags type year region
11569 51 2015-01-04 1.75 27365.89 9307.34 3844.81 615.28 13598.46 13061.10 537.36 0.0 organic 2015 Southeast
9593 51 2015-01-04 1.49 17723.17 1189.35 15628.27 0.00 905.55 905.55 0.00 0.0 organic 2015 Chicago
10009 51 2015-01-04 1.68 2896.72 161.68 206.96 0.00 2528.08 2528.08 0.00 0.0 organic 2015 HarrisburgScranton
1819 51 2015-01-04 1.52 54956.80 3013.04 35456.88 1561.70 14925.18 11264.80 3660.38 0.0 conventional 2015 Pittsburgh
9333 51 2015-01-04 1.64 1505.12 1.27 1129.50 0.00 374.35 186.67 187.68 0.0 organic 2015 Boise
In [10]:
# TIME SERIES ANALYSIS
df = data[['Date', 'AveragePrice']]
df = df.set_index('Date')

weekly_df = df.resample('W').mean()
w_df = weekly_df.reset_index().dropna()

w_df.sort_values(by=['Date'])
w_df.head()
Out[10]:
Date AveragePrice
0 2015-01-04 1.301296
1 2015-01-11 1.370648
2 2015-01-18 1.391111
3 2015-01-25 1.397130
4 2015-02-01 1.247037
In [11]:
# Plotting the weekly average prices by month;

fig = plt.figure(figsize = (27, 10))
ax = plt.axes()
#set ticks every month
ax.xaxis.set_major_locator(mdates.MonthLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.plot(w_df['Date'],w_df['AveragePrice'],color='b', linewidth=1)
plt.xlabel("Date")
plt.ylabel("Avocado Price USD")
plt.show()

Facebook Prophet

In [12]:
import logging
logging.getLogger('fbprophet').setLevel(logging.ERROR)
import fbprophet
Prophet = fbprophet.Prophet

Prophet requires the variable names in the time series to be:

  • y – Target

  • ds – Datetime

In [13]:
# Time Series Forecasts using Facebook's Prophet()
w_df1= w_df
w_df.columns = ['ds', 'y']
TEST_SIZE = 12
train, test = w_df.iloc[:-TEST_SIZE], w_df.iloc[-TEST_SIZE:]
In [14]:
model_P=Prophet(interval_width=0.95, yearly_seasonality=True, daily_seasonality=True, weekly_seasonality=False, changepoint_range=1) 
#interval_width sets the uncertainty interval to produce a confidence interval around the forecast

model_P.add_seasonality(name='monthly', period=30.5, fourier_order=5, prior_scale=0.02)

model_P.fit(train)
Out[14]:
<fbprophet.forecaster.Prophet at 0x7f5d068a7850>
In [15]:
future = model_P.make_future_dataframe(freq='W', periods=12)
forecast = model_P.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[15]:
ds yhat yhat_lower yhat_upper
164 2018-02-25 1.494845 1.364522 1.633604
165 2018-03-04 1.502636 1.377978 1.634513
166 2018-03-11 1.506324 1.369522 1.629194
167 2018-03-18 1.536801 1.396259 1.658263
168 2018-03-25 1.543474 1.412064 1.670589
In [16]:
from fbprophet.plot import add_changepoints_to_plot
fig = model_P.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), model_P, forecast)
In [17]:
plt.rcParams.update({'font.size': 12})
f = model_P.plot_components(forecast)
plt.savefig('forecast.png')
plt.show()
In [18]:
strt='2017-12-31'
end='2018-03-25'

predic = forecast[(forecast['ds']>strt) & (forecast['ds']<=end)]
print('MAE:', metrics.mean_absolute_error(test['y'],abs(predic ['yhat'])))
print('MSE:', metrics.mean_squared_error(test['y'],abs(predic ['yhat'])))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test['y'],abs(predic ['yhat']))))
MAE: 0.12816040945292032
MSE: 0.019015796943692553
RMSE: 0.13789777715283358

XGBoost

In [19]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
In [20]:
TEST_SIZE = 12
train, test = w_df1.iloc[:-TEST_SIZE], w_df1.iloc[-TEST_SIZE:]
In [21]:
X_train = np.arange(train.shape[0]).reshape((-1,1))
y_train = train['y'].values.reshape((-1,1))
X_test = np.arange(test.shape[0]).reshape((-1,1))
y_test = test['y'].values.reshape((-1,1))
X_train.shape , y_test.shape
Out[21]:
((157, 1), (12, 1))
In [22]:
reg = xgb.XGBRegressor(max_depth=4, n_estimators=100)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=20,
       verbose=False)
[10:56:36] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[22]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

Forecast on Test Set

In [23]:
pred= reg.predict(X_test)
In [24]:
# plot the results 
rcParams['figure.figsize']=7,5
plt.plot(y_test, color = 'blue',  label= 'real' )
plt.plot(pred, color = 'red', ls='-.', label = 'predictions')
plt.ylabel("Avocado Price USD")
plt.legend(loc=4)
plt.show()
In [25]:
print('MAE:', metrics.mean_absolute_error(y_test, pred))
print('MSE:', metrics.mean_squared_error(y_test, pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
MAE: 0.024610141545166147
MSE: 0.0009717965220466834
RMSE: 0.03117365108624082

SVR

In [26]:
for k in ['linear','poly','rbf','sigmoid']:
    clf = SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_train, y_train)
    print(k,confidence)
linear 0.22345923739280782
poly 0.33433323459615655
rbf 0.6142189264984235
sigmoid -3900.9893999837946
In [27]:
model_svr=SVR(kernel='rbf', C=1, gamma= 0.5)   # Parameter Tuning to get the best accuracy
model_svr.fit(X_train,y_train)
print(model_svr.score(X_train,y_train))
0.7715274615319683

Forecast on Test Set

In [28]:
pred1= model_svr.predict(X_test)
In [29]:
print('MAE:', metrics.mean_absolute_error(y_test, pred1))
print('MSE:', metrics.mean_squared_error(y_test, pred1))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred1)))
MAE: 0.058830839262631295
MSE: 0.004429573739030649
RMSE: 0.06655504292711897
In [30]:
# plot the results 
rcParams['figure.figsize']=7,5
plt.plot(y_test, color = 'blue',  label= 'real' )
plt.plot(pred1, color = 'red', ls='-.', label = 'predictions SVR')
plt.plot(pred, color = 'orange', ls='--', label = 'predictions XGBoost')
plt.ylabel("Avocado Price USD")
plt.legend(loc=4)
plt.show()

Comparing the RMSE values of the models

In [31]:
print('RMSE Facebook Prophet:', np.sqrt(metrics.mean_squared_error(test['y'],abs(predic ['yhat']))))
print('RMSE XGBoost:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('RMSE SVR:', np.sqrt(metrics.mean_squared_error(y_test, pred1)))
RMSE Facebook Prophet: 0.13789777715283358
RMSE XGBoost: 0.03117365108624082
RMSE SVR: 0.06655504292711897