Load required packages:
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn import metrics
%matplotlib inline
from matplotlib.pylab import rcParams
import seaborn as sns
import warnings
import itertools
warnings.filterwarnings("ignore") # specify to ignore warning messages
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
In this section, we study a store's customer data set. The customers have been rated with a score ranging from 1 to 100 (Spending Score (1-100)) according to their purchase frequency and other conditions.
# Read the store's customer records from the Colab filesystem into a DataFrame.
customers_csv = '/content/customers.csv'
data_x = pd.read_csv(customers_csv)
# Preview the first five rows.
data_x.head(n=5)
Let's set "Spending Score (1-100)" as our target variable to be predicted.
# Show pandas' default descriptive statistics for the customer DataFrame.
data_x.describe()
Split our data into training and test sets.
# Feature: the row position of each customer, as a single-column 2-D array.
n_rows = data_x.shape[0]
x = np.arange(n_rows).reshape(-1, 1)
# Target: the spending score, reshaped to a single column to match x.
y = data_x['Spending Score (1-100)'].values.reshape(-1, 1)

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (70:30 train:test); the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, train_size=0.70, random_state=123
)

# Report the size of every array involved in the split.
split_report = (
    "len(X): {} len(y): {} \n"
    "len(X_train): {}, len(X_test): {} \n"
    "len(y_train): {}, len(y_test): {}"
)
print(split_report.format(*map(len, (x, y, X_train, X_test, y_train, y_test))))
# Fit a linear regression on the training split (fit() returns the estimator).
regressor = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and report the error metrics.
pred = regressor.predict(X_test)
mse = metrics.mean_squared_error(y_test, pred)
mae = metrics.mean_absolute_error(y_test, pred)
print('RMSE:', np.sqrt(mse))
print('MAE:', mae)
# Scatter of predicted vs. true values on the test split.
plt.figure(figsize=(4, 3))
plt.scatter(y_test, pred)
plt.axis('tight')
# The target of this model is "Spending Score (1-100)", not a price.
plt.xlabel('True Spending Score')
plt.ylabel('Predicted Spending Score')
plt.tight_layout()

# Distribution of the residuals (true - predicted). Open a new figure first:
# distplot draws on the current axes, so without it the histogram would land
# on the scatter plot's axes when both run in the same cell or script.
plt.figure()
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
sns.distplot((y_test - pred), bins=50);
# Read the avocado price records from the Colab filesystem into a DataFrame.
avocado_csv = '/content/avocado.csv'
data = pd.read_csv(avocado_csv)
# Preview the first five rows.
data.head(n=5)
Some relevant columns in the dataset:
Date - The date of the observation
AveragePrice - the average price of a single avocado
# Feature: the row position of each observation, as a single-column 2-D array.
n_obs = data.shape[0]
X = np.arange(n_obs).reshape(-1, 1)
# Target: the average avocado price, reshaped to a single column to match X.
Y = data['AveragePrice'].values.reshape(-1, 1)

# Hold out 10% of the rows for testing (90:10 train:test); the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.90, random_state=123
)

# Report the size of every array involved in the split.
split_report = (
    "len(X): {} len(y): {} \n"
    "len(X_train): {}, len(X_test): {} \n"
    "len(y_train): {}, len(y_test): {}"
)
print(split_report.format(*map(len, (X, Y, X_train, X_test, y_train, y_test))))
# Fit a linear regression on the avocado training split (fit() returns the
# estimator).
regressor2 = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and report the error metrics.
pred2 = regressor2.predict(X_test)
mse2 = metrics.mean_squared_error(y_test, pred2)
mae2 = metrics.mean_absolute_error(y_test, pred2)
print('RMSE:', np.sqrt(mse2))
print('MAE:', mae2)
# Scatter of predicted vs. true AveragePrice on the test split.
plt.figure(figsize=(7, 4))
plt.scatter(y_test, pred2)
plt.axis('tight')
plt.xlabel('True AveragePrice')
plt.ylabel('Predicted AveragePrice')
plt.tight_layout()

# Distribution of the residuals (true - predicted). Open a new figure first:
# distplot draws on the current axes, so without it the histogram would land
# on the scatter plot's axes when both run in the same cell or script.
plt.figure()
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
sns.distplot((y_test - pred2), bins=50);