In [20]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the data set
df = pd.read_csv('wine_quality.csv')
print(df.shape)
df.head()
Out[2]:
In [3]:
# Split the data into training and testing sets
y = df.quality
X = df.drop(columns=['quality'])
Checking if our dataset is well balanced¶
In [4]:
df.quality.value_counts()
Out[4]:
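A quick bar chart makes any class imbalance easier to see than raw counts. A minimal sketch using the seaborn and matplotlib imports from the first cell:
In [ ]:
# Sketch: visualize the class balance of the target
sns.countplot(x='quality', data=df)
plt.xlabel('quality')
plt.ylabel('count')
plt.show()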
Building Logistic Regression¶
In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
In [6]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
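As a sanity check, the scaled training features should have roughly zero mean and unit variance. A sketch (fit_transform returns a NumPy array, so NumPy methods apply directly):
In [ ]:
# Sketch: scaled training features should be ~0 mean, ~1 std per column
print(np.round(x_train.mean(axis=0), 2))
print(np.round(x_train.std(axis=0), 2))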
In [7]:
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000, random_state = 0)
In [8]:
lr.fit(x_train, y_train)
Out[8]:
Evaluation¶
Scores for both the training and testing sets. We will focus on accuracy for this project.
In [9]:
# predictions for the x_train data
y_pred_train = lr.predict(x_train)
# predictions for the x_test data
y_pred_test = lr.predict(x_test)
Training set score
In [10]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
Testing set score
In [11]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
- Training set accuracy: 75%
- Testing set accuracy: 75%
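Accuracy alone can hide which quality levels the model confuses with one another. A minimal sketch of a confusion-matrix heatmap for the test set, using the confusion_matrix and seaborn imports already loaded above:
In [ ]:
# Sketch: where does the model confuse quality levels on the test set?
cm = confusion_matrix(y_test, y_pred_test)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('predicted quality')
plt.ylabel('true quality')
plt.show()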
GridSearchCV¶
Set up code for visualization
In [21]:
# Determining the range of C values to use in our GridSearchCV
C_array_initial = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4]
training_score_array = []
test_score_array = []
for x in C_array_initial:
    clf = LogisticRegression(solver='liblinear', penalty='l1', C=x, random_state=0)  # penalty='l1' (lasso); the sklearn default is 'l2' (ridge)
    clf.fit(x_train, y_train)
    # prediction for the training set
    y_pred_train = clf.predict(x_train)
    # prediction for the testing set
    y_pred_test = clf.predict(x_test)
    # actual training values vs predicted training values
    training_score_array.append(accuracy_score(y_train, y_pred_train))
    # actual testing values vs predicted testing values
    test_score_array.append(accuracy_score(y_test, y_pred_test))
# print(training_score_array)
# print(test_score_array)
Finding the optimal C and penalty values
In [22]:
# search between 1e-6 and 1e3 over 100 log-spaced values
C_array = np.logspace(-6, 3, 100)
grid_params = [{'penalty' : ['l1','l2'],
'C' : C_array}]
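A quick check (sketch) confirms the grid's endpoints and size before fitting:
In [ ]:
# Sketch: the grid spans 1e-06 to 1e+03 in 100 log-spaced steps
print(C_array[0], C_array[-1], len(C_array))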
Implementing GridSearchCV¶
In [23]:
gs = GridSearchCV(estimator=lr,
                  param_grid=grid_params,
                  scoring='accuracy',
                  cv=5)
# fit the training dataset
gs.fit(x_train, y_train)
Out[23]:
In [15]:
print(gs.best_params_, gs.best_score_)
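Beyond the single best combination, gs.cv_results_ holds the scores for every penalty/C pair; a sketch that ranks them by mean cross-validated accuracy:
In [ ]:
# Sketch: top grid combinations by mean CV accuracy
results = pd.DataFrame(gs.cv_results_)
print(results[['param_penalty', 'param_C', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())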
Visualization¶
In [24]:
plt.figure(figsize=(15, 7))
plt.plot(C_array_initial, training_score_array, color='r', marker='o', label='training score')
plt.plot(C_array_initial, test_score_array, color='b', marker='o', label='testing score')
plt.xscale('log')
plt.xlabel('C (inverse regularization strength)', fontsize=20)
plt.ylabel('accuracy score', fontsize=20)
# make x and y ticks bigger
plt.tick_params(axis='both', which='major', labelsize=20)
plt.axvline(x=gs.best_params_['C'], color='green', linestyle='--', label='best C from GridSearchCV')
plt.legend()
plt.show()
In [25]:
# Setting y_pred_train variable as our predictions for the x_train data
y_pred_train = gs.predict(x_train)
# Setting y_pred_test variable as our predictions for the x_test data
y_pred_test = gs.predict(x_test)
In [26]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
In [28]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
There's no major accuracy difference between our tuned and untuned models.
- Training set accuracy: 75%
- Testing set accuracy: 75%
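To make the comparison explicit, a sketch that prints both test accuracies side by side (GridSearchCV clones its estimator, so the lr fit earlier is still the untuned baseline):
In [ ]:
# Sketch: untuned baseline vs tuned model on the test set
print('baseline accuracy:', accuracy_score(y_test, lr.predict(x_test)))
print('tuned accuracy:   ', accuracy_score(y_test, gs.predict(x_test)))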