GridSearchCV
In [20]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the data set
df = pd.read_csv('wine_quality.csv')
print(df.shape)
df.head()
(1599, 12)
Out[2]:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        0
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        0
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        0
3           11.2              0.28         0.56             1.9      0.075                 17.0                  60.0   0.9980  3.16       0.58      9.8        1
4            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        0
In [3]:
# Separate the features (X) from the target (y)
y = df.quality
X = df.drop(columns=['quality'])

Checking whether our dataset is well balanced.

In [4]:
df.quality.value_counts()
Out[4]:
1    855
0    744
Name: quality, dtype: int64
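
For a quick sanity check as proportions rather than raw counts (a minimal sketch; value_counts(normalize=True) returns fractions):

# Class proportions: 855/1599 ≈ 0.535 vs 744/1599 ≈ 0.465,
# so the classes are close enough to balanced for plain accuracy
print(df.quality.value_counts(normalize=True))

Neither class dominates, so accuracy is a reasonable headline metric here.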

Building Logistic Regression

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y,  test_size = 0.30, random_state = 0)
In [6]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
In [7]:
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000, random_state = 0)
In [8]:
lr.fit(x_train, y_train)
Out[8]:
LogisticRegression(max_iter=1000, random_state=0, solver='liblinear')

Evaluation

Scores for both the training and testing sets. We will focus on accuracy for this project.

In [9]:
# predictions for the x_train data
y_pred_train = lr.predict(x_train)

# predictions for the x_test data
y_pred_test = lr.predict(x_test)

Training set score

In [10]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
              precision    recall  f1-score   support

           0       0.73      0.72      0.73       518
           1       0.76      0.77      0.77       601

    accuracy                           0.75      1119
   macro avg       0.75      0.75      0.75      1119
weighted avg       0.75      0.75      0.75      1119

Testing set score

In [11]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
              precision    recall  f1-score   support

           0       0.74      0.73      0.73       226
           1       0.76      0.77      0.76       254

    accuracy                           0.75       480
   macro avg       0.75      0.75      0.75       480
weighted avg       0.75      0.75      0.75       480

  • Training set accuracy: 75%
  • Testing set accuracy: 75%
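
Since confusion_matrix is already imported, we can also break the test-set errors down by class (a small sketch reusing the fitted lr and y_pred_test from above):

# Rows are the true classes (0, 1), columns the predicted classes;
# the diagonal holds the correctly classified wines
print(confusion_matrix(y_test, y_pred_test))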

GridSearchCV

Set up code for visualization

In [21]:
# Determining the C range to be used in our GridSearchCV
C_array_initial = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4]

training_score_array = []
test_score_array = []

for x in C_array_initial:
    clf = LogisticRegression(solver='liblinear', penalty='l1', C=x, random_state=0) # penalty='l1' is Lasso; the default is 'l2' (Ridge)
    clf.fit(x_train, y_train)
    
    # prediction for the training set
    y_pred_train = clf.predict(x_train)
    
    # prediction for the testing set
    y_pred_test = clf.predict(x_test)

    # actual training values vs predicted training value
    training_score_array.append(accuracy_score(y_train, y_pred_train))
    # actual testing values vs predicted testing value
    test_score_array.append(accuracy_score(y_test, y_pred_test))
    
# print(training_score_array)
# print(test_score_array)

Finding the optimal C and penalty values

In [22]:
# search between 1e-6 and 1e3 in 100 log-spaced values
C_array =  np.logspace(-6, 3, 100)

grid_params = [{'penalty' : ['l1','l2'],
                'C'       : C_array}]
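
A quick check on the grid we just defined (a small sketch; np.logspace spaces the values evenly on a logarithmic scale):

# Endpoints of the C grid and the total number of candidates
print(C_array[0], C_array[-1])  # 1e-06 1000.0
print(len(C_array) * 2)         # 100 C values x 2 penalties = 200 combinations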

Implementing GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(estimator = lr, 
                  param_grid = grid_params, 
                  scoring = 'accuracy',
                  cv = 5, )

# fit on the training dataset
gs.fit(x_train, y_train)
Out[23]:
GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=1000, random_state=0,
                                          solver='liblinear'),
             param_grid=[{'C': array([1.00000000e-06, 1.23284674e-06, 1.51991108e-06, 1.87381742e-06,
       2.31012970e-06, 2.84803587e-06, 3.51119173e-06, 4.32876128e-06,
       5.33669923e-06, 6.57933225e-06, 8.11130831e-06, 1.00000000e-05,
       1.23284674e-05, 1.51991108e-05, 1.87381742e-05, 2.310...
       8.11130831e+00, 1.00000000e+01, 1.23284674e+01, 1.51991108e+01,
       1.87381742e+01, 2.31012970e+01, 2.84803587e+01, 3.51119173e+01,
       4.32876128e+01, 5.33669923e+01, 6.57933225e+01, 8.11130831e+01,
       1.00000000e+02, 1.23284674e+02, 1.51991108e+02, 1.87381742e+02,
       2.31012970e+02, 2.84803587e+02, 3.51119173e+02, 4.32876128e+02,
       5.33669923e+02, 6.57933225e+02, 8.11130831e+02, 1.00000000e+03]),
                          'penalty': ['l1', 'l2']}],
             scoring='accuracy')
In [15]:
print(gs.best_params_, gs.best_score_)
{'C': 0.35111917342151344, 'penalty': 'l1'} 0.7488669122357463
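
Beyond best_params_ and best_score_, the full cross-validation table is available in gs.cv_results_; a sketch pulling the top few candidates by mean test score:

# Turn the CV results into a DataFrame and show the best (C, penalty) pairs
results = pd.DataFrame(gs.cv_results_)
cols = ['param_C', 'param_penalty', 'mean_test_score', 'std_test_score']
print(results[cols].sort_values('mean_test_score', ascending=False).head())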

Visualization

In [24]:
plt.figure(figsize = (15,7))
plt.plot(C_array_initial, training_score_array, color='r', marker = 'o', label='training score')
plt.plot(C_array_initial, test_score_array,     color='b', marker = 'o', label='testing score')
plt.xscale('log' )

# make x and y ticks bigger
plt.xlabel('C (regularization strength)', fontsize = 20)
plt.ylabel('score', fontsize = 20)
plt.tick_params(axis='both', which='major', labelsize=20)

plt.axvline(x = gs.best_params_['C'], color ='green', linestyle = '--', label = 'best_param') 
plt.legend()
plt.show()
In [25]:
# Setting y_pred_train variable as our predictions for the x_train data
y_pred_train = gs.predict(x_train)

# Setting y_pred_test variable as our predictions for the x_test data
y_pred_test = gs.predict(x_test)
In [26]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
              precision    recall  f1-score   support

           0       0.72      0.73      0.73       518
           1       0.76      0.76      0.76       601

    accuracy                           0.75      1119
   macro avg       0.74      0.74      0.74      1119
weighted avg       0.75      0.75      0.75      1119

In [28]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
              precision    recall  f1-score   support

           0       0.73      0.74      0.73       226
           1       0.76      0.76      0.76       254

    accuracy                           0.75       480
   macro avg       0.75      0.75      0.75       480
weighted avg       0.75      0.75      0.75       480

There's no major accuracy difference between our tuned and untuned models.

  • Training set accuracy: 75%
  • Testing set accuracy: 75%
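
To make the comparison explicit, both test accuracies can be computed side by side (a small sketch reusing the fitted lr and gs objects from above; GridSearchCV clones its estimator, so lr itself is still the untuned baseline):

# Untuned baseline vs GridSearchCV's refit best estimator on the held-out test set
baseline_acc = accuracy_score(y_test, lr.predict(x_test))
tuned_acc = accuracy_score(y_test, gs.predict(x_test))
print(f'baseline: {baseline_acc:.3f}, tuned: {tuned_acc:.3f}')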
