import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import warnings
# Read the wine-quality CSV into a DataFrame
DATA_PATH = 'wine_quality.csv'
df = pd.read_csv(DATA_PATH)
(1599, 12)
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 0
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 0
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 0
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 1
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 0
# Separate the feature columns (X) from the target column (y)
TARGET = 'quality'
X = df.drop(columns=[TARGET])
y = df[TARGET]

Checking whether our dataset is well balanced.

1    855
0    744
Name: quality, dtype: int64

Building Logistic Regression

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so that no test-set statistics leak into training
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Baseline model; it has to be fitted before the predict() calls further down
lr = LogisticRegression(solver='liblinear', max_iter=1000, random_state=0)
lr.fit(x_train, y_train)
LogisticRegression(max_iter=1000, random_state=0, solver='liblinear')
Scores for both the training and testing sets. We will focus on accuracy for this project.

# Baseline-model predictions: first for the training split (x_train),
# then for the testing split (x_test)
y_pred_train, y_pred_test = lr.predict(x_train), lr.predict(x_test)

Training set score

# true value vs prediction on training set
              precision    recall  f1-score   support

           0       0.73      0.72      0.73       518
           1       0.76      0.77      0.77       601

    accuracy                           0.75      1119
   macro avg       0.75      0.75      0.75      1119
weighted avg       0.75      0.75      0.75      1119

Testing set score

# true value vs prediction on testing set
              precision    recall  f1-score   support

           0       0.74      0.73      0.73       226
           1       0.76      0.77      0.76       254

    accuracy                           0.75       480
   macro avg       0.75      0.75      0.75       480
weighted avg       0.75      0.75      0.75       480

  • Training set accuracy: 75%
  • Testing set accuracy: 75%


Set up code for visualization

# Coarse set of C values (inverse regularization strength) used to draw the
# accuracy-vs-C curves
C_array_initial = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4]

training_score_array = []
test_score_array = []

for c_value in C_array_initial:
    # L1 (lasso) penalty is requested explicitly; the default penalty is L2 (ridge)
    clf = LogisticRegression(solver='liblinear', penalty='l1', C=c_value, random_state=0)
    # the model must be fitted on the training split before it can predict
    clf.fit(x_train, y_train)

    # prediction for the training set
    y_pred_train = clf.predict(x_train)
    # prediction for the testing set
    y_pred_test = clf.predict(x_test)

    # actual training values vs predicted training value
    training_score_array.append(accuracy_score(y_train, y_pred_train))
    # actual testing values vs predicted testing value
    test_score_array.append(accuracy_score(y_test, y_pred_test))
# print(training_score_array)
# print(test_score_array)

Finding the optimal C and penalty values

# Candidate C values: 100 points evenly spaced on a log scale from 1e-6 to 1e3
C_array = np.logspace(start=-6, stop=3, num=100)

# Search grid: both penalty types combined with every candidate C
grid_params = [dict(penalty=['l1', 'l2'], C=C_array)]

Implementing GridSearchCV

from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over grid_params, scored by accuracy
gs = GridSearchCV(
    estimator=lr,
    param_grid=grid_params,
    scoring='accuracy',
    cv=5,
)

# Fit on the training split; best_params_, best_score_ and predict() are only
# available once the search has been fitted
gs.fit(x_train, y_train)
             estimator=LogisticRegression(max_iter=1000, random_state=0,
             param_grid=[{'C': array([1.00000000e-06, 1.23284674e-06, 1.51991108e-06, 1.87381742e-06,
       2.31012970e-06, 2.84803587e-06, 3.51119173e-06, 4.32876128e-06,
       5.33669923e-06, 6.57933225e-06, 8.11130831e-06, 1.00000000e-05,
       1.23284674e-05, 1.51991108e-05, 1.87381742e-05, 2.310...
       8.11130831e+00, 1.00000000e+01, 1.23284674e+01, 1.51991108e+01,
       1.87381742e+01, 2.31012970e+01, 2.84803587e+01, 3.51119173e+01,
       4.32876128e+01, 5.33669923e+01, 6.57933225e+01, 8.11130831e+01,
       1.00000000e+02, 1.23284674e+02, 1.51991108e+02, 1.87381742e+02,
       2.31012970e+02, 2.84803587e+02, 3.51119173e+02, 4.32876128e+02,
       5.33669923e+02, 6.57933225e+02, 8.11130831e+02, 1.00000000e+03]),
                          'penalty': ['l1', 'l2']}],
# Report the best parameter combination found and its cross-validation score
best_params, best_score = gs.best_params_, gs.best_score_
print(best_params, best_score)
{'C': 0.35111917342151344, 'penalty': 'l1'} 0.7488669122357463


# Accuracy on each split as a function of C, drawn on a log-scaled x axis
plt.figure(figsize=(15, 7))
plt.plot(C_array_initial, training_score_array, color='r', marker='o', label='training score')
plt.plot(C_array_initial, test_score_array, color='b', marker='o', label='testing score')
plt.xscale('log')

# make x and y labels and ticks bigger
plt.xlabel('array', fontsize=20)
plt.ylabel('score', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=20)

# vertical marker at the C value selected by the grid search
plt.axvline(x=gs.best_params_['C'], color='green', linestyle='--', label='best_param')

# without a legend the label= arguments above are never displayed
plt.legend(fontsize=20)
# Predictions from the fitted grid search: y_pred_train for the training
# split (x_train), y_pred_test for the testing split (x_test)
y_pred_train, y_pred_test = gs.predict(x_train), gs.predict(x_test)
# true value vs prediction on training set
              precision    recall  f1-score   support

           0       0.72      0.73      0.73       518
           1       0.76      0.76      0.76       601

    accuracy                           0.75      1119
   macro avg       0.74      0.74      0.74      1119
weighted avg       0.75      0.75      0.75      1119

# true value vs prediction on testing set
              precision    recall  f1-score   support

           0       0.73      0.74      0.73       226
           1       0.76      0.76      0.76       254

    accuracy                           0.75       480
   macro avg       0.75      0.75      0.75       480
weighted avg       0.75      0.75      0.75       480

There's no major accuracy difference between our tuned and untuned models.

  • Training set accuracy: 75%
  • Testing set accuracy: 75%

