In [20]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the data set
df = pd.read_csv('wine_quality.csv')
print(df.shape)
df.head()
Out[2]:
In [3]:
# Split the data into training and testing sets
y = df.quality
X = df.drop(columns=['quality'])
Checking if our dataset is well balanced¶
In [4]:
df.quality.value_counts()
Out[4]:
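A quick bar chart makes any class imbalance easier to see than raw counts. A minimal sketch using the seaborn and matplotlib imports from the first cell:
In [ ]:
# Sketch: visualize the class balance of the target
sns.countplot(x='quality', data=df)
plt.xlabel('quality')
plt.ylabel('count')
plt.show()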
Building Logistic Regression¶
In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
In [6]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
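As a sanity check, the scaled training features should have roughly zero mean and unit variance. A sketch (fit_transform returns a NumPy array, so NumPy methods apply directly):
In [ ]:
# Sketch: scaled training features should be ~0 mean, ~1 std per column
print(np.round(x_train.mean(axis=0), 2))
print(np.round(x_train.std(axis=0), 2))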
In [7]:
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000, random_state = 0)
In [8]:
lr.fit(x_train, y_train)
Out[8]:
Evaluation¶
Scores for both the training and testing sets. We will focus on accuracy for this project.
In [9]:
# predictions for the x_train data
y_pred_train = lr.predict(x_train)
# predictions for the x_test data
y_pred_test = lr.predict(x_test)
Training set score
In [10]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
Testing set score
In [11]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
- Training set accuracy: 75%
- Testing set accuracy: 75%
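Accuracy alone can hide which quality levels the model confuses with one another. A minimal sketch of a confusion-matrix heatmap for the test set, using the confusion_matrix and seaborn imports already loaded above:
In [ ]:
# Sketch: where does the model confuse quality levels on the test set?
cm = confusion_matrix(y_test, y_pred_test)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('predicted quality')
plt.ylabel('true quality')
plt.show()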
GridSearchCV¶
Set up code for visualization
In [21]:
# Determining the range of C values to use in our GridSearchCV
C_array_initial = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4]
training_score_array = []
test_score_array = []
for x in C_array_initial:
    clf = LogisticRegression(solver='liblinear', penalty='l1', C=x, random_state=0)  # penalty='l1' (lasso); the sklearn default is 'l2' (ridge)
    clf.fit(x_train, y_train)
    # prediction for the training set
    y_pred_train = clf.predict(x_train)
    # prediction for the testing set
    y_pred_test = clf.predict(x_test)
    # actual training values vs predicted training values
    training_score_array.append(accuracy_score(y_train, y_pred_train))
    # actual testing values vs predicted testing values
    test_score_array.append(accuracy_score(y_test, y_pred_test))
# print(training_score_array)
# print(test_score_array)
Finding the optimal C and penalty values
In [22]:
# search between 1e-6 and 1e3 over 100 log-spaced values
C_array = np.logspace(-6, 3, 100)
grid_params = [{'penalty' : ['l1','l2'],
'C' : C_array}]
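A quick check (sketch) confirms the grid's endpoints and size before fitting:
In [ ]:
# Sketch: the grid spans 1e-06 to 1e+03 in 100 log-spaced steps
print(C_array[0], C_array[-1], len(C_array))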
Implementing GridSearchCV¶
In [23]:
gs = GridSearchCV(estimator=lr,
                  param_grid=grid_params,
                  scoring='accuracy',
                  cv=5)
# fit the training dataset
gs.fit(x_train, y_train)
Out[23]:
In [15]:
print(gs.best_params_, gs.best_score_)
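Beyond the single best combination, gs.cv_results_ holds the scores for every penalty/C pair; a sketch that ranks them by mean cross-validated accuracy:
In [ ]:
# Sketch: top grid combinations by mean CV accuracy
results = pd.DataFrame(gs.cv_results_)
print(results[['param_penalty', 'param_C', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())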
Visualization¶
In [24]:
plt.figure(figsize=(15, 7))
plt.plot(C_array_initial, training_score_array, color='r', marker='o', label='training score')
plt.plot(C_array_initial, test_score_array, color='b', marker='o', label='testing score')
plt.xscale('log')
plt.xlabel('C (inverse regularization strength)', fontsize=20)
plt.ylabel('accuracy score', fontsize=20)
# make x and y ticks bigger
plt.tick_params(axis='both', which='major', labelsize=20)
plt.axvline(x=gs.best_params_['C'], color='green', linestyle='--', label='best C from GridSearchCV')
plt.legend()
plt.show()
In [25]:
# Setting y_pred_train variable as our predictions for the x_train data
y_pred_train = gs.predict(x_train)
# Setting y_pred_test variable as our predictions for the x_test data
y_pred_test = gs.predict(x_test)
In [26]:
# true value vs prediction on training set
print(classification_report(y_train,y_pred_train))
In [28]:
# true value vs prediction on testing set
print(classification_report(y_test,y_pred_test))
There's no major accuracy difference between our tuned and untuned models.
- Training set accuracy: 75%
- Testing set accuracy: 75%
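To make the comparison explicit, a sketch that prints both test accuracies side by side (GridSearchCV clones its estimator, so the lr fit earlier is still the untuned baseline):
In [ ]:
# Sketch: untuned baseline vs tuned model on the test set
print('baseline accuracy:', accuracy_score(y_test, lr.predict(x_test)))
print('tuned accuracy:   ', accuracy_score(y_test, gs.predict(x_test)))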