import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import warnings
# Read the wine-quality table from disk
df = pd.read_csv('wine_quality.csv')
# Separate the target column (quality) from the remaining feature columns
y, X = df.quality, df.drop(columns=['quality'])
Checking whether our dataset is well balanced.
Building the Logistic Regression model
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Baseline model with scikit-learn's default regularisation settings
lr = LogisticRegression(solver='liblinear', max_iter=1000, random_state=0)
# Train the baseline model; the predict() calls further down need a fitted estimator
lr.fit(x_train, y_train)
Scores for both the training and testing sets. We will focus on accuracy for this project.
# Baseline-model predictions: first for the training split, then for the testing split
y_pred_train, y_pred_test = lr.predict(x_train), lr.predict(x_test)
Training set score
# true value vs prediction on training set
Testing set score
# true value vs prediction on testing set
- Training set accuracy: 75%
- Testing set accuracy: 75%
Set up code for visualization
# Coarse sweep over the inverse regularisation strength C. The train/test
# accuracy at each value is recorded so the curves can be plotted and used to
# judge the range worth searching with GridSearchCV.
C_array_initial = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4]
training_score_array = []
test_score_array = []
for x in C_array_initial:
    # L1 (Lasso) is requested explicitly here; the default penalty is L2 (Ridge)
    clf = LogisticRegression(solver='liblinear', penalty='l1', C=x, random_state=0)
    # each candidate C needs its own fitted model before it can predict
    clf.fit(x_train, y_train)
    # prediction for the training set
    y_pred_train = clf.predict(x_train)
    # prediction for the testing set
    y_pred_test = clf.predict(x_test)
    # actual training values vs predicted training value
    training_score_array.append(accuracy_score(y_train, y_pred_train))
    # actual testing values vs predicted testing value
    test_score_array.append(accuracy_score(y_test, y_pred_test))
# print(training_score_array)
# print(test_score_array)
Finding the optimal C and penalty values
# Candidate C values: 100 logarithmically spaced points from 1e-6 to 1e3
C_array = np.logspace(start=-6, stop=3, num=100)
# One grid that pairs every candidate C with both penalty types
grid_params = [dict(penalty=['l1', 'l2'], C=C_array)]
Implementing GridSearchCV
# Exhaustive search over every (penalty, C) pair in grid_params, scored by
# accuracy with 5-fold cross-validation. GridSearchCV is already imported at
# the top of the file.
gs = GridSearchCV(
    estimator=lr,
    param_grid=grid_params,
    scoring='accuracy',
    cv=5,
)
# fit training dataset; best_params_ / best_score_ only exist after fitting
gs.fit(x_train, y_train)
print(gs.best_params_, gs.best_score_)
# Accuracy of the coarse C sweep on both splits, with the grid-search optimum marked
plt.figure(figsize=(15, 7))
plt.plot(C_array_initial, training_score_array, color='r', marker='o', label='training score')
plt.plot(C_array_initial, test_score_array, color='b', marker='o', label='testing score')
# the swept C values span several orders of magnitude, so use a log-scaled x axis
plt.xscale('log')
plt.xlabel('array', fontsize=20)
plt.ylabel('score', fontsize=20)
# make x and y ticks bigger
plt.tick_params(axis='both', which='major', labelsize=20)
# vertical marker at the C value selected by the grid search
plt.axvline(x=gs.best_params_['C'], color='green', linestyle='--', label='best_param')
# without a legend the label= arguments above are never displayed
plt.legend(fontsize=20)
# Predictions from the grid search object: first for the training data
# (x_train), then for the testing data (x_test)
y_pred_train, y_pred_test = gs.predict(x_train), gs.predict(x_test)
# true value vs prediction on training set
# true value vs prediction on testing set
There's no major accuracy difference between our tuned and untuned models.
- Training set accuracy: 75%
- Testing set accuracy: 75%