import numpy as np; np.random.seed(123)
import matplotlib.pyplot as plt
%matplotlib inline
import warnings; warnings.filterwarnings('ignore')
# XOR dataset: class 1 where exactly one of the two features is positive
n_samples = 1000
mu, sigma = 2, 3
X = sigma * np.random.randn(n_samples, 2) + mu
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
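# Sanity check (illustrative addition, not in the original): the mean shift
# mu=2 populates the four quadrants unevenly, so the two XOR classes are
# imbalanced (roughly 60/40 rather than 50/50).
print('Class counts:', np.bincount(y))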
# Plot data
markers = ('s', 'x')
colors = ('red', 'blue')
target_names = ['Class 0', 'Class 1']
for idx, cl in enumerate(np.unique(y)):
    plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                alpha=0.8, c=colors[idx],
                marker=markers[idx], label=target_names[idx], edgecolor='black')
plt.xlabel('$x_1$'); plt.xticks([])
plt.ylabel('$x_2$'); plt.yticks([])
plt.title('Scatter plot of data')
plt.legend(loc="upper left")
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=123)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
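# Quick check (illustrative addition): the scaler is fitted on the training
# set only, so the scaled training features should have ~zero mean and unit
# variance; the test set is transformed with the same statistics, so its
# moments will be close to, but not exactly, 0 and 1.
print('Scaled train mean:', X_train.mean(axis=0).round(3),
      'std:', X_train.std(axis=0).round(3))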
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
from matplotlib.colors import ListedColormap
import seaborn as sns
def run_classifier(clf, param_grid, title):
    """Tune clf with a randomized search, report test-set metrics, and plot
    the confusion matrix, ROC curve, and decision boundary.

    Relies on X_train, X_test, y_train, y_test, colors, markers and
    target_names from the enclosing scope."""
    # -----------------------------------------------------
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
    # Randomized search over the parameter grid
    n_iter_search = 10
    gs = RandomizedSearchCV(clf,
                            param_distributions=param_grid,
                            n_iter=n_iter_search,
                            cv=cv,
                            scoring='accuracy',
                            random_state=123)  # seed the parameter sampling
    # -----------------------------------------------------
    # Train model
    gs.fit(X_train, y_train)
    print("The best parameters are %s" % (gs.best_params_))
    # Predict on test set
    y_pred = gs.best_estimator_.predict(X_test)
    # Probability estimates for the positive class (second column)
    y_prob = gs.best_estimator_.predict_proba(X_test)[:, 1]
    # -----------------------------------------------------
    print('Accuracy score: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('Precision score: %.2f%%' % (precision_score(y_test, y_pred) * 100))
    print('Recall score: %.2f%%' % (recall_score(y_test, y_pred) * 100))
    # -----------------------------------------------------
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(21, 7))
    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, cbar=False, fmt="d", linewidths=.5,
                cmap="Blues", ax=ax1)
    ax1.set_title("Confusion Matrix")
    ax1.set_xlabel("Predicted class")
    ax1.set_ylabel("Actual class")
    # -----------------------------------------------------
    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    ax2.plot(fpr, tpr, lw=2, label='AUC: {:.2f}'.format(auc(fpr, tpr)))
    ax2.plot([0, 1], [0, 1],
             linestyle='--',
             color=(0.6, 0.6, 0.6),
             label='Random guessing')
    ax2.plot([0, 0, 1], [0, 1, 1],
             linestyle=':',
             color='black',
             label='Perfect performance')
    ax2.set_xlim([-0.05, 1.05])
    ax2.set_ylim([-0.05, 1.05])
    ax2.set_xlabel('False Positive Rate (FPR)')
    ax2.set_ylabel('True Positive Rate (TPR)')
    ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax2.legend(loc="lower right")
    # -----------------------------------------------------
    # Plot the decision boundary over the (standardized) test-set range
    cmap = ListedColormap(colors[:len(np.unique(y_test))])
    x1_min, x1_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
    x2_min, x2_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
    resolution = 0.01  # step size in the mesh
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = gs.best_estimator_.predict(np.c_[xx1.ravel(), xx2.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx1.shape)
    ax3.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    # Plot class samples
    for idx, cl in enumerate(np.unique(y_test)):
        ax3.scatter(x=X_test[y_test == cl, 0],
                    y=X_test[y_test == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=target_names[idx],
                    edgecolor='black')
    ax3.set_title('Decision boundary of ' + title)
    ax3.set_xlabel("$x_1$")
    ax3.set_ylabel("$x_2$")
    ax3.set_xlim(xx1.min(), xx1.max())
    ax3.set_ylim(xx2.min(), xx2.max())
    ax3.set_xticks([]); ax3.set_yticks([])
    ax3.legend(loc='lower left')
    fig.tight_layout()
    plt.show()
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
param_grid = {'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
run_classifier(lr, param_grid, 'Logistic Regression')
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 15),
              'weights': ['uniform', 'distance'],
              'leaf_size': [1, 3, 5],
              'algorithm': ['auto', 'kd_tree']}
run_classifier(knn, param_grid, 'Nearest Neighbors')
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
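# GaussianNB has essentially nothing worth tuning here; the single-entry grid
# below just lets it run through the same run_classifier pipeline as the
# other models.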
param_grid = {'priors': [None]}
run_classifier(nb, param_grid, 'Naive Bayes')
from sklearn.svm import SVC
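# probability=True enables probability estimates (fitted via internal
# cross-validation), which the ROC plot in run_classifier needs; it makes
# fitting noticeably slower.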
svm_linear = SVC(kernel="linear", probability=True)
# gamma is ignored by the linear kernel, so only C is worth searching
param_grid = {'C': np.logspace(-2, 2, 5)}
run_classifier(svm_linear, param_grid, 'Linear SVM')
svm_rbf = SVC(kernel="rbf", probability=True)
param_grid = {'gamma': np.logspace(-2, 2, 5),
              'C': np.logspace(-2, 2, 5)}
run_classifier(svm_rbf, param_grid, "RBF SVM")
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(1, 20, 2),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              # 'auto' was removed in scikit-learn 1.3 ('sqrt' is equivalent)
              'max_features': ['sqrt', 'log2', None]}
run_classifier(dtree, param_grid, "Decision Tree")
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
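# This grid has hundreds of combinations; with n_iter=10 the randomized
# search samples only a small fraction of it, so treat the "best" parameters
# as a coarse pick rather than an exhaustive optimum.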
param_grid = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              # 'auto' was removed in scikit-learn 1.3 ('sqrt' is equivalent)
              'max_features': ['sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}
run_classifier(rf, param_grid, 'Random Forest')
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
param_grid = {'priors': [None],
              'reg_param': np.arange(0., 1., 0.1)}
run_classifier(qda, param_grid, "QDA")
from sklearn.ensemble import VotingClassifier
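# Soft voting averages the base estimators' predicted probabilities; the
# 'weights' grid below rescales each estimator's vote, in (knn, dt, svc)
# order. Note these are the untuned base estimators constructed above, not
# the tuned best estimators found by the earlier searches.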
vc = VotingClassifier(estimators=[('knn', knn), ('dt', dtree), ('svc', svm_rbf)],
                      voting='soft')
param_grid = {'weights': [[1, 1, 1], [2, 1, 2], [3, 1, 3]]}
run_classifier(vc, param_grid, "Voting Classifier")
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
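# The MLP grid below is the largest in this notebook (over 2,000
# combinations); as with the random forest, n_iter=10 means only a thin
# slice of it is actually explored.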
param_grid = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50)],
              'activation': ['identity', 'logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': np.logspace(-5, 3, 5),
              'learning_rate': ['constant', 'invscaling', 'adaptive'],
              'max_iter': [100, 500, 1000]}
run_classifier(mlp, param_grid, 'Neural Net')