from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit, train_test_split,
StratifiedKFold, GroupShuffleSplit,
GroupKFold, StratifiedShuffleSplit)
import numpy as np
import matplotlib.pyplot as plt
import mglearn
from matplotlib.patches import Patch
np.random.seed(1338)
cmap_data = plt.cm.brg
cmap_group = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4
# Generate the class/group data
n_points = 100
X = np.random.randn(100, 10)
percentiles_classes = [.1, .3, .6]
y = np.hstack([[ii] * int(100 * perc)
for ii, perc in enumerate(percentiles_classes)])
# Generate groups of uneven size (Dirichlet-multinomial draw)
rng = np.random.RandomState(42)
group_prior = rng.dirichlet([2]*10)
rng.multinomial(100, group_prior)
groups = np.repeat(np.arange(10), rng.multinomial(100, group_prior))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
fig = plt.figure()
ax = fig.add_subplot(111)
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train,
markers='o', ax=ax)
mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], y_test,
markers='^', ax=ax)
ax.legend(["Train class 0", "Train class 1", "Train class 2", "Test class 0",
"Test class 1", "Test class 2"], ncol=6, loc=(-0.1, 1.1));
def plot_cv_indices(cv, X, y, group, ax, lw=2, show_groups=False, s=700, legend=True):
"""Create a sample plot for indices of a cross-validation object."""
n_splits = cv.get_n_splits(X, y, group)
# Generate the training/testing visualizations for each CV split
for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
# Fill in indices with the training/test groups
indices = np.array([np.nan] * len(X))
indices[tt] = 1
indices[tr] = 0
# Visualize the results
ax.scatter([n_splits - ii - 1] * len(indices), range(len(indices)),
c=indices, marker='_', lw=lw, cmap=cmap_cv,
vmin=-.2, vmax=1.2, s=s)
# Plot the data classes and groups at the end
ax.scatter([-1] * len(X), range(len(X)),
c=y, marker='_', lw=lw, cmap=cmap_data, s=s)
yticklabels = ['class'] + list(range(1, n_splits + 1))
if show_groups:
ax.scatter([-2] * len(X), range(len(X)),
c=group, marker='_', lw=lw, cmap=cmap_group, s=s)
yticklabels.insert(0, 'group')
# Formatting
ax.set(xticks=np.arange(-1 - show_groups, n_splits), xticklabels=yticklabels,
ylabel='Sample index', xlabel="CV iteration",
xlim=[-1.5 - show_groups, n_splits+.2], ylim=[-6, 100])
ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
if legend:
ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.2))],
['Testing set', 'Training set'], loc=(1.02, .8))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.set_yticks(())
return ax
fig, ax = plt.subplots(figsize=(6, 3.5))
cv = KFold(5)
plot_cv_indices(cv, X, y, groups, ax, s=700);
fig, ax = plt.subplots(figsize=(6, 3.9))
cv = StratifiedKFold(5)
plot_cv_indices(cv, X, y, groups, ax, s=700)
ax.set_ylim((-6, 100));
Can you explain this result?
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
logreg = LogisticRegression()
kfold = KFold(n_splits=3)
print("Cross-validation scores KFold(n_splits=3):\n{}".format(
cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores KFold(n_splits=3):
[0. 0. 0.]
ShuffleSplit samples a fraction of the data (train_size) randomly as the training set and another fraction (test_size) as the test set, which is handy when using very large datasets.
fig, ax = plt.subplots(figsize=(6, 4))
cv = ShuffleSplit(8, test_size=.2)
plot_cv_indices(cv, X, y, groups, ax, s=700)
ax.set_ylim((-6, 100))
ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.2))],
['Testing set', 'Training set'], loc=(.95, .8));
Note: this is related to bootstrapping: the random splitting is repeated n_iter times, obtaining n_iter scores (for example with train_size=0.66, test_size=0.34).
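For comparison, a classic bootstrap draws n samples with replacement and evaluates on the left-out ("out-of-bag") points; the lines below are a minimal illustrative sketch with plain numpy, not one of scikit-learn's splitters.
# Sketch of a single bootstrap resample (illustrative only)
rs = np.random.RandomState(0)
boot_idx = rs.choice(len(X), size=len(X), replace=True)   # training indices, drawn with replacement
oob_idx = np.setdiff1d(np.arange(len(X)), boot_idx)       # out-of-bag indices serve as test set
print("unique training samples: {}, out-of-bag samples: {}".format(
    len(np.unique(boot_idx)), len(oob_idx)))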
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib.patches import Rectangle
fig, ax = plt.subplots(figsize=(10, 3.9))
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
plot_cv_indices(cv, X, y, groups, ax, lw=2, s=400, legend=False)
ax.set_ylim((-6, 102))
xticklabels = ["class"] + [f"{repeat}x{split}" for repeat in range(1, 4) for split in range(1, 6)]
ax.set_xticklabels(xticklabels)
for i in range(3):
rect = Rectangle((-.5 + i * 5, -2.), 5, 103, edgecolor='k', facecolor='none')
ax.add_artist(rect)
fig, ax = plt.subplots(figsize=(6, 3.9))
cv = GroupKFold(5)
plot_cv_indices(cv, X, y, groups, ax, s=700, show_groups=True)
ax.set_ylim((-6, 100));
When the data is ordered, random test sets are not a good idea
import pandas as pd
approval = pd.read_csv("https://projects.fivethirtyeight.com/trump-approval-data/approval_topline.csv")
adults = approval.groupby("subgroup").get_group('Adults')
adults = adults.set_index('modeldate')[::-1]
adults.approve_estimate.plot()
ax = plt.gca()
plt.rcParams["figure.figsize"] = [8,4]
ax.set_xlabel("")
xlim, ylim = ax.get_xlim(), ax.get_ylim()
for i in range(20):
rect = Rectangle((np.random.randint(0, xlim[1]), ylim[0]), 10, ylim[1]-ylim[0], facecolor='#FFAAAA')
ax.add_artist(rect)
plt.title("Presidential approval estimates by fivethirtyeight")
plt.legend([rect], ['Random Test Set'] );
TimeSeriesSplit
from sklearn.utils import shuffle
fig, ax = plt.subplots(figsize=(6, 3.9))
cv = TimeSeriesSplit(5, max_train_size=20)
plot_cv_indices(cv, X, shuffle(y), groups, ax, s=700, lw=2)
ax.set_ylim((-6, 100))
ax.set_title("TimeSeriesSplit(5, max_train_size=20)");
No strict rules, only guidelines:
Keep the end-goal in mind
mglearn.plots.plot_binary_confusion_matrix()
Both confusion_matrix and accuracy_score are available from sklearn.metrics.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, stratify=data.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("confusion_matrix(y_test, y_pred): \n", confusion_matrix(y_test, y_pred))
print("accuracy_score(y_test, y_pred): ", accuracy_score(y_test, y_pred))
print("model.score(X_test, y_test): ", lr.score(X_test, y_test))
confusion_matrix(y_test, y_pred):
 [[48  5]
 [ 5 85]]
accuracy_score(y_test, y_pred):  0.9300699300699301
model.score(X_test, y_test):  0.9300699300699301
plot_measure(accuracy_score)
Precision is used when the goal is to limit FPs
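For reference, precision is defined as:
\begin{equation} \text{precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} \end{equation}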
from sklearn.metrics import precision_score
plot_measure(precision_score)
Recall is used when the goal is to limit FNs
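For reference, recall is defined as:
\begin{equation} \text{recall} = \frac{\text{TP}}{\text{TP} + \text{FN}} \end{equation}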
from sklearn.metrics import recall_score
plot_measure(recall_score)
Comparison
F1-score or F1-measure trades off precision and recall:
\begin{equation} \text{F1} = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} \end{equation}
from sklearn.metrics import f1_score
plot_measure(f1_score)
Classification measure Zoo
https://en.wikipedia.org/wiki/Precision_and_recall
Averaging scores per class (classification_report)
from sklearn.metrics import classification_report
def report(y_pred):
print(classification_report(y_true, y_pred))
fig, ax = plt.subplots(figsize=(2, 2))
plt.rcParams['figure.dpi'] = 100 # Use 300 for PDF, 100 for slides
plot_confusion_matrix(confusion_matrix(y_true, y_pred), cmap='gray_r', ax=ax,
xticklabels=["N", "P"], yticklabels=["N", "P"], xtickrotation=0, vmin=0, vmax=100)
plt.tight_layout();
report(y_pred_1)
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        90
           1       0.00      0.00      0.00        10

    accuracy                           0.90       100
   macro avg       0.45      0.50      0.47       100
weighted avg       0.81      0.90      0.85       100
report(y_pred_2)
              precision    recall  f1-score   support

           0       1.00      0.89      0.94        90
           1       0.50      1.00      0.67        10

    accuracy                           0.90       100
   macro avg       0.75      0.94      0.80       100
weighted avg       0.95      0.90      0.91       100
report(y_pred_3)
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        90
           1       0.50      0.50      0.50        10

    accuracy                           0.90       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.90      0.90      0.90       100
Scikit-learn offers two functions for obtaining uncertainty estimates from classifiers: decision_function and predict_proba. Most classifiers provide at least one of them, and many provide both, but not all.
In the binary classification case, the return value of decision_function encodes how strongly the model believes a data point to belong to the “positive” class.
# create and split a synthetic dataset
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=2, cluster_std=2.5, random_state=8)
# we rename the classes "blue" and "red"
y_named = np.array(["blue", "red"])[y]
# we can call train test split with arbitrary many arrays
# all will be split in a consistent manner
X_train, X_test, y_train_named, y_test_named, y_train, y_test = \
train_test_split(X, y_named, y, random_state=0)
# build the logistic regression model
lr = LogisticRegression()
lr.fit(X_train, y_train_named)
# get the decision function
dec = lr.decision_function(X_test)
mglearn.plots.plot_2d_separator(lr, X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y);
plt.rcParams['figure.dpi'] = 90
for i, v in enumerate(dec):
plt.annotate("{:.2f}".format(v), (X_test[i,0],X_test[i,1]),
textcoords="offset points", xytext=(0,7))
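As a quick check (a sketch reusing the lr and dec defined above): thresholding the decision function at 0 reproduces the class predictions, where positive scores correspond to the second entry of classes_.
print("Classes:", lr.classes_)
pred_from_scores = np.where(dec > 0, lr.classes_[1], lr.classes_[0])   # threshold at 0
print("Matches predict:", np.all(pred_from_scores == lr.predict(X_test)))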
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
mglearn.tools.plot_2d_separator(lr, X, ax=axes[0], alpha=.4,
fill=True, cm=mglearn.cm2)
scores_image = mglearn.tools.plot_2d_scores(lr, X, ax=axes[1],
alpha=.4, cm=mglearn.ReBl)
for ax in axes:
# plot training and test points
mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], y_test,
markers='^', ax=ax)
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train,
markers='o', ax=ax)
ax.set_xlabel("Feature 0")
ax.set_ylabel("Feature 1")
cbar = plt.colorbar(scores_image, ax=axes.tolist())
cbar.set_alpha(1)
cbar.draw_all()
axes[0].legend(["Test class 0", "Test class 1", "Train class 0",
"Train class 1"], ncol=4, loc=(.1, 1.1));
The output of predict_proba is a probability for each class, with one column per class; the probabilities in each row sum to 1.
print("Shape of probabilities: {}".format(lr.predict_proba(X_test).shape))
# show the first few entries of predict_proba
print("Predicted probabilities:\n{}".format(
lr.predict_proba(X_test[:6])))
Shape of probabilities: (25, 2)
Predicted probabilities:
[[0.232 0.768]
 [0.002 0.998]
 [0.    1.   ]
 [0.003 0.997]
 [0.001 0.999]
 [1.    0.   ]]
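Another quick check (a sketch using the fitted lr from above): each row of predict_proba sums to 1, and the per-row argmax reproduces predict.
proba = lr.predict_proba(X_test)
print("row sums:", proba.sum(axis=1)[:6])                 # all (close to) 1
print("argmax matches predict:",
      np.all(lr.classes_[np.argmax(proba, axis=1)] == lr.predict(X_test)))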
We can visualize them again. Note that the gradient looks different now.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
mglearn.tools.plot_2d_separator(
lr, X, ax=axes[0], alpha=.4, fill=True, cm=mglearn.cm2)
scores_image = mglearn.tools.plot_2d_scores(
lr, X, ax=axes[1], alpha=.5, cm=mglearn.ReBl, function='predict_proba')
for ax in axes:
# plot training and test points
mglearn.discrete_scatter(X_test[:, 0], X_test[:, 1], y_test,
markers='^', ax=ax)
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train,
markers='o', ax=ax)
ax.set_xlabel("Feature 0")
ax.set_ylabel("Feature 1")
# don't want a transparent colorbar
cbar = plt.colorbar(scores_image, ax=axes.tolist())
cbar.set_alpha(1)
cbar.draw_all()
axes[0].legend(["Test class 0", "Test class 1", "Train class 0",
"Train class 1"], ncol=4, loc=(.1, 1.1));
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import calibration_curve
from sklearn.datasets import fetch_covtype
from sklearn.utils import check_array
def load_data(dtype=np.float32, order='C', random_state=13):
######################################################################
# Load covertype dataset (downloads it from the web, might take a bit)
# TODO: Use OpenML version
data = fetch_covtype(download_if_missing=True, shuffle=True,
random_state=random_state)
X = check_array(data['data'], dtype=dtype, order=order)
# make it a binary classification problem
y = (data['target'] != 1).astype(int)
# Create train-test split (as [Joachims, 2006])
n_train = 522911
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]
# Standardize first 10 features (the numerical ones)
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
mean[10:] = 0.0
std[10:] = 1.0
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std
return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = load_data()
# subsample training set by a factor of 10:
X_train = X_train[::10]
y_train = y_train[::10]
#probs = lr.predict_proba(X_test)[:, 1]
#prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=5)
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV().fit(X_train, y_train)
def plot_calibration_curve(y_true, y_prob, n_bins=5, ax=None, hist=True, normalize=False):
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, normalize=normalize)
if ax is None:
ax = plt.gca()
if hist:
ax.hist(y_prob, weights=np.ones_like(y_prob) / len(y_prob), alpha=.4,
bins=np.maximum(10, n_bins))
ax.plot([0, 1], [0, 1], ':', c='k')
curve = ax.plot(prob_pred, prob_true, marker="o")
ax.set_xlabel("predicted probability")
ax.set_ylabel("fraction of positive samples")
ax.set(aspect='equal')
return curve
fig, axes = plt.subplots(1, 3, figsize=(8, 8))
for ax, clf in zip(axes, [LogisticRegressionCV(), DecisionTreeClassifier(),
RandomForestClassifier()]):
# use predict_proba if the estimator has it
scores = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
plot_calibration_curve(y_test, scores, n_bins=20, ax=ax)
ax.set_title(clf.__class__.__name__)
plt.tight_layout();
Calibration fits a 1d model (or more for multi-class): $$f_{calib}(s(x)) \approx p(y=1|x)$$
$s(x)$ is the score given by the model, usually the output of decision_function or predict_proba.
Platt scaling: use a logistic sigmoid for $f_{calib}$: $$f_{platt}(s(x)) = \frac{1}{1+\exp(-w \cdot s(x) - b)}$$
Basically learning a 1d logistic regression on the scores (+ some tricks)
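In scikit-learn, Platt scaling is available as CalibratedClassifierCV with method='sigmoid'. The lines below are a minimal sketch on the covertype split from above; the shallow decision tree is just an illustrative choice.
from sklearn.calibration import CalibratedClassifierCV
tree = DecisionTreeClassifier(max_depth=3)
# method='sigmoid' fits the logistic f_platt on held-out scores via internal cross-validation
calibrated = CalibratedClassifierCV(tree, method='sigmoid', cv=5)
calibrated.fit(X_train, y_train)
plot_calibration_curve(y_test, calibrated.predict_proba(X_test)[:, 1], n_bins=20);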
Predictions are made by thresholding decision_function and predict_proba; the threshold is 0 for decision_function and 0.5 for predict_proba by default.
plt.rcParams['figure.dpi'] = 80
mglearn.plots.plot_decision_threshold()
plt.rcParams['figure.dpi'] = 100
print("Threshold 0")
print(classification_report(y_test, svc.predict(X_test)))
Threshold 0
              precision    recall  f1-score   support

           0       0.91      0.96      0.93        96
           1       0.67      0.47      0.55        17

    accuracy                           0.88       113
   macro avg       0.79      0.71      0.74       113
weighted avg       0.87      0.88      0.88       113
print("Threshold -0.8")
y_pred_lower_threshold = svc.decision_function(X_test) > -.8
print(classification_report(y_test, y_pred_lower_threshold))
Threshold -0.8
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        96
           1       0.65      0.88      0.75        17

    accuracy                           0.91       113
   macro avg       0.81      0.90      0.85       113
weighted avg       0.93      0.91      0.92       113
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(
y_test, svc.decision_function(X_test))
# create a similar dataset as before, but with more samples
# to get a smoother curve
X, y = make_blobs(n_samples=(4000, 500), centers=2, cluster_std=[7.0, 2],
random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
precision, recall, thresholds = precision_recall_curve(
y_test, svc.decision_function(X_test))
# find threshold closest to zero:
close_zero = np.argmin(np.abs(thresholds))
plt.plot(recall[close_zero], precision[close_zero], 'o', markersize=10,
label="threshold zero", fillstyle="none", c='k', mew=2)
plt.plot(recall, precision, label="precision recall curve")
plt.ylabel("Precision")
plt.xlabel("Recall")
plt.legend(loc="best");
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
rf.fit(X_train, y_train)
# RandomForestClassifier has predict_proba, but not decision_function
# Only pass probabilities for the positive class
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(recall, precision, label="svc")
plt.plot(recall[close_zero], precision[close_zero], 'o', markersize=10,
label="threshold zero svc", fillstyle="none", c='k', mew=2)
plt.plot(recall_rf, precision_rf, label="rf")
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot( recall_rf[close_default_rf], precision_rf[close_default_rf], '^', c='k',
markersize=10, label="threshold 0.5 rf", fillstyle="none", mew=2)
plt.ylabel("Precision")
plt.xlabel("Recall")
plt.legend(loc="best");
from sklearn.metrics import average_precision_score
ap_rf = average_precision_score(y_test, rf.predict_proba(X_test)[:, 1])
ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
print("Average precision of random forest: {:.3f}".format(ap_rf))
print("Average precision of svc: {:.3f}".format(ap_svc))
Average precision of random forest: 0.660
Average precision of svc: 0.666
plt.rcParams['savefig.dpi'] = 100 # Use 300 for PDF, 100 for slides
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
# find threshold closest to zero:
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="threshold zero", fillstyle="none", c='k', mew=2)
plt.legend(loc=4);
from sklearn.metrics import roc_curve
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label="ROC Curve SVC")
plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="threshold zero SVC", fillstyle="none", c='k', mew=2)
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^', markersize=10,
label="threshold 0.5 RF", fillstyle="none", c='k', mew=2)
plt.legend(loc=4);
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
# Cost function
def cost(fpr, tpr, cost_FN, cost_FP):
return fpr * cost_FP + (1 - tpr) * cost_FN;
@interact
def plot_isometrics(c_FN=(1,10.0,1.0), c_FP=(1,10.0,1.0)):
plt.rcParams['savefig.dpi'] = 100 # Use 300 for PDF, 100 for slides
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
# get minimum
costs = [cost(fpr[x],tpr[x],c_FN,c_FP) for x in range(len(thresholds))]
min_cost = np.min(costs)
min_thres = np.argmin(costs)
# plot contours
x = np.arange(0.0, 1.1, 0.1)
y = np.arange(0.0, 1.1, 0.1)
Xp, Yp = np.meshgrid(x, y)
costs = [cost(f, t, c_FN, c_FP) for f, t in zip(Xp,Yp)]
plt.plot(fpr, tpr, label="ROC Curve")
levels = np.linspace(np.array(costs).min(), np.array(costs).max(), 10)
levels = np.sort(np.append(levels, min_cost))
CS = plt.contour(Xp, Yp, costs, levels)
plt.clabel(CS, inline=1, fontsize=10)
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
# find threshold closest to zero:
plt.plot(fpr[min_thres], tpr[min_thres], 'o', markersize=4,
label="optimal", fillstyle="none", c='k', mew=2)
plt.legend(loc=4);
plt.title("Isometrics, cost_FN: {}, cost_FP: {}".format(c_FN, c_FP))
plt.show()
for cFP in [1, 10]:
plot_isometrics(1,cFP)
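For a fixed cost ratio the widget is not needed; the cost-minimizing point can be read off directly. This is a sketch reusing the cost function and the SVC ROC arrays (fpr, tpr, thresholds) computed above.
c_FN, c_FP = 1.0, 10.0
costs = cost(fpr, tpr, c_FN, c_FP)        # expected cost at every candidate threshold
best = np.argmin(costs)
print("best threshold: {:.3f} (FPR = {:.3f}, TPR = {:.3f})".format(
    thresholds[best], fpr[best], tpr[best]))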
from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
dummy = DummyClassifier().fit(X_train, y_train)
dummy_auc = roc_auc_score(y_test, dummy.predict_proba(X_test)[:, 1])
print("AUC for Random Forest: {:.3f}".format(rf_auc))
print("AUC for SVC: {:.3f}".format(svc_auc))
print("AUC for dummy classifier: {:.3f}".format(dummy_auc))
AUC for Random Forest: 0.937
AUC for SVC: 0.916
AUC for dummy classifier: 0.498
Example: unbalanced dataset (10% positive, 90% negative):
from sklearn.datasets import load_digits
digits = load_digits()
y = digits.target == 9
X_train, X_test, y_train, y_test = train_test_split(
digits.data, y, random_state=0)
plt.figure()
for gamma in [1, 0.01, 0.00001]:
svc = SVC(gamma=gamma).fit(X_train, y_train)
accuracy = svc.score(X_test, y_test)
auc = roc_auc_score(y_test, svc.decision_function(X_test))
fpr, tpr, _ = roc_curve(y_test , svc.decision_function(X_test))
print("gamma = {:.1e} ACC = {:.2f} AUC = {:.4f}".format(
gamma, accuracy, auc))
plt.plot(fpr, tpr, label="gamma={:.1e}".format(gamma), lw=2)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.xlim(-0.01, 1)
plt.ylim(0, 1.02)
plt.legend(loc="best");
gamma = 1.0e+00 ACC = 0.90 AUC = 0.5000
gamma = 1.0e-02 ACC = 0.90 AUC = 0.9995
gamma = 1.0e-05 ACC = 0.90 AUC = 0.9882
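All three models reach 90% accuracy at the default threshold, while AUC shows that gamma=0.01 ranks the two classes almost perfectly; a better threshold should therefore recover much higher accuracy. A sketch (not part of the original slides) that picks a threshold from that model's ROC curve, using Youden's J as one simple heuristic:
svc_best = SVC(gamma=0.01).fit(X_train, y_train)
fpr_b, tpr_b, thr_b = roc_curve(y_test, svc_best.decision_function(X_test))
best = np.argmax(tpr_b - fpr_b)                     # Youden's J statistic
y_pred_tuned = svc_best.decision_function(X_test) > thr_b[best]
print("accuracy with tuned threshold: {:.2f}".format(accuracy_score(y_test, y_pred_tuned)))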
Common technique: the one-vs-rest approach: learn one binary classifier per class, separating that class from all other classes:
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC
X, y = make_blobs(random_state=42)
linear_svm = LinearSVC().fit(X, y)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
mglearn.cm3.colors):
plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
'Line class 2'], loc=(1.01, 0.3));
Every binary classifier makes a prediction; the class whose classifier produces the highest score wins.
mglearn.plots.plot_2d_classification(linear_svm, X, fill=True, alpha=.7)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
mglearn.cm3.colors):
plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
'Line class 2'], loc=(1.01, 0.3))
plt.xlabel("Feature 0")
plt.ylabel("Feature 1");
decision_function and predict_proba also work in the multiclass setting.
from sklearn.datasets import load_iris
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=42)
lr2 = LogisticRegression()
lr2 = lr2.fit(X_train, y_train)
print("Decision function:\n{}".format(lr2.decision_function(X_test)[:6, :]))
# show the first few entries of predict_proba
print("Predicted probabilities:\n{}".format(lr2.predict_proba(X_test)[:6]))
Decision function:
[[ -3.035   2.294   0.741]
 [  5.919   3.091  -9.01 ]
 [-10.052   1.875   8.177]
 [ -2.733   2.036   0.697]
 [ -3.737   2.476   1.262]
 [  6.036   3.035  -9.07 ]]
Predicted probabilities:
[[0.004 0.822 0.174]
 [0.944 0.056 0.   ]
 [0.    0.002 0.998]
 [0.007 0.787 0.206]
 [0.002 0.77  0.229]
 [0.953 0.047 0.   ]]
from sklearn.metrics import accuracy_score
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
digits.data, digits.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
scores_image = mglearn.tools.heatmap(
confusion_matrix(y_test, pred), xlabel='Predicted label',
ylabel='True label', xticklabels=digits.target_names,
yticklabels=digits.target_names, cmap=plt.cm.gray_r, fmt="%d")
plt.title("Confusion matrix")
plt.gca().invert_yaxis()
Precision, recall, F1-score now yield 10 per-class scores
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.89      0.93      0.91        43
           2       0.98      0.91      0.94        44
           3       0.91      0.96      0.93        45
           4       0.97      0.97      0.97        38
           5       0.98      0.96      0.97        48
           6       1.00      0.98      0.99        52
           7       0.98      0.96      0.97        48
           8       0.91      0.90      0.91        48
           9       0.90      0.96      0.93        47

    accuracy                           0.95       450
   macro avg       0.95      0.95      0.95       450
weighted avg       0.95      0.95      0.95       450
macro-averaging: computes the unweighted mean of the per-class scores: $\frac{1}{n}\sum_{i=1}^{n}score_i$
weighted averaging: per-class scores are weighted by the relative size of the classes (support): $\sum_{i=1}^{n}score_i \cdot weight_i$ with $weight_i = \frac{support_i}{\sum_{j=1}^{n}support_j}$
micro-averaging: computes the total number of FP, FN, TP over all classes, then computes scores using these counts, e.g. $recall = \frac{\sum_{i=1}^{n}TP_i}{\sum_{i=1}^{n}TP_i + \sum_{i=1}^{n}FN_i}$
print("Micro average f1 score: {:.3f}".format(f1_score(y_test, pred, average="micro")))
print("Weighted average f1 score: {:.3f}".format(f1_score(y_test, pred, average="weighted")))
print("Macro average f1 score: {:.3f}".format(f1_score(y_test, pred, average="macro")))
Micro average f1 score: 0.951
Weighted average f1 score: 0.951
Macro average f1 score: 0.952
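As a sanity check (a sketch using the digits predictions from above): for single-label multiclass problems, micro-averaged recall computed from the summed counts equals plain accuracy.
cm = confusion_matrix(y_test, pred)
tp = np.diag(cm)                                   # true positives per class
micro_recall = tp.sum() / cm.sum()                 # sum(TP) / (sum(TP) + sum(FN))
print("micro recall: {:.3f}, accuracy: {:.3f}".format(
    micro_recall, accuracy_score(y_test, pred)))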
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from numpy import interp
from sklearn.metrics import roc_auc_score
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=2,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()
Most commonly used are the mean squared error and $R^2$ (the default score of scikit-learn regressors).
from sklearn.linear_model import Ridge
from sklearn.datasets import load_boston
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)
pred = Ridge(normalize=True).fit(X_train, y_train).predict(X_test)
plt.subplot(1, 2, 1)
plt.gca().set_aspect("equal")
plt.plot([10, 50], [10, 50], '--', c='k')
plt.plot(y_test, pred, 'o', alpha=.5)
plt.ylabel("predicted")
plt.xlabel("true");
plt.subplot(1, 2, 2)
plt.gca().set_aspect("equal")
plt.plot([10, 50], [0,0], '--', c='k')
plt.plot(y_test, y_test - pred, 'o', alpha=.5)
plt.xlabel("true")
plt.ylabel("true - predicted")
plt.tight_layout();
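To complement the plots with numbers, a minimal sketch computing mean squared error and $R^2$ for the same predictions (both metrics are in sklearn.metrics):
from sklearn.metrics import mean_squared_error, r2_score
print("MSE: {:.2f}".format(mean_squared_error(y_test, pred)))
print("R^2: {:.2f}".format(r2_score(y_test, pred)))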
from sklearn.model_selection import ShuffleSplit, train_test_split
# Bias-Variance Computation
def compute_bias_variance(clf, X, y):
# Bootstraps
n_repeat = 40 # 40 is on the low side to get a good estimate. 100 is better.
shuffle_split = ShuffleSplit(test_size=0.33, n_splits=n_repeat, random_state=0)
# Store sample predictions
y_all_pred = [[] for _ in range(len(y))]
# Train classifier on each bootstrap and score predictions
for i, (train_index, test_index) in enumerate(shuffle_split.split(X)):
# Train and predict
clf.fit(X[train_index], y[train_index])
y_pred = clf.predict(X[test_index])
# Store predictions
for j,index in enumerate(test_index):
y_all_pred[index].append(y_pred[j])
# Compute bias, variance, error
bias_sq = sum([ (1 - x.count(y[i])/len(x))**2 * len(x)/n_repeat
for i,x in enumerate(y_all_pred)])
var = sum([((1 - ((x.count(0)/len(x))**2 + (x.count(1)/len(x))**2))/2) * len(x)/n_repeat
for i,x in enumerate(y_all_pred)])
error = sum([ (1 - x.count(y[i])/len(x)) * len(x)/n_repeat
for i,x in enumerate(y_all_pred)])
return np.sqrt(bias_sq), var, error
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
cancer = load_breast_cancer()
def plot_bias_variance_rf(clf, X, y):
bias_scores = []
var_scores = []
err_scores = []
n_estimators= [1, 2, 4, 8, 16, 32, 64, 128, 256]
for i in n_estimators:
b,v,e = compute_bias_variance(clf.set_params(random_state=0,n_estimators=i),X,y)
bias_scores.append(b)
var_scores.append(v)
err_scores.append(e)
plt.figure(figsize=(5,2))
plt.rcParams.update({'font.size': 12})
plt.suptitle(clf.__class__.__name__)
plt.plot(n_estimators, var_scores,label ="variance", lw=2 )
plt.plot(n_estimators, np.square(bias_scores),label ="bias^2", lw=2 )
plt.plot(n_estimators, err_scores,label ="error", lw=2 )
plt.xscale('log',basex=2)
plt.xlabel("n_estimators")
plt.legend(loc="best")
plt.show()
X, y = cancer.data, cancer.target
rf = RandomForestClassifier(random_state=0, n_estimators=512, n_jobs=-1)
plot_bias_variance_rf(rf, X, y)
X, y = cancer.data, cancer.target
ab = AdaBoostClassifier(random_state=0, n_estimators=512)
plot_bias_variance_rf(ab, X, y)
Bias-Variance Flowchart (Andrew Ng, Coursera)
We can basically use any optimization technique to optimize hyperparameters:
More advanced techniques (see lecture 7):
mglearn.plots.plot_threefold_split()
from sklearn.svm import SVC
# split data into train+validation set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
iris.data, iris.target, random_state=0)
# split train+validation set into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(
X_trainval, y_trainval, random_state=1)
print("Size of training set: {} size of validation set: {} size of test set:"
" {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
# for each combination of parameters
# train an SVC
svm = SVC(gamma=gamma, C=C)
svm.fit(X_train, y_train)
# evaluate the SVC on the validation set
score = svm.score(X_valid, y_valid)
# if we got a better score, store the score and parameters
if score > best_score:
best_score = score
best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set,
# and evaluate it on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))
Size of training set: 84 size of validation set: 28 size of test set: 38

Best score on validation set: 0.96
Best parameters:  {'C': 10, 'gamma': 0.001}
Test set score with best parameters: 0.92
plt.rcParams['figure.dpi'] = 70 # Avoid overlapping boxes
mglearn.plots.plot_grid_search_overview()
Nested cross-validation: pass GridSearchCV as the estimator to cross_val_score, so each outer training portion gets its own inner grid search. This estimates how well the tuning procedure generalizes, but it returns no single model or parameter setting; to obtain those, fit GridSearchCV on all data again.
from sklearn.model_selection import GridSearchCV
# same parameter grid as the manual search above
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5),
                         iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
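A minimal sketch of the final step: refit the grid search on all the data to obtain a model and its parameters (reusing param_grid from the cell above).
grid = GridSearchCV(SVC(), param_grid, cv=5).fit(iris.data, iris.target)
print("Best parameters: ", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))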