[Interactive widget: effect of different scalers (StandardScaler, RobustScaler, MinMaxScaler, ...)]
[Interactive widget: effect of scaling for different classifiers (KNeighborsClassifier, SVC, LinearSVC, ...)]
| | boro | salary | vegan |
|---|---|---|---|
| 0 | Manhattan | 103 | 0 |
| 1 | Queens | 89 | 0 |
| 2 | Manhattan | 142 | 0 |
| 3 | Brooklyn | 54 | 1 |
| 4 | Brooklyn | 63 | 1 |
| 5 | Bronx | 219 | 0 |
| | boro | boro_ordinal | salary |
|---|---|---|---|
| 0 | Manhattan | 2 | 103 |
| 1 | Queens | 3 | 89 |
| 2 | Manhattan | 2 | 142 |
| 3 | Brooklyn | 1 | 54 |
| 4 | Brooklyn | 1 | 63 |
| 5 | Bronx | 0 | 219 |
| | boro | boro_Bronx | boro_Brooklyn | boro_Manhattan | boro_Queens | salary |
|---|---|---|---|---|---|---|
| 0 | Manhattan | 0 | 0 | 1 | 0 | 103 |
| 1 | Queens | 0 | 0 | 0 | 1 | 89 |
| 2 | Manhattan | 0 | 0 | 1 | 0 | 142 |
| 3 | Brooklyn | 0 | 1 | 0 | 0 | 54 |
| 4 | Brooklyn | 0 | 1 | 0 | 0 | 63 |
| 5 | Bronx | 1 | 0 | 0 | 0 | 219 |
| | boro | boro_encoded | salary | vegan |
|---|---|---|---|---|
| 0 | Manhattan | 0.089647 | 103 | 0 |
| 1 | Queens | 0.333333 | 89 | 0 |
| 2 | Manhattan | 0.089647 | 142 | 0 |
| 3 | Brooklyn | 0.820706 | 54 | 1 |
| 4 | Brooklyn | 0.820706 | 63 | 1 |
| 5 | Bronx | 0.333333 | 219 | 0 |
ordinal_encoder = OrdinalEncoder(dtype=int)
one_hot_encoder = OneHotEncoder(dtype=int)
target_encoder = TargetEncoder(return_df=True)  # TargetEncoder comes from the category_encoders package

All encoders follow the fit-transform paradigm: `fit` prepares the encoder, `transform` actually encodes the features:

encoder.fit(X, y)
X_encoded = encoder.transform(X, y)  # only category_encoders accepts y in transform; scikit-learn encoders take X only
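As a minimal sketch (assuming the toy dataframe from the tables above), the ordinal and one-hot columns can be reproduced as follows; `pd.get_dummies` is used here for the one-hot part:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({'boro': ['Manhattan', 'Queens', 'Manhattan',
                            'Brooklyn', 'Brooklyn', 'Bronx'],
                   'salary': [103, 89, 142, 54, 63, 219],
                   'vegan': [0, 0, 0, 1, 1, 0]})

# Ordinal: categories map to integers, alphabetically by default
# (Bronx=0, Brooklyn=1, Manhattan=2, Queens=3)
df['boro_ordinal'] = OrdinalEncoder(dtype=int).fit_transform(df[['boro']]).ravel()

# One-hot: one binary column per category (boro_Bronx, ..., boro_Queens)
df_onehot = pd.get_dummies(df, columns=['boro'], prefix='boro', dtype=int)
```

The target-encoded values additionally depend on the encoder's smoothing settings, so they are not reproduced here.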
# choose scaling method and fit on training data
scaler = StandardScaler()
scaler.fit(X_train)
# transform training and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# calling fit and transform in sequence
X_train_scaled = scaler.fit(X_train).transform(X_train)
# same result, but more efficient computation
X_train_scaled = scaler.fit_transform(X_train)
- Correct: `fit` the preprocessor on the training set, then `transform` the training and test set
- Incorrect: `fit` and `transform` on the training and test data separately (the two sets would be scaled inconsistently)
- A pipeline has a `fit`, `predict`, and `score` method, just like any other learning algorithm
- A pipeline combines multiple processing steps in a single estimator (every step except the last must have a `transform` method)

# Make pipeline, step names will be 'minmaxscaler' and 'linearsvc'
pipe = make_pipeline(MinMaxScaler(), LinearSVC())
# Build pipeline with named steps
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", LinearSVC())])
# Correct fit and score
score = pipe.fit(X_train, y_train).score(X_test, y_test)
# Retrieve trained model by name
svm = pipe.named_steps["svm"]
# Correct cross-validation
scores = cross_val_score(pipe, X, y)
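For contrast, a sketch of the leaky variant that the "Correct" comments above warn against (assuming a feature matrix `X` and labels `y`): fitting the scaler on all the data before cross-validation lets information from the test folds leak into preprocessing.

```python
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Incorrect: the scaler has already seen the test folds
X_scaled = MinMaxScaler().fit_transform(X)
scores_leaky = cross_val_score(LinearSVC(), X_scaled, y)

# Correct: inside the pipeline, the scaler is refitted on the training folds only
scores = cross_val_score(make_pipeline(MinMaxScaler(), LinearSVC()), X, y)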
- `ColumnTransformer`: applies different preprocessing steps to different columns
- `FeatureUnion`: to concatenate columns

# 2 sub-pipelines, one for numeric features, other for categorical ones
numeric_pipe = make_pipeline(SimpleImputer(), StandardScaler())
categorical_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
# Using categorical pipe for features A,B,C, numeric pipe otherwise
preprocessor = make_column_transformer((categorical_pipe, ["A", "B", "C"]),
                                       remainder=numeric_pipe)
# Combine with learning algorithm in another pipeline
pipe = make_pipeline(preprocessor, LinearSVC())
# Feature union of PCA features and selected features
union = FeatureUnion([("pca", PCA()), ("selected", SelectKBest())])
pipe = make_pipeline(union, LinearSVC())
`ColumnTransformer` concatenates the transformed features in order:

pipe = make_column_transformer((StandardScaler(), numeric_features),
                               (PCA(), numeric_features),
                               (OneHotEncoder(), categorical_features))
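To check the resulting column order, one can fit the transformer and list the generated feature names. A sketch, assuming a dataframe `df` and the `numeric_features`/`categorical_features` lists from above; `get_feature_names_out` requires a recent scikit-learn:

```python
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

ct = make_column_transformer((StandardScaler(), numeric_features),
                             (PCA(), numeric_features),
                             (OneHotEncoder(), categorical_features))
ct.fit(df)
# Output columns follow the order of the transformers above:
# scaled numeric features, then PCA components, then one-hot columns
print(ct.get_feature_names_out())
```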
Use `'__'` to refer to the hyperparameters of a pipeline step, e.g. `svm__C`:
# Correct grid search (can have hyperparameters of any step)
param_grid = {'svm__C': [0.001, 0.01],
'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid).fit(X, y)
# Best estimator is now the best pipeline
best_pipe = grid.best_estimator_
# Tune pipeline and evaluate on held-out test set
grid = GridSearchCV(pipe, param_grid=param_grid).fit(X_train, y_train)
grid.score(X_test, y_test)
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid).fit(X_train, y_train)
It can be a good idea to reduce the number of features to only the most useful ones. For example, should we use `feel_temp` or `temp`? Maybe one correlates more with the target?

[Interactive widget: compare feature selection methods (FTest, MutualInformation, RandomForest, ...)]
`VarianceThreshold`: removes all low-variance features:
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
variances = selector.variances_
Univariate feature selection (each feature is scored against the target):

- regression: `f_regression`, `mutual_info_regression`
- classification: `f_classif`, `chi2`, `mutual_info_classif`
- selection strategies: `SelectKBest`, `SelectPercentile`, `SelectFpr`, ...

selector = SelectPercentile(score_func=f_regression, percentile=50)
X_selected = selector.fit_transform(X, y)
selected_features = selector.get_support()
f_values, p_values = f_regression(X, y)
mi_values = mutual_info_regression(X, y, discrete_features=[])
- `SelectFromModel`: requires a model and a selection threshold
- `RFE`, `RFECV` (recursive feature elimination): require a model and the final number of features

selector = SelectFromModel(RandomForestRegressor(), threshold='mean')
rfe_selector = RFE(RidgeCV(), n_features_to_select=20)
X_selected = selector.fit_transform(X, y)
rf_importances = RandomForestRegressor().fit(X, y).feature_importances_
Sequential (forward/backward/floating) feature selection (in `mlxtend`, sklearn-compatible):

selector = SequentialFeatureSelector(RidgeCV(), k_features=20, forward=True,
                                     floating=True)
X_selected = selector.fit_transform(X, y)
Permutation importance (in `sklearn.inspection`), no fit-transform interface:

importances = permutation_importance(RandomForestRegressor().fit(X, y),
                                     X, y, n_repeats=10).importances_mean
feature_ids = (-importances).argsort()[:n]  # indices of the n most important features
| | orig | [-3.0,-1.5] | [-1.5,0.0] | [0.0,1.5] | [1.5,3.0] |
|---|---|---|---|---|---|
| 0 | -0.752759 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 1 | 2.704286 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 2 | 1.391964 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| | orig | b0 | b1 | b2 | b3 | X*b0 | X*b1 | X*b2 | X*b3 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.752759 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -0.000000 | -0.752759 | -0.000000 | -0.000000 |
| 1 | 2.704286 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 2.704286 |
| 2 | 1.391964 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.391964 | 0.000000 |
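The two tables above can be generated along these lines. A sketch, assuming a single standard-normal feature; `KBinsDiscretizer` handles the binning, and the interaction features are a plain product of the original value with each bin indicator:

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 1))  # one continuous feature ('orig')

# 4 equal-width bins, one-hot encoded (columns b0..b3)
binner = KBinsDiscretizer(n_bins=4, strategy='uniform', encode='onehot-dense')
X_binned = binner.fit_transform(X)

# Interaction features: the original value times each bin indicator (X*b0..X*b3)
X_interact = np.hstack([X_binned, X * X_binned])
```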
| | gender | age | pageviews | time |
|---|---|---|---|---|
| 0 | M | 14 | 70 | 269 |
| 1 | F | 16 | 12 | 1522 |
| 2 | M | 12 | 42 | 235 |
| 3 | F | 25 | 64 | 63 |
| 4 | F | 22 | 93 | 21 |
| | age_M | pageviews_M | time_M | gender_M_M | age_F | pageviews_F | time_F | gender_F_F |
|---|---|---|---|---|---|---|---|---|
| 0 | 14 | 70 | 269 | 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 16 | 12 | 1522 | 1 |
| 2 | 12 | 42 | 235 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 25 | 64 | 63 | 1 |
| 4 | 0 | 0 | 0 | 0 | 22 | 93 | 21 | 1 |
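A sketch of how such group-wise interaction features can be built with pandas (this is an illustrative construction, not a library API): every numeric column is multiplied with a per-gender indicator, so each row only "activates" the columns of its own group:

```python
import pandas as pd

df = pd.DataFrame({'gender': ['M', 'F', 'M', 'F', 'F'],
                   'age': [14, 16, 12, 25, 22],
                   'pageviews': [70, 12, 42, 64, 93],
                   'time': [269, 1522, 235, 63, 21]})

indicators = pd.get_dummies(df['gender'], dtype=int)  # columns 'F' and 'M'
blocks = []
for g in ['M', 'F']:
    # numeric features, zeroed out for rows of the other gender
    block = df[['age', 'pageviews', 'time']].mul(indicators[g], axis=0)
    block.columns = [f'{c}_{g}' for c in block.columns]
    block[f'gender_{g}'] = indicators[g]
    blocks.append(block)
X_interact = pd.concat(blocks, axis=1)
```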
[Interactive widget: compare imputation methods (Mean Imputation, kNN Imputation, Iterative Imputation, ...)]
`SimpleImputer`: replaces missing values with the `mean` (numeric), `median`, or `most_frequent` value (categorical):

imp = SimpleImputer(strategy='mean', missing_values=np.nan, add_indicator=False)
X_complete = imp.fit_transform(X_train)
`KNNImputer`: imputes each missing value from the values of the nearest neighbors:
imp = KNNImputer(n_neighbors=5)
X_complete = imp.fit_transform(X_train)
`IterativeImputer`: iteratively predicts missing values from the other features; you can choose the estimator (default `BayesianRidge`) and the number of iterations (default 10):

# requires: from sklearn.experimental import enable_iterative_imputer
imp = IterativeImputer(estimator=RandomForestClassifier(), max_iter=10)
X_complete = imp.fit_transform(X_train)
Imputers from `fancyimpute` (these offer `fit_transform` but no separate `transform`):

`SoftImpute`: matrix completion via iterative soft-thresholded SVD:
imp = SoftImpute(max_iter=10, shrinkage_value=None)
X_complete = imp.fit_transform(X)
`MatrixFactorization`: matrix completion via low-rank matrix factorization:
imp = MatrixFactorization(rank=10, learning_rate=0.001, epochs=10000)
X_complete = imp.fit_transform(X)
Most scikit-learn classifiers accept `class_weight='balanced'`, which weights samples inversely proportional to the class frequencies.
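A minimal sketch, assuming an imbalanced binary classification task with training data `X_train`, `y_train`:

```python
from sklearn.linear_model import LogisticRegression

# Class weights are set inversely proportional to class frequencies,
# so mistakes on the minority class are penalized more heavily
clf = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
```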
[Interactive widget: compare samplers (RandomUnderSampler, EditedNearestNeighbours, ...)]
X_resampled, y_resampled = SMOTE(k_neighbors=5).fit_resample(X, y)
In an imbalanced-learn pipeline, resampling is applied only in `fit` (not in `predict`):

smote_pipe = make_pipeline(SMOTE(), LogisticRegression())  # make_pipeline from imblearn.pipeline
scores = cross_validate(smote_pipe, X_train, y_train)
param_grid = {"smote__k_neighbors": [3, 5, 7]}
grid = GridSearchCV(smote_pipe, param_grid=param_grid).fit(X, y)
clf = EasyEnsembleClassifier(base_estimator=SVC()).fit(X_train, y_train)
[Interactive widget: evaluation on imbalanced datasets (Speech, mc1, mammography)]