# Experiment tracking with scikit-learn

In this tutorial you will train a model using the scikit-learn framework and use the experiment tracking capabilities of Dataiku to log training runs (parameters and performance metrics).

**Solution**

## Prerequisites

* Dataiku DSS >= 11.0.0.

* Access to a Project with a Dataset that contains the UCI Bank Marketing data.

* A code environment containing the `mlflow` and `scikit-learn` packages.

The following code snippet provides a reusable example to train a simple random forest classifier with these main steps:

**(1)** Select the feature and target variables.

**(2)** Build the preprocessing pipeline for categorical and numerical features.

**(3)** Define the hyperparameters to run the training on, namely the number of decision trees in the random forest, the maximum depth of each tree, and the minimum number of samples required at a leaf node.

**(4)** Perform the experiment runs: for each hyperparameter combination, log the hyperparameters, the performance metrics (here we use the macro-averaged F1 score and the ROC AUC), and the trained model.

§ import dataiku

§ from datetime import datetime

§ from sklearn.impute import SimpleImputer

§ from sklearn.preprocessing import OneHotEncoder, StandardScaler

§ from sklearn.compose import ColumnTransformer

§ from sklearn.model\_selection import ParameterGrid

§ from sklearn.model\_selection import StratifiedKFold

§ from sklearn.ensemble import RandomForestClassifier

§ from sklearn.pipeline import Pipeline, make\_pipeline

§ from sklearn.model\_selection import cross\_validate

def now_str() -> str:
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    The value is used as a suffix to make experiment names unique per launch.
    """
    return datetime.now().strftime("%Y%m%d%H%M%S")

# !! - Replace these values with your own - !!
USER_PROJECT_KEY = ""            # passed to client.get_project()
USER_XPTRACKING_FOLDER_ID = ""   # passed to project.get_managed_folder()
USER_EXPERIMENT_NAME = ""        # prefix of the created experiment's name
USER_TRAINING_DATASET = ""       # name of the dataset loaded for training
USER_MLFLOW_CODE_ENV_NAME = ""   # reported as code_env_name for each run

# Handle on the project that will host the experiment.
client = dataiku.api_client()
project = client.get_project(USER_PROJECT_KEY)

# Load the whole training dataset as a dataframe.
ds = dataiku.Dataset(USER_TRAINING_DATASET)
df = ds.get_dataframe()

# (1) Select the feature and target variables.
num_features = ['age', 'balance', 'duration', 'previous', 'campaign']
cat_features = ['job', 'marital', 'education', 'default',
                'housing', 'loan', 'contact', 'poutcome']
target = "y"

# X_train keeps every non-target column of the dataframe; which columns are
# actually used is decided by the lists above when the preprocessor is built.
X_train = df.drop(columns=[target])
y_train = df[target]

# (2) Build the preprocessing pipeline for categorical and numerical features.
# Numerical features: fill missing values with the median, then standardize.
num_pipeline = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('sts', StandardScaler()),
])

# Categorical features: one-hot encode; categories unseen at fit time are
# ignored at transform time instead of raising an error.
transformers = [
    ('num', num_pipeline, num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
]

# Columns that are in neither feature list are dropped.
preprocessor = ColumnTransformer(transformers, remainder='drop')

# (3) Define the hyperparameter space to explore.
# Each key is a RandomForestClassifier keyword argument; every combination of
# the listed values becomes one experiment run.
param_space_rf = {
    "n_estimators": [40, 80],
    "n_jobs": [-1],  # single value: not tuned, only forwarded to the model
    "max_depth": [6, 14],
    "min_samples_leaf": [10, 20, 40, 100],
}

n_cv_folds = 5

grid = ParameterGrid(param_space_rf)
cv = StratifiedKFold(n_splits=n_cv_folds)

# (4) Run the experiment: one MLflow run per hyperparameter combination.
mf = project.get_managed_folder(USER_XPTRACKING_FOLDER_ID)
metrics = ["f1_macro", "roc_auc"]
mlflow_extension = project.get_mlflow_extension()

with project.setup_mlflow(mf) as mlflow:
    # The timestamp suffix gives every launch of this script its own experiment.
    experiment_id = mlflow.create_experiment(
        f'{USER_EXPERIMENT_NAME}_{now_str()}')

    # Tag the experiment once, reusing a single tracking client.
    mlflow_client = mlflow.tracking.MlflowClient()
    mlflow_client.set_experiment_tag(
        experiment_id, "library", "Scikit-learn")
    mlflow_client.set_experiment_tag(
        experiment_id, "predictionType", "BINARY_CLASSIFICATION")

    for hparams in grid:
        with mlflow.start_run(experiment_id=experiment_id) as run:
            print(f'Starting run {run.info.run_id} ...\n{hparams}')

            run_metrics = {}
            clf = RandomForestClassifier(**hparams)
            pipeline = make_pipeline(preprocessor, clf)
            scores = cross_validate(
                pipeline, X_train, y_train, cv=cv, scoring=metrics)

            # --Compute the mean and standard dev of the metrics across held-out folds
            for m in [f"test_{mname}" for mname in metrics]:
                run_metrics[f"mean_{m}"] = scores[m].mean()
                run_metrics[f"std_{m}"] = scores[m].std()
            mlflow.log_metrics(metrics=run_metrics)

            for k, v in hparams.items():
                mlflow.log_param(k, v)

            # --Fit the preprocessing steps and the model on the whole train dataset
            # (cross_validate only fits clones, so `pipeline` is still unfitted here)
            pipeline.fit(X_train, y_train)

            # --Log the pipeline object
            artifact_path = f"{type(clf).__name__}-{run.info.run_id}"
            mlflow.sklearn.log_model(sk_model=pipeline, artifact_path=artifact_path)

            # --Log useful information for the Dataiku Experiment tracking interface
            # Use the public `run.info` accessor, as everywhere else in this block,
            # rather than the private `run._info` attribute.
            mlflow_extension.set_run_inference_info(
                run_id=run.info.run_id,
                prediction_type="BINARY_CLASSIFICATION",
                classes=pipeline.classes_.tolist(),
                code_env_name=USER_MLFLOW_CODE_ENV_NAME,
                target=target)

            print(f'Run {run.info.run_id} done\n{"-"*40}')

After these steps you should have your experiment run’s data available both in the Dataiku UI and programmatically via the `dataikuapi.dss.mlflow.DSSMLflowExtension` class of the Python API client.
