{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Gradient Boosting Out-of-Bag estimates\nOut-of-bag (OOB) estimates can be a useful heuristic to estimate\nthe \"optimal\" number of boosting iterations.\nOOB estimates are almost identical to cross-validation estimates but\nthey can be computed on-the-fly without the need for repeated model\nfitting.\nOOB estimates are only available for Stochastic Gradient Boosting\n(i.e. ``subsample < 1.0``), the estimates are derived from the improvement\nin loss based on the examples not included in the bootstrap sample\n(the so-called out-of-bag examples).\nThe OOB estimator is a pessimistic estimator of the true\ntest loss, but remains a fairly good approximation for a small number of trees.\nThe figure shows the cumulative sum of the negative OOB improvements\nas a function of the boosting iteration. As you can see, it tracks the test\nloss for the first hundred iterations but then diverges in a\npessimistic way.\nThe figure also shows the performance of 3-fold cross validation which\nusually gives a better estimate of the test loss\nbut is computationally more demanding.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom scipy.special import expit\n\nfrom sklearn import ensemble\nfrom sklearn.metrics import log_loss\nfrom sklearn.model_selection import KFold, train_test_split\n\n# Generate data (adapted from G. Ridgeway's gbm example)\nn_samples = 1000\nrandom_state = np.random.RandomState(13)\nx1 = random_state.uniform(size=n_samples)\nx2 = random_state.uniform(size=n_samples)\nx3 = random_state.randint(0, 4, size=n_samples)\n\np = expit(np.sin(3 * x1) - 4 * x2 + x3)\ny = random_state.binomial(1, p, size=n_samples)\n\nX = np.c_[x1, x2, x3]\n\nX = X.astype(np.float32)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=9)\n\n# Fit classifier with out-of-bag estimates\nparams = {\n    \"n_estimators\": 1200,\n    \"max_depth\": 3,\n    \"subsample\": 0.5,\n    \"learning_rate\": 0.01,\n    \"min_samples_leaf\": 1,\n    \"random_state\": 3,\n}\nclf = ensemble.GradientBoostingClassifier(**params)\n\nclf.fit(X_train, y_train)\nacc = clf.score(X_test, y_test)\nprint(\"Accuracy: {:.4f}\".format(acc))\n\nn_estimators = params[\"n_estimators\"]\nx = np.arange(n_estimators) + 1\n\n\ndef heldout_score(clf, X_test, y_test):\n    \"\"\"compute deviance scores on ``X_test`` and ``y_test``.\"\"\"\n    score = np.zeros((n_estimators,), dtype=np.float64)\n    for i, y_proba in enumerate(clf.staged_predict_proba(X_test)):\n        score[i] = 2 * log_loss(y_test, y_proba[:, 1])\n    return score\n\n\ndef cv_estimate(n_splits=None):\n    cv = KFold(n_splits=n_splits)\n    cv_clf = ensemble.GradientBoostingClassifier(**params)\n    val_scores = np.zeros((n_estimators,), dtype=np.float64)\n    for train, test in cv.split(X_train, y_train):\n        cv_clf.fit(X_train[train], y_train[train])\n        val_scores += heldout_score(cv_clf, X_train[test], y_train[test])\n    val_scores /= n_splits\n    return val_scores\n\n\n# Estimate best n_estimator using cross-validation\ncv_score = cv_estimate(3)\n\n# Compute best n_estimator for test data\ntest_score = heldout_score(clf, X_test, y_test)\n\n# negative cumulative sum of oob improvements\ncumsum = -np.cumsum(clf.oob_improvement_)\n\n# min loss according to OOB\noob_best_iter = x[np.argmin(cumsum)]\n\n# min loss according to test (normalize such that first loss is 0)\ntest_score -= test_score[0]\ntest_best_iter = x[np.argmin(test_score)]\n\n# min loss according to cv (normalize such that first loss is 0)\ncv_score -= cv_score[0]\ncv_best_iter = x[np.argmin(cv_score)]\n\n# color brew for the three curves\noob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))\ntest_color = list(map(lambda x: x / 256.0, (127, 201, 127)))\ncv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))\n\n# line type for the three curves\noob_line = \"dashed\"\ntest_line = \"solid\"\ncv_line = \"dashdot\"\n\n# plot curves and vertical lines for best iterations\nplt.figure(figsize=(8, 4.8))\nplt.plot(x, cumsum, label=\"OOB loss\", color=oob_color, linestyle=oob_line)\nplt.plot(x, test_score, label=\"Test loss\", color=test_color, linestyle=test_line)\nplt.plot(x, cv_score, label=\"CV loss\", color=cv_color, linestyle=cv_line)\nplt.axvline(x=oob_best_iter, color=oob_color, linestyle=oob_line)\nplt.axvline(x=test_best_iter, color=test_color, linestyle=test_line)\nplt.axvline(x=cv_best_iter, color=cv_color, linestyle=cv_line)\n\n# add three vertical lines to xticks\nxticks = plt.xticks()\nxticks_pos = np.array(\n    xticks[0].tolist() + [oob_best_iter, cv_best_iter, test_best_iter]\n)\nxticks_label = np.array(list(map(lambda t: int(t), xticks[0])) + [\"OOB\", \"CV\", \"Test\"])\nind = np.argsort(xticks_pos)\nxticks_pos = xticks_pos[ind]\nxticks_label = xticks_label[ind]\nplt.xticks(xticks_pos, xticks_label, rotation=90)\n\nplt.legend(loc=\"upper center\")\nplt.ylabel(\"normalized loss\")\nplt.xlabel(\"number of iterations\")\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}