{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Inductive Clustering\n\nClustering can be expensive, especially when our dataset contains millions\nof datapoints. Many clustering algorithms are not :term:`inductive` and so\ncannot be directly applied to new data samples without recomputing the\nclustering, which may be intractable. Instead, we can use clustering to then\nlearn an inductive model with a classifier, which has several benefits:\n\n- it allows the clusters to scale and apply to new data\n- unlike re-fitting the clusters to new samples, it makes sure the labelling\n  procedure is consistent over time\n- it allows us to use the inferential capabilities of the classifier to\n  describe or explain the clusters\n\nThis example illustrates a generic implementation of a meta-estimator which\nextends clustering by inducing a classifier from the cluster labels.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.base import BaseEstimator, clone\nfrom sklearn.cluster import AgglomerativeClustering\nfrom sklearn.datasets import make_blobs\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.inspection import DecisionBoundaryDisplay\nfrom sklearn.utils.metaestimators import available_if\nfrom sklearn.utils.validation import check_is_fitted\n\nN_SAMPLES = 5000\nRANDOM_STATE = 42\n\n\ndef _classifier_has(attr):\n    \"\"\"Check if we can delegate a method to the underlying classifier.\n\n    First, we check the first fitted classifier if available, otherwise we\n    check the unfitted classifier.\n    \"\"\"\n    return lambda estimator: (\n        hasattr(estimator.classifier_, attr)\n        if hasattr(estimator, \"classifier_\")\n        else hasattr(estimator.classifier, attr)\n    )\n\n\nclass InductiveClusterer(BaseEstimator):\n    def __init__(self, clusterer, classifier):\n        self.clusterer = clusterer\n        self.classifier = classifier\n\n    def fit(self, X, y=None):\n        self.clusterer_ = clone(self.clusterer)\n        self.classifier_ = clone(self.classifier)\n        y = self.clusterer_.fit_predict(X)\n        self.classifier_.fit(X, y)\n        return self\n\n    @available_if(_classifier_has(\"predict\"))\n    def predict(self, X):\n        check_is_fitted(self)\n        return self.classifier_.predict(X)\n\n    @available_if(_classifier_has(\"decision_function\"))\n    def decision_function(self, X):\n        check_is_fitted(self)\n        return self.classifier_.decision_function(X)\n\n\ndef plot_scatter(X, color, alpha=0.5):\n    return plt.scatter(X[:, 0], X[:, 1], c=color, alpha=alpha, edgecolor=\"k\")\n\n\n# Generate some training data from clustering\nX, y = make_blobs(\n    n_samples=N_SAMPLES,\n    cluster_std=[1.0, 1.0, 0.5],\n    centers=[(-5, -5), (0, 0), (5, 5)],\n    random_state=RANDOM_STATE,\n)\n\n\n# Train a clustering algorithm on the training data and get the cluster labels\nclusterer = AgglomerativeClustering(n_clusters=3)\ncluster_labels = clusterer.fit_predict(X)\n\nplt.figure(figsize=(12, 4))\n\nplt.subplot(131)\nplot_scatter(X, cluster_labels)\nplt.title(\"Ward Linkage\")\n\n\n# Generate new samples and plot them along with the original dataset\nX_new, y_new = make_blobs(\n    n_samples=10, centers=[(-7, -1), (-2, 4), (3, 6)], random_state=RANDOM_STATE\n)\n\nplt.subplot(132)\nplot_scatter(X, cluster_labels)\nplot_scatter(X_new, \"black\", 1)\nplt.title(\"Unknown instances\")\n\n\n# Declare the inductive learning model that it will be used to\n# predict cluster membership for unknown instances\nclassifier = RandomForestClassifier(random_state=RANDOM_STATE)\ninductive_learner = InductiveClusterer(clusterer, classifier).fit(X)\n\nprobable_clusters = inductive_learner.predict(X_new)\n\n\nax = plt.subplot(133)\nplot_scatter(X, cluster_labels)\nplot_scatter(X_new, probable_clusters)\n\n# Plotting decision regions\nDecisionBoundaryDisplay.from_estimator(\n    inductive_learner, X, response_method=\"predict\", alpha=0.4, ax=ax\n)\nplt.title(\"Classify unknown instances\")\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}