{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Feature agglomeration vs. univariate selection\n\nThis example compares 2 dimensionality reduction strategies:\n\n- univariate feature selection with Anova\n\n- feature agglomeration with Ward hierarchical clustering\n\nBoth methods are compared in a regression problem using\na BayesianRidge as supervised estimator.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import shutil\nimport tempfile\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom joblib import Memory\nfrom scipy import linalg, ndimage\n\nfrom sklearn import feature_selection\nfrom sklearn.cluster import FeatureAgglomeration\nfrom sklearn.feature_extraction.image import grid_to_graph\nfrom sklearn.linear_model import BayesianRidge\nfrom sklearn.model_selection import GridSearchCV, KFold\nfrom sklearn.pipeline import Pipeline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Set parameters\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "n_samples = 200\nsize = 40  # image size\nroi_size = 15\nsnr = 5.0\nnp.random.seed(0)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Generate data\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "coef = np.zeros((size, size))\ncoef[0:roi_size, 0:roi_size] = -1.0\ncoef[-roi_size:, -roi_size:] = 1.0\n\nX = np.random.randn(n_samples, size**2)\nfor x in X:  # smooth data\n    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()\nX -= X.mean(axis=0)\nX /= X.std(axis=0)\n\ny = np.dot(X, coef.ravel())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "add noise\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "noise = np.random.randn(y.shape[0])\nnoise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)\ny += noise_coef * noise"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute the coefs of a Bayesian Ridge with GridSearch\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "cv = KFold(2)  # cross-validation generator for model selection\nridge = BayesianRidge()\ncachedir = tempfile.mkdtemp()\nmem = Memory(location=cachedir, verbose=1)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Ward agglomeration followed by BayesianRidge\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "connectivity = grid_to_graph(n_x=size, n_y=size)\nward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)\nclf = Pipeline([(\"ward\", ward), (\"ridge\", ridge)])\n# Select the optimal number of parcels with grid search\nclf = GridSearchCV(clf, {\"ward__n_clusters\": [10, 20, 30]}, n_jobs=1, cv=cv)\nclf.fit(X, y)  # set the best parameters\ncoef_ = clf.best_estimator_.steps[-1][1].coef_\ncoef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)\ncoef_agglomeration_ = coef_.reshape(size, size)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Anova univariate feature selection followed by BayesianRidge\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "f_regression = mem.cache(feature_selection.f_regression)  # caching function\nanova = feature_selection.SelectPercentile(f_regression)\nclf = Pipeline([(\"anova\", anova), (\"ridge\", ridge)])\n# Select the optimal percentage of features with grid search\nclf = GridSearchCV(clf, {\"anova__percentile\": [5, 10, 20]}, cv=cv)\nclf.fit(X, y)  # set the best parameters\ncoef_ = clf.best_estimator_.steps[-1][1].coef_\ncoef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))\ncoef_selection_ = coef_.reshape(size, size)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Inverse the transformation to plot the results on an image\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "plt.close(\"all\")\nplt.figure(figsize=(7.3, 2.7))\nplt.subplot(1, 3, 1)\nplt.imshow(coef, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"True weights\")\nplt.subplot(1, 3, 2)\nplt.imshow(coef_selection_, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"Feature Selection\")\nplt.subplot(1, 3, 3)\nplt.imshow(coef_agglomeration_, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"Feature Agglomeration\")\nplt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)\nplt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Attempt to remove the temporary cachedir, but don't worry if it fails\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "shutil.rmtree(cachedir, ignore_errors=True)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}