{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e930091e-e136-4cb6-ab65-c93a7ca6165a",
   "metadata": {},
   "source": [
    "# Bias-Variance Decomposition for Regression Problems\n",
    "\n",
    "In this example, we will see how to calculate the bias-variance decomposition for regression problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "84f10ff2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "I0000 00:00:1701450845.494601 1 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from mvtk.bias_variance import bias_variance_compute, bias_variance_mse\n",
    "from mvtk.bias_variance.estimators import EstimatorWrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54ad5c92-5610-49a7-9b00-ec6340122b8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed reused by the train/test split and the bias-variance runs\n",
    "random_state = 123"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "056ddfd2",
   "metadata": {},
   "source": [
    "## Load the example dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1b6763e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the California housing data and put the features in a labelled DataFrame\n",
    "housing = fetch_california_housing()\n",
    "X, y = pd.DataFrame(housing.data, columns=housing.feature_names), housing.target\n",
    "\n",
    "# Hold out 30% of the rows as the test set (split seeded by random_state)\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.3, random_state=random_state\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73a13343",
   "metadata": {},
   "source": [
    "## Scikit-Learn Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "93bac7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "from mvtk.bias_variance.estimators import SciKitLearnEstimatorWrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5b8c734c",
   "metadata": {},
   "outputs": [],
   "source": [
"model_scikit = LinearRegression()" ] }, { "cell_type": "markdown", "id": "8b5f6354", "metadata": {}, "source": [ "## Need to instantiate a wrapper class for usage by the bias variance calculation" ] }, { "cell_type": "code", "execution_count": 6, "id": "642bc9e3", "metadata": {}, "outputs": [], "source": [ "model_scikit_wrapped = SciKitLearnEstimatorWrapper(model_scikit)" ] }, { "cell_type": "code", "execution_count": 7, "id": "f5ed38bf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average loss: 0.59902430\n", "average bias: 0.52119134\n", "average variance: 0.07783295\n", "net variance: 0.07783295\n" ] } ], "source": [ "# Use wrapped estimator\n", "avg_loss, avg_bias, avg_var, net_var = bias_variance_compute(model_scikit_wrapped, X_train, y_train, X_test, y_test, iterations=200, \n", " random_state=random_state, decomp_fn=bias_variance_mse)\n", "\n", "print(f'average loss: {avg_loss:10.8f}')\n", "print(f'average bias: {avg_bias:10.8f}')\n", "print(f'average variance: {avg_var:10.8f}')\n", "print(f'net variance: {net_var:10.8f}')" ] }, { "cell_type": "markdown", "id": "87fb7085", "metadata": {}, "source": [ "## PyTorch Example" ] }, { "cell_type": "code", "execution_count": 8, "id": "c8d7471f", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "\n", "from mvtk.bias_variance.estimators import PyTorchEstimatorWrapper" ] }, { "cell_type": "code", "execution_count": 9, "id": "d28a52e5", "metadata": {}, "outputs": [], "source": [ "X_train_torch = torch.FloatTensor(X_train.values)\n", "X_test_torch = torch.FloatTensor(X_test.values)\n", "y_train_torch = torch.FloatTensor(y_train).reshape(-1, 1)\n", "y_test_torch = torch.FloatTensor(y_test).reshape(-1, 1)" ] }, { "cell_type": "code", "execution_count": 10, "id": "60092549", "metadata": {}, "outputs": [], "source": [ "class ModelPyTorch(nn.Module):\n", " def __init__(self):\n", " super().__init__()\n", " self.linear1 = nn.Linear(8, 24)\n", " 
self.linear2 = nn.Linear(24, 12)\n", " self.linear3 = nn.Linear(12, 6)\n", " self.linear4 = nn.Linear(6, 1)\n", " \n", " def forward(self, x):\n", " x = self.linear1(x)\n", " x = self.linear2(x)\n", " x = self.linear3(x)\n", " x = self.linear4(x)\n", " return x" ] }, { "cell_type": "code", "execution_count": 11, "id": "1b3c2219", "metadata": {}, "outputs": [], "source": [ "model_pytorch = ModelPyTorch()\n", "optimizer = torch.optim.Adam(model_pytorch.parameters(), lr=0.001)\n", "loss_fn = nn.MSELoss()" ] }, { "cell_type": "markdown", "id": "e84d8252", "metadata": {}, "source": [ "## Need to instantiate a wrapper class for usage by the bias variance calculation" ] }, { "cell_type": "code", "execution_count": 12, "id": "de3dbd51", "metadata": {}, "outputs": [], "source": [ "def optimizer_generator(x):\n", " return torch.optim.Adam(x.parameters(), lr=0.001)" ] }, { "cell_type": "code", "execution_count": 13, "id": "b344f7f5", "metadata": {}, "outputs": [], "source": [ "model_pytorch_wrapped = PyTorchEstimatorWrapper(model_pytorch, optimizer_generator, loss_fn)" ] }, { "cell_type": "code", "execution_count": 14, "id": "b1920755", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average loss: 153.52321480\n", "average bias: 5.05105724\n", "average variance: 148.47215756\n", "net variance: 148.47215756\n" ] } ], "source": [ "# Use wrapped estimator\n", "avg_loss, avg_bias, avg_var, net_var = bias_variance_compute(model_pytorch_wrapped, X_train_torch, y_train_torch, X_test_torch, y_test, \n", " iterations=200, random_state=random_state, decomp_fn=bias_variance_mse, \n", " fit_kwargs={'epochs': 25})\n", "\n", "print(f'average loss: {avg_loss:10.8f}')\n", "print(f'average bias: {avg_bias:10.8f}')\n", "print(f'average variance: {avg_var:10.8f}')\n", "print(f'net variance: {net_var:10.8f}')" ] }, { "cell_type": "markdown", "id": "b0363203", "metadata": {}, "source": [ "## TensorFlow example" ] }, { "cell_type": "code", "execution_count": 
15,
   "id": "00c36e62",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from keras import initializers\n",
    "\n",
    "from mvtk.bias_variance.estimators import TensorFlowEstimatorWrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9780f64b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two 64-unit ReLU layers and a single-unit output layer; every kernel uses a\n",
    "# Glorot-uniform initializer created with seed=0\n",
    "model_tensorflow = tf.keras.Sequential([\n",
    "    tf.keras.layers.Dense(units, activation=activation,\n",
    "                          kernel_initializer=initializers.glorot_uniform(seed=0))\n",
    "    for units, activation in ((64, 'relu'), (64, 'relu'), (1, None))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b1fa3121",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the loss here is mean absolute error while the decomposition below uses bias_variance_mse -- confirm this is intended\n",
    "model_tensorflow.compile(\n",
    "    loss='mean_absolute_error',\n",
    "    metrics=['mean_squared_error'],\n",
    "    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ba3c0852",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_tensorflow_wrapped = TensorFlowEstimatorWrapper(model_tensorflow)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "19e95731",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "average loss: 12.15213883\n",
      "average bias: 3.08502204\n",
      "average variance: 9.06711679\n",
      "net variance: 9.06711679\n"
     ]
    }
   ],
   "source": [
    "# Use wrapped estimator\n",
    "avg_loss, avg_bias, avg_var, net_var = bias_variance_compute(\n",
    "    model_tensorflow_wrapped, X_train, y_train, X_test, y_test,\n",
    "    iterations=200,\n",
    "    random_state=random_state,\n",
    "    decomp_fn=bias_variance_mse,\n",
    "    fit_kwargs=dict(epochs=25, batch_size=5000, verbose=False),\n",
    "    predict_kwargs=dict(verbose=False),\n",
    ")\n",
    "\n",
    "print(f'average loss: {avg_loss:10.8f}')\n",
    "print(f'average bias: {avg_bias:10.8f}')\n",
    "print(f'average variance: {avg_var:10.8f}')\n",
    "print(f'net variance: {net_var:10.8f}')"
   ]
  },
  {
   "cell_type": "markdown",
"id": "ebc8b115", "metadata": {}, "source": [ "## We can run the same bias variance calculation in parallel for faster execution (in general for larger datasets and more intensive computations)" ] }, { "cell_type": "code", "execution_count": 20, "id": "84a38e59", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-12-01 12:15:52,912\tINFO worker.py:1673 -- Started a local Ray instance.\n", "\u001b[36m(pid=73435)\u001b[0m WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "\u001b[36m(pid=73435)\u001b[0m I0000 00:00:1701450956.584361 1 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "average loss: 7.90420163\n", "average bias: 4.06288774\n", "average variance: 3.84131389\n", "net variance: 3.84131389\n" ] } ], "source": [ "from mvtk.bias_variance import bias_variance_compute_parallel\n", "\n", "avg_loss, avg_bias, avg_var, net_var = bias_variance_compute_parallel(model_tensorflow_wrapped, X_train, y_train, X_test, y_test, \n", " iterations=200, random_state=random_state, \n", " decomp_fn=bias_variance_mse, \n", " fit_kwargs={'epochs': 25, 'batch_size': 5000, 'verbose': False}, \n", " predict_kwargs={'verbose': False})\n", "\n", "print(f'average loss: {avg_loss:10.8f}')\n", "print(f'average bias: {avg_bias:10.8f}')\n", "print(f'average variance: {avg_var:10.8f}')\n", "print(f'net variance: {net_var:10.8f}')" ] }, { "cell_type": "code", "execution_count": null, "id": "d7755b28", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }