Non unique Markov blanket example.
@ogrisel · Last active September 25, 2025
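Background: a Markov blanket of a target y is a subset S of the features such
that y is independent of all remaining features conditionally on S; a minimal
such subset is also called a Markov boundary. The notebook below constructs a
regression problem whose Markov blanket is not unique and compares how
impurity-based (MDI), permutation (PFI) and conditional permutation (CFI)
feature importances behave on the different admissible feature subsets.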
# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from hidimstat import CFI, PFI
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import SplineTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


rng = np.random.default_rng(0)
Z = rng.random((50_000, 3))  # predictive variables (directly observed or not)
N = rng.random(size=(Z.shape[0], 1))  # non-predictive yet observed noise variable(s)

# The target is the sum of the predictive variables.
y = Z.sum(axis=1)

# The first 3 observed features are quadratic monomials of the first 2
# predictive variables.
X = np.hstack(
    [
        Z[:, [0, 1]] ** 2,  # squares of predictive variables 0 and 1
        Z[:, [0]] * Z[:, [1]],  # product of predictive variables 0 and 1
        Z[:, [2]],  # the 4th feature is the 3rd predictive variable (directly observed)
        N,  # non-predictive noise variable(s)
    ]
)
print(f"X.shape = {X.shape}")
# Output: X.shape = (50000, 5)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)


def plot_importances(rf, X_test, y_test, n_jobs=None, n_digits=2):
    # Report the test R^2 and three importance measures: impurity-based (MDI),
    # permutation (PFI) and conditional permutation (CFI) importances.
    print(f"R^2: {rf.score(X_test, y_test):.3f}")
    print(f"MDI importances: {rf.feature_importances_.round(n_digits)}")
    pfi = PFI(
        rf, loss=mean_squared_error, n_permutations=10, n_jobs=n_jobs, random_state=0
    )
    pfi.fit(X_test, y_test)
    print(
        f"PFI importances: {pfi.importance(X_test, y_test)['importance'].round(n_digits)}"
    )
    cfi = CFI(
        rf,
        loss=mean_squared_error,
        # Alternative imputation model:
        # imputation_model_continuous=RandomForestRegressor(min_samples_leaf=50),
        imputation_model_continuous=make_pipeline(
            SplineTransformer(degree=3, n_knots=10, include_bias=False),
            Nystroem(kernel="poly", degree=2, n_components=1000, random_state=0),
            RidgeCV(alphas=np.logspace(-6, 6, 10)),
        ),
        n_permutations=10,
        n_jobs=n_jobs,
        random_state=0,
    )
    cfi.fit(X_test, y_test)
    print(
        f"CFI importances: {cfi.importance(X_test, y_test)['importance'].round(n_digits)}"
    )
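# %% [markdown]
#
# With $x_0 = z_0^2$, $x_1 = z_1^2$, $x_2 = z_0 z_1$, $x_3 = z_2$ and
# $z_i \in (0, 1)$, the target $y = z_0 + z_1 + z_2$ can be written from any
# 2 of the first 3 features plus $x_3$:
#
# $$y = \sqrt{x_0} + \sqrt{x_1} + x_3
#     = \sqrt{x_0} + \frac{x_2}{\sqrt{x_0}} + x_3
#     = \frac{x_2}{\sqrt{x_1}} + \sqrt{x_1} + x_3.$$
#
# Hence $\{0, 1, 3\}$, $\{0, 2, 3\}$ and $\{1, 2, 3\}$ are all Markov
# blankets of $y$: the Markov blanket is not unique.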
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all the target
# variance can be explained by the information contained in the features
# (and the RF model is expressive enough and the sample size is large
# enough).
# %%
rf_full = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
    X_train, y_train
)
plot_importances(rf_full, X_test, y_test, n_jobs=-1)

# Output:
# R^2: 0.999
# MDI importances: [0.03 0.03 0.6 0.33 0. ]
# PFI importances: [ 0.03 0.03 0.22 0.16 -0. ]
# CFI importances: [ 0. 0. 0. 0.16 -0. ]
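# %% [markdown]
#
# Note how the methods disagree on the 3 redundant features: PFI still
# credits them (permuting any one of them breaks its dependence with the
# others and degrades the predictions), while CFI assigns them (near) zero
# importance. Each of the first 3 features is uninformative conditionally on
# the other two, so each of them can be dropped while the remaining features
# still contain a Markov blanket of the target.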
# %% [markdown]
#
# Using any 2 of the first 3 features, together with feature 3 (the directly
# observed predictive variable), leads to a near perfect $R^2$ because all
# the target variance can be explained by the information contained in any
# such triplet (and the RF model is expressive enough and the sample size is
# large enough).
# %%
for feature_set in ([0, 1, 3], [1, 2, 3], [0, 2, 3]):
    print(f"Using features {feature_set}")
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
        X_train[:, feature_set],
        y_train,
    )
    plot_importances(rf, X_test[:, feature_set], y_test, n_jobs=-1)
    print()

# Output:
# Using features [0, 1, 3]
# R^2: 0.999
# MDI importances: [0.33 0.33 0.34]
# PFI importances: [0.17 0.17 0.16]
# CFI importances: [0.17 0.17 0.16]
#
# Using features [1, 2, 3]
# R^2: 0.998
# MDI importances: [0.05 0.62 0.33]
# PFI importances: [0.04 0.28 0.16]
# CFI importances: [0.04 0.19 0.16]
#
# Using features [0, 2, 3]
# R^2: 0.998
# MDI importances: [0.05 0.62 0.33]
# PFI importances: [0.04 0.28 0.16]
# CFI importances: [0.04 0.19 0.16]
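# %% [markdown]
#
# As a sanity check, the closed-form identities given above can be verified
# numerically on the generated arrays (this check is an editorial addition,
# not part of the original experiment):

# %%
x0, x1, x2, x3 = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
# Each Markov blanket reconstructs the target exactly (the z values are
# positive almost surely, so the square roots invert the squares).
assert np.allclose(np.sqrt(x0) + np.sqrt(x1) + x3, y)  # features {0, 1, 3}
assert np.allclose(np.sqrt(x0) + x2 / np.sqrt(x0) + x3, y)  # features {0, 2, 3}
assert np.allclose(x2 / np.sqrt(x1) + np.sqrt(x1) + x3, y)  # features {1, 2, 3}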
# %% [markdown]
#
# Keeping feature 3 and only one of the first 3 features leads to a
# significantly degraded $R^2$ because the target cannot be written as a
# function of any such pair of features.
# %%
for feature_set in ([0, 3], [1, 3], [2, 3]):
    print(f"Using features {feature_set}")
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
        X_train[:, feature_set],
        y_train,
    )
    plot_importances(rf, X_test[:, feature_set], y_test)
    print()

# Output:
# Using features [0, 3]
# R^2: 0.598
# MDI importances: [0.49 0.51]
# PFI importances: [0.16 0.16]
# CFI importances: [0.16 0.16]
#
# Using features [1, 3]
# R^2: 0.593
# MDI importances: [0.49 0.51]
# PFI importances: [0.16 0.17]
# CFI importances: [0.16 0.17]
#
# Using features [2, 3]
# R^2: 0.921
# MDI importances: [0.64 0.36]
# PFI importances: [0.3 0.16]
# CFI importances: [0.3 0.16]
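# %% [markdown]
#
# For these 2-feature models PFI and CFI agree (up to estimation noise): the
# retained features are mutually independent by construction, so marginal and
# conditional permutation schemes coincide. Also note that the pair
# $\{2, 3\}$ retains more information ($R^2 = 0.921$): $x_2 = z_0 z_1$ alone
# is more informative about $z_0 + z_1$ than either square alone.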
],
"metadata": {
"kernelspec": {
"display_name": "dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# %% [markdown]
#
# The same phenomenon in a minimal, self-contained variant: 2 latent
# variables, 3 observed quadratic features, and only cross-validated random
# forests with their built-in MDI importances.

# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

rng = np.random.default_rng(0)
Z = rng.random((100_000, 2))
# The target is the sum of the two latent variables.
y = Z.sum(axis=1)
# The observed features are quadratic monomials of the latent variables.
X = np.hstack([Z**2, Z[:, [0]] * Z[:, [1]]])
assert X.shape == (100_000, 3)
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all the target
# variance can be explained by the information contained in the features
# (and the RF model is expressive enough and the sample size is large
# enough).
# %%
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    X,
    y,
    scoring="r2",
    return_estimator=True,
    cv=2,
)
print(f"R^2 using all 3 features: {cv_results['test_score'].mean():.3f}")
for rf in cv_results["estimator"]:
    print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Using any 2 of the 3 features leads to a near perfect $R^2$ because all the
# target variance can be explained by the information contained in any such
# pair (and the RF model is expressive enough and the sample size is large
# enough).
# %%
for feature_set in ([0, 1], [1, 2], [0, 2]):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        X[:, feature_set],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using features {feature_set}: {cv_results['test_score'].mean():.3f}")
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Any individual feature alone leads to a poor $R^2$ because the target
# cannot be written as a function of a single observed feature.
# %%
for i in range(3):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        X[:, [i]],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using feature {i}: {cv_results['test_score'].mean():.3f}")
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
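# %% [markdown]
#
# For completeness, a sketch applying conditional importance to this
# 3-feature variant (the hidimstat CFI calls and the imputation pipeline are
# reused verbatim from the notebook above; treat this as an illustrative
# addition, not part of the original script). Since each feature is redundant
# given the other two, all three conditional importances are expected to be
# near zero, which is exactly the non-uniqueness of the Markov blanket:

# %%
from hidimstat import CFI
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
    X_train, y_train
)
cfi = CFI(
    rf,
    loss=mean_squared_error,
    # Flexible imputation model, as in the notebook above.
    imputation_model_continuous=make_pipeline(
        SplineTransformer(degree=3, n_knots=10, include_bias=False),
        Nystroem(kernel="poly", degree=2, n_components=1000, random_state=0),
        RidgeCV(alphas=np.logspace(-6, 6, 10)),
    ),
    n_permutations=10,
    n_jobs=-1,
    random_state=0,
)
cfi.fit(X_test, y_test)
# Expected: all 3 conditional importances close to zero.
print(cfi.importance(X_test, y_test)["importance"].round(3))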