Non unique Markov blanket example.
@ogrisel · Last active September 25, 2025
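Background: a Markov blanket of a target y is a subset S of the features such
that y is independent of all remaining features conditionally on S; a minimal
such subset is also called a Markov boundary. The notebook below constructs a
regression problem whose Markov blanket is not unique and compares how
impurity-based (MDI), permutation (PFI) and conditional permutation (CFI)
feature importances behave on the different admissible feature subsets.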
# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from hidimstat import CFI, PFI
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import SplineTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


rng = np.random.default_rng(0)
Z = rng.random((50_000, 3))  # predictive variables (directly observed or not)
N = rng.random(size=(Z.shape[0], 1))  # non-predictive yet observed noise variable(s)

# The target is the sum of the predictive variables.
y = Z.sum(axis=1)

# The first 3 observed features are quadratic monomials of the first 2
# predictive variables.
X = np.hstack(
    [
        Z[:, [0, 1]] ** 2,  # squares of predictive variables 0 and 1
        Z[:, [0]] * Z[:, [1]],  # product of predictive variables 0 and 1
        Z[:, [2]],  # the 4th feature is the 3rd predictive variable (directly observed)
        N,  # non-predictive noise variable(s)
    ]
)
print(f"X.shape = {X.shape}")
# Output: X.shape = (50000, 5)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)


def plot_importances(rf, X_test, y_test, n_jobs=None, n_digits=2):
    # Report the test R^2 and three importance measures: impurity-based (MDI),
    # permutation (PFI) and conditional permutation (CFI) importances.
    print(f"R^2: {rf.score(X_test, y_test):.3f}")
    print(f"MDI importances: {rf.feature_importances_.round(n_digits)}")
    pfi = PFI(
        rf, loss=mean_squared_error, n_permutations=10, n_jobs=n_jobs, random_state=0
    )
    pfi.fit(X_test, y_test)
    print(
        f"PFI importances: {pfi.importance(X_test, y_test)['importance'].round(n_digits)}"
    )
    cfi = CFI(
        rf,
        loss=mean_squared_error,
        # Alternative imputation model:
        # imputation_model_continuous=RandomForestRegressor(min_samples_leaf=50),
        imputation_model_continuous=make_pipeline(
            SplineTransformer(degree=3, n_knots=10, include_bias=False),
            Nystroem(kernel="poly", degree=2, n_components=1000, random_state=0),
            RidgeCV(alphas=np.logspace(-6, 6, 10)),
        ),
        n_permutations=10,
        n_jobs=n_jobs,
        random_state=0,
    )
    cfi.fit(X_test, y_test)
    print(
        f"CFI importances: {cfi.importance(X_test, y_test)['importance'].round(n_digits)}"
    )
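# %% [markdown]
#
# With $x_0 = z_0^2$, $x_1 = z_1^2$, $x_2 = z_0 z_1$, $x_3 = z_2$ and
# $z_i \in (0, 1)$, the target $y = z_0 + z_1 + z_2$ can be written from any
# 2 of the first 3 features plus $x_3$:
#
# $$y = \sqrt{x_0} + \sqrt{x_1} + x_3
#     = \sqrt{x_0} + \frac{x_2}{\sqrt{x_0}} + x_3
#     = \frac{x_2}{\sqrt{x_1}} + \sqrt{x_1} + x_3.$$
#
# Hence $\{0, 1, 3\}$, $\{0, 2, 3\}$ and $\{1, 2, 3\}$ are all Markov
# blankets of $y$: the Markov blanket is not unique.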
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all the target
# variance can be explained by the information contained in the features
# (and the RF model is expressive enough and the sample size is large
# enough).
# %%
rf_full = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
    X_train, y_train
)
plot_importances(rf_full, X_test, y_test, n_jobs=-1)

# Output:
# R^2: 0.999
# MDI importances: [0.03 0.03 0.6 0.33 0. ]
# PFI importances: [ 0.03 0.03 0.22 0.16 -0. ]
# CFI importances: [ 0. 0. 0. 0.16 -0. ]
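# %% [markdown]
#
# Note how the methods disagree on the 3 redundant features: PFI still
# credits them (permuting any one of them breaks its dependence with the
# others and degrades the predictions), while CFI assigns them (near) zero
# importance. Each of the first 3 features is uninformative conditionally on
# the other two, so each of them can be dropped while the remaining features
# still contain a Markov blanket of the target.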
# %% [markdown]
#
# Using any 2 of the first 3 features, together with feature 3 (the directly
# observed predictive variable), leads to a near perfect $R^2$ because all
# the target variance can be explained by the information contained in any
# such triplet (and the RF model is expressive enough and the sample size is
# large enough).
# %%
for feature_set in ([0, 1, 3], [1, 2, 3], [0, 2, 3]):
    print(f"Using features {feature_set}")
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
        X_train[:, feature_set],
        y_train,
    )
    plot_importances(rf, X_test[:, feature_set], y_test, n_jobs=-1)
    print()

# Output:
# Using features [0, 1, 3]
# R^2: 0.999
# MDI importances: [0.33 0.33 0.34]
# PFI importances: [0.17 0.17 0.16]
# CFI importances: [0.17 0.17 0.16]
#
# Using features [1, 2, 3]
# R^2: 0.998
# MDI importances: [0.05 0.62 0.33]
# PFI importances: [0.04 0.28 0.16]
# CFI importances: [0.04 0.19 0.16]
#
# Using features [0, 2, 3]
# R^2: 0.998
# MDI importances: [0.05 0.62 0.33]
# PFI importances: [0.04 0.28 0.16]
# CFI importances: [0.04 0.19 0.16]
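# %% [markdown]
#
# As a sanity check, the closed-form identities given above can be verified
# numerically on the generated arrays (this check is an editorial addition,
# not part of the original experiment):

# %%
x0, x1, x2, x3 = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
# Each Markov blanket reconstructs the target exactly (the z values are
# positive almost surely, so the square roots invert the squares).
assert np.allclose(np.sqrt(x0) + np.sqrt(x1) + x3, y)  # features {0, 1, 3}
assert np.allclose(np.sqrt(x0) + x2 / np.sqrt(x0) + x3, y)  # features {0, 2, 3}
assert np.allclose(x2 / np.sqrt(x1) + np.sqrt(x1) + x3, y)  # features {1, 2, 3}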
# %% [markdown]
#
# Keeping feature 3 and only one of the first 3 features leads to a
# significantly degraded $R^2$ because the target cannot be written as a
# function of any such pair of features.
# %%
for feature_set in ([0, 3], [1, 3], [2, 3]):
    print(f"Using features {feature_set}")
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
        X_train[:, feature_set],
        y_train,
    )
    plot_importances(rf, X_test[:, feature_set], y_test)
    print()

# Output:
# Using features [0, 3]
# R^2: 0.598
# MDI importances: [0.49 0.51]
# PFI importances: [0.16 0.16]
# CFI importances: [0.16 0.16]
#
# Using features [1, 3]
# R^2: 0.593
# MDI importances: [0.49 0.51]
# PFI importances: [0.16 0.17]
# CFI importances: [0.16 0.17]
#
# Using features [2, 3]
# R^2: 0.921
# MDI importances: [0.64 0.36]
# PFI importances: [0.3 0.16]
# CFI importances: [0.3 0.16]
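# %% [markdown]
#
# For these 2-feature models PFI and CFI agree (up to estimation noise): the
# retained features are mutually independent by construction, so marginal and
# conditional permutation schemes coincide. Also note that the pair
# $\{2, 3\}$ retains more information ($R^2 = 0.921$): $x_2 = z_0 z_1$ alone
# is more informative about $z_0 + z_1$ than either square alone.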
],
"metadata": {
"kernelspec": {
"display_name": "dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# %% [markdown]
#
# The same phenomenon in a minimal, self-contained variant: 2 latent
# variables, 3 observed quadratic features, and only cross-validated random
# forests with their built-in MDI importances.

# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

rng = np.random.default_rng(0)
Z = rng.random((100_000, 2))
# The target is the sum of the two latent variables.
y = Z.sum(axis=1)
# The observed features are quadratic monomials of the latent variables.
X = np.hstack([Z**2, Z[:, [0]] * Z[:, [1]]])
assert X.shape == (100_000, 3)
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all the target
# variance can be explained by the information contained in the features
# (and the RF model is expressive enough and the sample size is large
# enough).
# %%
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    X,
    y,
    scoring="r2",
    return_estimator=True,
    cv=2,
)
print(f"R^2 using all 3 features: {cv_results['test_score'].mean():.3f}")
for rf in cv_results["estimator"]:
    print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Using any 2 of the 3 features leads to a near perfect $R^2$ because all the
# target variance can be explained by the information contained in any such
# pair (and the RF model is expressive enough and the sample size is large
# enough).
# %%
for feature_set in ([0, 1], [1, 2], [0, 2]):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        X[:, feature_set],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using features {feature_set}: {cv_results['test_score'].mean():.3f}")
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Any individual feature alone leads to a poor $R^2$ because the target
# cannot be written as a function of a single observed feature.
# %%
for i in range(3):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        X[:, [i]],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using feature {i}: {cv_results['test_score'].mean():.3f}")
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
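# %% [markdown]
#
# For completeness, a sketch applying conditional importance to this
# 3-feature variant (the hidimstat CFI calls and the imputation pipeline are
# reused verbatim from the notebook above; treat this as an illustrative
# addition, not part of the original script). Since each feature is redundant
# given the other two, all three conditional importances are expected to be
# near zero, which is exactly the non-uniqueness of the Markov blanket:

# %%
from hidimstat import CFI
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0).fit(
    X_train, y_train
)
cfi = CFI(
    rf,
    loss=mean_squared_error,
    # Flexible imputation model, as in the notebook above.
    imputation_model_continuous=make_pipeline(
        SplineTransformer(degree=3, n_knots=10, include_bias=False),
        Nystroem(kernel="poly", degree=2, n_components=1000, random_state=0),
        RidgeCV(alphas=np.logspace(-6, 6, 10)),
    ),
    n_permutations=10,
    n_jobs=-1,
    random_state=0,
)
cfi.fit(X_test, y_test)
# Expected: all 3 conditional importances close to zero.
print(cfi.importance(X_test, y_test)["importance"].round(3))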