Skip to content

Instantly share code, notes, and snippets.

@robintux
Created February 15, 2026 21:37
Show Gist options
  • Select an option

  • Save robintux/c55a61e84a92f868842450aae830a2a2 to your computer and use it in GitHub Desktop.

Select an option

Save robintux/c55a61e84a92f868842450aae830a2a2 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import miceforest as mf
def impute_life_expectancy_data_robust(df):
    """
    Differentiated imputation pipeline with explicit data-type handling.

    Works around miceforest's rejection of 'object' columns by:
    1. Converting categoricals to dtype 'category'
    2. Excluding identifier columns from the MICE kernel
    3. Per-country temporal interpolation for epidemiological variables
    4. Falling back to sklearn's IterativeImputer if miceforest fails

    Parameters
    ----------
    df : pandas.DataFrame
        WHO life-expectancy dataset; expected columns include 'Country',
        'Year', 'Status', 'Life expectancy', 'Adult Mortality', 'GDP',
        'Population' and the epidemiological indicators listed below.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with imputed values plus two engineered columns:
        'Status_Developed' (0/1 dummy) and 'Population_missing'
        (0/1 missingness indicator).
    """
    df_imp = df.copy()

    # === STEP 1: dtype diagnosis and preparation ===
    print("Diagnóstico inicial de tipos de datos:")
    print(df_imp.dtypes.value_counts())

    # miceforest requires categoricals as dtype 'category', not 'object'.
    df_imp['Status'] = df_imp['Status'].astype('category')
    # Numeric 0/1 dummy so Status can serve as a MICE predictor.
    df_imp['Status_Developed'] = (df_imp['Status'] == 'Developed').astype(int)

    # === STEP 2: temporal imputation for MCAR variables ===
    # (Life expectancy, Adult Mortality)
    print("\nImputación temporal por país para variables epidemiológicas...")
    # Single groupby pass replaces the original per-country boolean-mask loop;
    # result is identical: linear interpolation within each country, with
    # forward + backward fill at the series edges (limit_direction='both').
    for col in ('Life expectancy', 'Adult Mortality'):
        df_imp[col] = df_imp.groupby('Country')[col].transform(
            lambda s: s.interpolate(method='linear', limit_direction='both')
        )

    # === STEP 3: variables for MICE (numeric + converted categoricals only) ===
    mice_vars = [
        'GDP',
        'Income composition of resources',
        'Schooling',
        'percentage expenditure',
        'Total expenditure',
        'Alcohol',
        'BMI',
        'Hepatitis B',
        'Polio',
        'Diphtheria',
        'thinness 1-19 years',
        'thinness 5-9 years',
    ]
    # Predictor set excludes the object-typed identifiers ('Country', 'Status').
    predictor_vars = mice_vars + ['Status_Developed', 'Year', 'HIV/AIDS', 'infant deaths']

    # Clean subset for the kernel (only miceforest-compatible columns).
    df_mice = df_imp[predictor_vars].copy()

    # Critical check: no column may be dtype 'object'.
    assert df_mice.select_dtypes(include=['object']).empty, \
        f"Columnas object detectadas: {df_mice.select_dtypes(include=['object']).columns.tolist()}"

    print(f"\nVariables en kernel MICE: {len(df_mice.columns)}")
    print(f"Missingness inicial en GDP: {df_mice['GDP'].isnull().sum()} ({df_mice['GDP'].isnull().mean()*100:.1f}%)")

    # === STEP 4: MICE imputation with convergence diagnostics ===
    try:
        kernel = mf.ImputationKernel(
            df_mice,
            save_all_iterations_data=True,
            random_state=42
        )
        kernel.mice(5, verbose=True)  # 5 iterations for robustness

        # Extract the fully imputed dataset (last iteration).
        df_mice_complete = kernel.complete_data(iteration=-1)

        # Write imputed values back into the main dataframe.
        for col in mice_vars:
            df_imp[col] = df_mice_complete[col]

        print(f"\n✓ Imputación MICE exitosa")
        print(f"Missingness final en GDP: {df_imp['GDP'].isnull().sum()} ({df_imp['GDP'].isnull().mean()*100:.1f}%)")
    except Exception as e:
        # BUG FIX: the original caught only AssertionError, but the lone
        # assert runs *before* this try block — miceforest failures raise
        # ValueError/TypeError/etc., so the fallback could never trigger.
        print(f"\n Error en miceforest: {e}")
        print("Fallback a IterativeImputer de sklearn...")
        df_imp = fallback_sklearn_imputation(df_imp, mice_vars)

    # === STEP 5: MNAR variable handling (Population) ===
    # Population is NOT imputed (suspected MNAR); keep missingness as a feature.
    df_imp['Population_missing'] = df_imp['Population'].isnull().astype(int)

    return df_imp
def fallback_sklearn_imputation(df, variables_to_impute):
    """
    Robust fallback using sklearn's IterativeImputer when miceforest fails.

    Slower than miceforest, but compatible with any numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the columns to impute.
    variables_to_impute : list of str
        Candidate column names; non-numeric columns are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with the numeric candidate columns imputed in place.
    """
    # Import enables the experimental IterativeImputer (required by sklearn).
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    df_fallback = df.copy()

    # BUG FIX: the original test `dtype in [np.float64, np.int64]` silently
    # skipped int32/float32 and nullable numeric dtypes; use pandas' dtype
    # predicate so every numeric column is eligible.
    numeric_vars = [
        col for col in variables_to_impute
        if pd.api.types.is_numeric_dtype(df_fallback[col])
    ]

    imputer = IterativeImputer(
        max_iter=10,
        random_state=42,
        verbose=1,
        initial_strategy='median'  # more outlier-robust than 'mean'
    )

    # Impute only the selected numeric columns.
    df_fallback[numeric_vars] = imputer.fit_transform(df_fallback[numeric_vars])

    print(f" Fallback sklearn completado para {len(numeric_vars)} variables")
    return df_fallback
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment