Created
February 15, 2026 21:37
-
-
Save robintux/c55a61e84a92f868842450aae830a2a2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import miceforest as mf | |
def _interpolate_by_country(df, columns):
    """Linearly interpolate ``columns`` in place, per country, in ``Year`` order."""
    for country in df['Country'].unique():
        mask = df['Country'] == country
        if not mask.any():
            # A NaN country never equals itself, so there is nothing to fill.
            continue
        # Interpolate along the time axis rather than along whatever row
        # order the caller happened to supply; the assignment below realigns
        # the result to the original rows by index label.
        ordered = df.loc[mask, ['Year', *columns]].sort_values('Year', kind='stable')
        for col in columns:
            # limit_direction='both' also fills leading and trailing gaps.
            df.loc[mask, col] = ordered[col].interpolate(
                method='linear', limit_direction='both'
            )


def impute_life_expectancy_data_robust(df):
    """
    Impute missing values with a different strategy per variable group.

    Steps:
    1. Cast ``Status`` to ``category`` and derive the 0/1 column
       ``Status_Developed``.
    2. Per-country linear interpolation (in ``Year`` order) of
       ``Life expectancy`` and ``Adult Mortality``.
    3. MICE imputation with miceforest of the socio-economic and health
       variables, using only non-object columns (identifiers such as
       ``Country`` are excluded from the kernel).
    4. If miceforest fails for any reason, fall back to
       ``fallback_sklearn_imputation``.
    5. ``Population`` is not imputed; its missingness is recorded in the 0/1
       column ``Population_missing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Country``, ``Year``, ``Status``, ``Population``,
        ``Life expectancy``, ``Adult Mortality`` and every column listed in
        ``predictor_vars`` below. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the imputed values and the two added columns.

    Raises
    ------
    AssertionError
        If any column selected for the MICE kernel has ``object`` dtype.
    KeyError
        If a required column is missing.
    """
    df_imp = df.copy()

    # === STEP 1: dtype diagnostics and preparation ===
    print("Diagnóstico inicial de tipos de datos:")
    print(df_imp.dtypes.value_counts())

    # miceforest rejects object columns, so Status becomes categorical.
    df_imp['Status'] = df_imp['Status'].astype('category')
    # Numeric dummy used as a predictor; a missing Status maps to 0.
    df_imp['Status_Developed'] = (df_imp['Status'] == 'Developed').astype(int)

    # === STEP 2: temporal interpolation of the epidemiological variables ===
    print("\nImputación temporal por país para variables epidemiológicas...")
    _interpolate_by_country(df_imp, ['Life expectancy', 'Adult Mortality'])

    # === STEP 3: variables handled by MICE ===
    mice_vars = [
        'GDP',
        'Income composition of resources',
        'Schooling',
        'percentage expenditure',
        'Total expenditure',
        'Alcohol',
        'BMI',
        'Hepatitis B',
        'Polio',
        'Diphtheria',
        'thinness 1-19 years',
        'thinness 5-9 years',
    ]
    # Extra predictors that inform the imputation but are not written back.
    predictor_vars = mice_vars + ['Status_Developed', 'Year', 'HIV/AIDS', 'infant deaths']
    df_mice = df_imp[predictor_vars].copy()

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``;
    # the exception type is kept for backward compatibility.
    object_cols = df_mice.select_dtypes(include=['object']).columns.tolist()
    if object_cols:
        raise AssertionError(f"Columnas object detectadas: {object_cols}")

    print(f"\nVariables en kernel MICE: {len(df_mice.columns)}")
    print(f"Missingness inicial en GDP: {df_mice['GDP'].isnull().sum()} ({df_mice['GDP'].isnull().mean()*100:.1f}%)")

    # === STEP 4: MICE imputation, with sklearn as a safety net ===
    try:
        kernel = mf.ImputationKernel(
            df_mice,
            save_all_iterations_data=True,
            random_state=42,
        )
        kernel.mice(5, verbose=True)  # 5 iterations
        # Completed dataset taken from the last iteration.
        df_mice_complete = kernel.complete_data(iteration=-1)
    except Exception as e:
        # Deliberately broad: miceforest/LightGBM can fail with many exception
        # types (not only AssertionError) and any of them should trigger the
        # fallback. Errors raised by the fallback itself still propagate.
        print(f"\n Error en miceforest: {e}")
        print("Fallback a IterativeImputer de sklearn...")
        df_imp = fallback_sklearn_imputation(df_imp, mice_vars)
    else:
        # Only the target variables are copied back; the extra predictors
        # keep their original values.
        for col in mice_vars:
            df_imp[col] = df_mice_complete[col]
        print("\n✓ Imputación MICE exitosa")
        print(f"Missingness final en GDP: {df_imp['GDP'].isnull().sum()} ({df_imp['GDP'].isnull().mean()*100:.1f}%)")

    # === STEP 5: Population (suspected MNAR) ===
    # Population is intentionally left un-imputed; its missingness is kept
    # as an indicator feature instead.
    df_imp['Population_missing'] = df_imp['Population'].isnull().astype(int)

    return df_imp
def fallback_sklearn_imputation(df, variables_to_impute):
    """
    Impute numeric columns with scikit-learn's ``IterativeImputer``.

    Used as a fallback when miceforest fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    variables_to_impute : list of str
        Candidate column names. Only columns with a NumPy integer or float
        dtype that contain at least one observed value are imputed; every
        other candidate is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns imputed.

    Raises
    ------
    KeyError
        If a name in ``variables_to_impute`` is not a column of ``df``.
    """
    df_fallback = df.copy()

    # Accept any NumPy integer/float dtype (not just float64/int64).
    # Columns with no observed value are skipped: IterativeImputer drops them
    # from its output by default, which would break the write-back below.
    numeric_vars = [
        col for col in variables_to_impute
        if isinstance(df_fallback[col].dtype, np.dtype)
        and df_fallback[col].dtype.kind in "iuf"
        and df_fallback[col].notna().any()
    ]

    if not numeric_vars:
        # Nothing to impute; return before importing scikit-learn.
        print(f" Fallback sklearn completado para {len(numeric_vars)} variables")
        return df_fallback

    # Imported lazily so scikit-learn is only required when the fallback
    # actually runs. The experimental import must precede IterativeImputer.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    imputer = IterativeImputer(
        max_iter=10,
        random_state=42,
        verbose=1,
        initial_strategy='median',  # less sensitive to outliers than 'mean'
    )

    # Each selected column is imputed using the other selected columns.
    df_fallback[numeric_vars] = imputer.fit_transform(df_fallback[numeric_vars])
    print(f" Fallback sklearn completado para {len(numeric_vars)} variables")

    return df_fallback
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment