Created
February 15, 2026 22:19
-
-
Save robintux/7b408e7e8a810d5ce209e62ff1b91668 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def temporal_train_test_split(df, predictors, target, test_size=0.2):
    """
    Split a panel dataset chronologically into train and test sets.

    Rows are ordered by ``Year``. Every row whose year is at or before the
    cutoff goes to train and every later row goes to test, so no training
    row is more recent than any test row. The cutoff is the
    ``1 - test_size`` quantile of the ``Year`` column; all rows of a given
    year always land on the same side.

    If that quantile coincides with the latest year (which would leave the
    test set empty) and ``test_size > 0``, the cutoff is moved back to the
    latest earlier year so that the final year becomes the test set.

    The function does not balance countries between the two sets; it only
    reports how many values of ``Country`` appear in both.

    Args:
        df: DataFrame with ``Year`` and ``Country`` columns plus the
            predictor and target columns. ``Year`` is expected to be
            numeric (the cutoff is printed with a float format).
        predictors: Column label(s) used to build the feature arrays.
        target: Column label of the response variable.
        test_size: Approximate fraction of rows to place in the test set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, train, test)`` where the
        first four items are the ``.values`` arrays of the selected columns
        and the last two are the train and test DataFrames.
    """
    import warnings

    # Stable sort: rows sharing a year keep their input order, which makes
    # the returned arrays reproducible.
    df_sorted = df.sort_values('Year', kind='mergesort').reset_index(drop=True)
    years = df_sorted['Year']

    # Temporal cutoff from the year distribution.
    cutoff_year = years.quantile(1 - test_size)

    # When the last year holds more than ~test_size of the rows, the quantile
    # equals the maximum year and ``Year > cutoff`` would select nothing.
    # Step back to the latest earlier year, if one exists.
    last_year = years.max()
    if test_size > 0 and cutoff_year >= last_year:
        earlier_years = years[years < last_year]
        if not earlier_years.empty:
            cutoff_year = float(earlier_years.max())

    train = df_sorted[years <= cutoff_year]
    test = df_sorted[years > cutoff_year]

    if test_size > 0 and test.empty:
        warnings.warn(
            "temporal_train_test_split produced an empty test set: "
            "'Year' has fewer than two distinct values.",
            stacklevel=2,
        )

    # Informational only: in a temporal split the same countries normally
    # appear on both sides of the cutoff.
    countries_train = set(train['Country'].unique())
    countries_test = set(test['Country'].unique())
    countries_overlap = countries_train & countries_test

    print("División temporal:")
    print(f" - Año de corte: {cutoff_year:.0f}")
    print(f" - Train: {len(train)} obs ({train['Year'].min()}-{train['Year'].max()})")
    print(f" - Test: {len(test)} obs ({test['Year'].min()}-{test['Year'].max()})")
    print(f" - Países en ambos sets: {len(countries_overlap)} (evitar overfitting)")

    return (
        train[predictors].values,
        train[target].values,
        test[predictors].values,
        test[target].values,
        train, test
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment