# IMPLEMENTATION OF PREDICTIVE MODELS: SHRINKAGE (MERMAS) PREDICTION CASE STUDY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import datetime as dt
from sqlalchemy import create_engine
import pymysql
from xgboost import XGBRegressor
print("*IMPLEMENTACIÓN DE MODELOS PREDICTIVOS. CASO PREDICTIVO DE MERMAS*")
# Connect to MySQL
try:
    engine = create_engine('mysql+pymysql://graciany:Informatica2022@localhost:3306/mermas_productos')
    print("Database connection successful.")
except Exception as e:
    print(f"Error connecting to the database: {e}")
# SQL query joining the fact table with its dimensions
query = """
SELECT
fm.unidad_merma,
fm.monto_merma,
dp.marca,
dp.linea,
dt.fecha,
dt.año,
dl.comuna,
dl.region,
dm.motivo,
dpr.pais_origen
FROM fact_Mermas fm
JOIN dim_Productos dp ON fm.producto_id = dp.id
JOIN dim_Tiempo dt ON fm.tiempo_id = dt.id
JOIN dim_Localizacion dl ON fm.localizacion_id = dl.id
JOIN dim_Motivos dm ON fm.motivo_id = dm.id
JOIN dim_Proveedor dpr ON fm.proveedor_id = dpr.id;
"""
# Load the data
data = pd.read_sql(query, engine)
# Keep only rows with reasonable shrinkage values
data = data[(data['unidad_merma'] >= 0.01) & (data['unidad_merma'] <= 110)]
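# Optional sanity check (a minimal addition that uses only the DataFrame loaded above):
# report the row count after filtering and the basic distribution of the target variable.
print(f"Rows after filtering: {len(data)}")
print(data['unidad_merma'].describe())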
# Selected features
features = ["marca", "linea", "fecha", "año", "comuna", "region", "motivo", "pais_origen"]
X = data[features]
y = data['unidad_merma']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Categorical and numeric variables
categorical_features = ['marca', 'linea', 'comuna', 'region', 'motivo', 'pais_origen']
numeric_features = []  # no numeric predictors are used in this case
# Preprocessor: scale numeric features and one-hot encode categorical features.
# Note: 'fecha' and 'año' appear in X but in neither list, so ColumnTransformer's
# default remainder='drop' silently discards them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
# Model 1: Linear Regression
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
# Model 2: Random Forest
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
# Model 3: XGBoost Regressor
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, random_state=42))
])
# Training
print("Training Linear Regression...")
pipeline_lr.fit(X_train, y_train)
print("Training Random Forest...")
pipeline_rf.fit(X_train, y_train)
print("Training XGBoost...")
pipeline_xgb.fit(X_train, y_train)
print("All three models trained successfully")
# Predictions
y_pred_lr = pipeline_lr.predict(X_test)
y_pred_rf = pipeline_rf.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)
# Evaluation metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_lr = np.sqrt(mse_lr)
rmse_rf = np.sqrt(mse_rf)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_lr = r2_score(y_test, y_pred_lr)
r2_rf = r2_score(y_test, y_pred_rf)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
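# Optional summary (built only from the metrics computed above): a compact
# side-by-side comparison of the three models.
metrics_summary = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'R2': [r2_lr, r2_rf, r2_xgb],
    'RMSE': [rmse_lr, rmse_rf, rmse_xgb],
    'MAE': [mae_lr, mae_rf, mae_xgb]
})
print(metrics_summary.to_string(index=False))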
# Results table: actual values, predictions, and errors for each model
results_df = pd.DataFrame({
    'Valor_Real': y_test,
    'Prediccion_LR': y_pred_lr,
    'Prediccion_RF': y_pred_rf,
    'Error_LR': y_test - y_pred_lr,
    'Error_RF': y_test - y_pred_rf,
    'Error_Porcentual_LR': ((y_test - y_pred_lr) / y_test) * 100,
    'Error_Porcentual_RF': ((y_test - y_pred_rf) / y_test) * 100
}).reset_index(drop=True)
results_df['Prediccion_XGB'] = y_pred_xgb
results_df['Error_XGB'] = y_test.values - y_pred_xgb
results_df['Error_Porcentual_XGB'] = ((y_test.values - y_pred_xgb) / y_test.values) * 100
# Attach the original feature values to each prediction row
X_test_reset = X_test.reset_index(drop=True)
for feature in X_test.columns:
    results_df[feature] = X_test_reset[feature]
results_df = results_df.sort_values('Valor_Real', ascending=False)
# Save markdown report: Linear Regression
with open('prediccion_lr.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: Regresión Lineal\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_lr:.4f}\n')
    f.write(f'- **RMSE**: {rmse_lr:.2f}\n')
    f.write(f'- **MAE**: {mae_lr:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_lr*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_LR']:.2f} | {row['Error_LR']:.2f} | {row['Error_Porcentual_LR']:.1f}% | {row['marca']} | {row['region']} |\n")
# Save markdown report: Random Forest
with open('prediccion_rf.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: Random Forest\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_rf:.4f}\n')
    f.write(f'- **RMSE**: {rmse_rf:.2f}\n')
    f.write(f'- **MAE**: {mae_rf:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_rf*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_RF']:.2f} | {row['Error_RF']:.2f} | {row['Error_Porcentual_RF']:.1f}% | {row['marca']} | {row['region']} |\n")
# Save markdown report: XGBoost
with open('prediccion_xgb.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: XGBoost\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_xgb:.4f}\n')
    f.write(f'- **RMSE**: {rmse_xgb:.2f}\n')
    f.write(f'- **MAE**: {mae_xgb:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_xgb*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_XGB']:.2f} | {row['Error_XGB']:.2f} | {row['Error_Porcentual_XGB']:.1f}% | {row['marca']} | {row['region']} |\n")
print("Resultados exportados correctamente a 'prediccion_lr.md' y 'prediccion_rf.md'")