# IMPLEMENTATION OF PREDICTIVE MODELS: SHRINKAGE (MERMAS) PREDICTION CASE STUDY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import datetime as dt
from sqlalchemy import create_engine
import pymysql
from xgboost import XGBRegressor
print("*IMPLEMENTACIÓN DE MODELOS PREDICTIVOS. CASO PREDICTIVO DE MERMAS*")
# Connect to MySQL
try:
    engine = create_engine('mysql+pymysql://graciany:Informatica2022@localhost:3306/mermas_productos')
    print("Database connection successful.")
except Exception as e:
    print(f"Error connecting to the database: {e}")
# SQL query joining the fact table with its dimensions
query = """
SELECT
fm.unidad_merma,
fm.monto_merma,
dp.marca,
dp.linea,
dt.fecha,
dt.año,
dl.comuna,
dl.region,
dm.motivo,
dpr.pais_origen
FROM fact_Mermas fm
JOIN dim_Productos dp ON fm.producto_id = dp.id
JOIN dim_Tiempo dt ON fm.tiempo_id = dt.id
JOIN dim_Localizacion dl ON fm.localizacion_id = dl.id
JOIN dim_Motivos dm ON fm.motivo_id = dm.id
JOIN dim_Proveedor dpr ON fm.proveedor_id = dpr.id;
"""
# Load the data
data = pd.read_sql(query, engine)
# Keep only rows with reasonable shrinkage values
data = data[(data['unidad_merma'] >= 0.01) & (data['unidad_merma'] <= 110)]
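# Optional sanity check (a minimal addition that uses only the DataFrame loaded above):
# report the row count after filtering and the basic distribution of the target variable.
print(f"Rows after filtering: {len(data)}")
print(data['unidad_merma'].describe())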
# Selected features
features = ["marca", "linea", "fecha", "año", "comuna", "region", "motivo", "pais_origen"]
X = data[features]
y = data['unidad_merma']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Categorical and numeric variables
categorical_features = ['marca', 'linea', 'comuna', 'region', 'motivo', 'pais_origen']
numeric_features = []  # no numeric predictors are used in this case
# Preprocessor: scale numeric features and one-hot encode categorical features.
# Note: 'fecha' and 'año' appear in X but in neither list, so ColumnTransformer's
# default remainder='drop' silently discards them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
# Model 1: Linear Regression
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
# Model 2: Random Forest
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
# Model 3: XGBoost Regressor
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, random_state=42))
])
# Training
print("Training Linear Regression...")
pipeline_lr.fit(X_train, y_train)
print("Training Random Forest...")
pipeline_rf.fit(X_train, y_train)
print("Training XGBoost...")
pipeline_xgb.fit(X_train, y_train)
print("All three models trained successfully")
# Predictions
y_pred_lr = pipeline_lr.predict(X_test)
y_pred_rf = pipeline_rf.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)
# Evaluation metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_lr = np.sqrt(mse_lr)
rmse_rf = np.sqrt(mse_rf)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_lr = r2_score(y_test, y_pred_lr)
r2_rf = r2_score(y_test, y_pred_rf)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
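# Optional summary (built only from the metrics computed above): a compact
# side-by-side comparison of the three models.
metrics_summary = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'R2': [r2_lr, r2_rf, r2_xgb],
    'RMSE': [rmse_lr, rmse_rf, rmse_xgb],
    'MAE': [mae_lr, mae_rf, mae_xgb]
})
print(metrics_summary.to_string(index=False))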
# Results table: actual values, predictions, and errors for each model
results_df = pd.DataFrame({
    'Valor_Real': y_test,
    'Prediccion_LR': y_pred_lr,
    'Prediccion_RF': y_pred_rf,
    'Error_LR': y_test - y_pred_lr,
    'Error_RF': y_test - y_pred_rf,
    'Error_Porcentual_LR': ((y_test - y_pred_lr) / y_test) * 100,
    'Error_Porcentual_RF': ((y_test - y_pred_rf) / y_test) * 100
}).reset_index(drop=True)
results_df['Prediccion_XGB'] = y_pred_xgb
results_df['Error_XGB'] = y_test.values - y_pred_xgb
results_df['Error_Porcentual_XGB'] = ((y_test.values - y_pred_xgb) / y_test.values) * 100
# Attach the original feature values to each prediction row
X_test_reset = X_test.reset_index(drop=True)
for feature in X_test.columns:
    results_df[feature] = X_test_reset[feature]
results_df = results_df.sort_values('Valor_Real', ascending=False)
# Save markdown report: Linear Regression
with open('prediccion_lr.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: Regresión Lineal\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_lr:.4f}\n')
    f.write(f'- **RMSE**: {rmse_lr:.2f}\n')
    f.write(f'- **MAE**: {mae_lr:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_lr*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_LR']:.2f} | {row['Error_LR']:.2f} | {row['Error_Porcentual_LR']:.1f}% | {row['marca']} | {row['region']} |\n")
# Save markdown report: Random Forest
with open('prediccion_rf.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: Random Forest\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_rf:.4f}\n')
    f.write(f'- **RMSE**: {rmse_rf:.2f}\n')
    f.write(f'- **MAE**: {mae_rf:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_rf*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_RF']:.2f} | {row['Error_RF']:.2f} | {row['Error_Porcentual_RF']:.1f}% | {row['marca']} | {row['region']} |\n")
# Save markdown report: XGBoost
with open('prediccion_xgb.md', 'w', encoding='utf-8') as f:
    f.write('# Resultados de Predicción: XGBoost\n\n')
    f.write('## Resumen de Métricas\n\n')
    f.write(f'- **R²**: {r2_xgb:.4f}\n')
    f.write(f'- **RMSE**: {rmse_xgb:.2f}\n')
    f.write(f'- **MAE**: {mae_xgb:.2f}\n\n')
    f.write('## Interpretación\n\n')
    f.write(f'Este modelo explica aproximadamente el {r2_xgb*100:.1f}% de la variabilidad en las mermas.\n\n')
    f.write('## Muestra de Predicciones (Top 10)\n\n')
    f.write('| # | Valor Real | Predicción | Error | Error % | Marca | Región |\n')
    f.write('|---|------------|------------|-------|---------|--------|--------|\n')
    for i, row in results_df.head(10).iterrows():
        f.write(f"| {i} | {row['Valor_Real']:.2f} | {row['Prediccion_XGB']:.2f} | {row['Error_XGB']:.2f} | {row['Error_Porcentual_XGB']:.1f}% | {row['marca']} | {row['region']} |\n")
print("Resultados exportados correctamente a 'prediccion_lr.md' y 'prediccion_rf.md'")