FT: Generar modelos con Gradient Boosting mejorando greedy

This commit is contained in:
2026-01-06 22:41:08 +01:00
parent 7eae70a59e
commit e5cfa721e8
3 changed files with 469 additions and 11 deletions

View File

@@ -42,6 +42,20 @@ class ModelConfig:
"data_function": "fetch_data_legacy",
"description": "Modelo legacy compatible",
"required_params": ["hour", "dow", "total_events"]
},
"demand_advanced": {
"module": "models_train",
"train_function": "train_demand_model_advanced",
"data_function": "fetch_data",
"description": "Modelo avanzado de demanda con feature engineering y Gradient Boosting optimizado",
"required_params": ["h3", "week", "dow", "hour"]
},
"nulo_advanced": {
"module": "models_train",
"train_function": "train_nulo_model_advanced",
"data_function": "fetch_data",
"description": "Modelo avanzado de nulos con feature engineering y mejor manejo de desbalance",
"required_params": ["h3", "week", "dow", "hour"]
}
}
}

View File

@@ -0,0 +1,271 @@
"""
Módulo de evaluación de modelos de Machine Learning.
Proporciona funciones para evaluar modelos de regresión y clasificación,
comparar múltiples modelos y realizar validación cruzada.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Any, Tuple
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
r2_score,
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
confusion_matrix,
classification_report
)
from sklearn.model_selection import cross_val_score, cross_validate
import warnings
def evaluate_regression_model(model, X_test, y_test, model_name: str = "Model") -> Dict[str, Any]:
    """
    Evaluate a regression model with several standard metrics.

    Args:
        model: Trained model exposing predict()
        X_test: Test features
        y_test: Test target values
        model_name: Name used in the report

    Returns:
        Dict with MAE, MSE, RMSE, R², MAPE and sample count, or a dict
        with "model_name" and "error" keys if evaluation fails.
    """
    try:
        predictions = model.predict(X_test)

        mae_val = mean_absolute_error(y_test, predictions)
        mse_val = mean_squared_error(y_test, predictions)

        # MAPE: only over non-zero targets to avoid division by zero
        nonzero = y_test != 0
        if nonzero.any():
            mape_val = np.mean(np.abs((y_test[nonzero] - predictions[nonzero]) / y_test[nonzero])) * 100
        else:
            mape_val = np.inf

        return {
            "model_name": model_name,
            "mae": float(mae_val),
            "mse": float(mse_val),
            "rmse": float(np.sqrt(mse_val)),
            "r2": float(r2_score(y_test, predictions)),
            # An all-zero target yields no MAPE; report None instead of inf
            "mape": float(mape_val) if mape_val != np.inf else None,
            "n_samples": len(y_test),
        }
    except Exception as e:
        return {
            "model_name": model_name,
            "error": str(e),
        }
def evaluate_classification_model(model, X_test, y_test, model_name: str = "Model",
                                  average: str = 'binary') -> Dict[str, Any]:
    """
    Evaluate a classification model with several standard metrics.

    Args:
        model: Trained model exposing predict() (and optionally predict_proba())
        X_test: Test features
        y_test: Test labels
        model_name: Name used in the report
        average: Averaging strategy for the metrics ('binary', 'macro', 'weighted')

    Returns:
        Dict with accuracy/precision/recall/F1, confusion matrix, and
        AUC-ROC for binary problems; on failure, a dict with "model_name"
        and "error" keys.
    """
    try:
        predicted = model.predict(X_test)

        results = {
            "model_name": model_name,
            "accuracy": float(accuracy_score(y_test, predicted)),
            "precision": float(precision_score(y_test, predicted, average=average, zero_division=0)),
            "recall": float(recall_score(y_test, predicted, average=average, zero_division=0)),
            "f1_score": float(f1_score(y_test, predicted, average=average, zero_division=0)),
            "n_samples": len(y_test),
        }

        # AUC-ROC is only computed for binary problems with probabilities
        if hasattr(model, 'predict_proba'):
            try:
                probabilities = model.predict_proba(X_test)
                if probabilities.shape[1] == 2:
                    results["auc_roc"] = float(roc_auc_score(y_test, probabilities[:, 1]))
            except Exception as e:
                warnings.warn(f"No se pudo calcular AUC-ROC: {str(e)}")

        results["confusion_matrix"] = confusion_matrix(y_test, predicted).tolist()
        return results
    except Exception as e:
        return {
            "model_name": model_name,
            "error": str(e),
        }
def compare_models(models: Dict[str, Any], X_test, y_test,
                   model_type: str = 'regression') -> pd.DataFrame:
    """
    Compare multiple models and build a report.

    Args:
        models: Dict {model_name: trained_model}
        X_test: Test features
        y_test: Test targets/labels
        model_type: 'regression' or 'classification'

    Returns:
        DataFrame with one row per model, sorted by the main metric
        (RMSE ascending for regression, F1 descending for classification).

    Raises:
        ValueError: If model_type is not 'regression' or 'classification'.
    """
    # Validate up front so an empty models dict still surfaces a bad type
    if model_type not in ('regression', 'classification'):
        raise ValueError(f"model_type debe ser 'regression' o 'classification', recibido: {model_type}")

    results = []
    for name, model in models.items():
        if model_type == 'regression':
            metrics = evaluate_regression_model(model, X_test, y_test, name)
        else:
            metrics = evaluate_classification_model(model, X_test, y_test, name)
        results.append(metrics)

    df = pd.DataFrame(results)

    # Sort by the main metric. The column can be absent when every model
    # failed to evaluate (rows then only hold "model_name"/"error") or
    # when models is empty — sorting would raise KeyError in those cases.
    sort_col, ascending = ('rmse', True) if model_type == 'regression' else ('f1_score', False)
    if sort_col in df.columns:
        df = df.sort_values(sort_col, ascending=ascending)
    return df
def cross_validate_model(model, X, y, cv: int = 5,
                         model_type: str = 'regression') -> Dict[str, Any]:
    """
    Run cross-validation with multiple metrics.

    Args:
        model: Model to evaluate
        X: Full feature matrix
        y: Full targets/labels
        cv: Number of cross-validation folds
        model_type: 'regression' or 'classification'

    Returns:
        Dict with per-metric mean, std and per-fold scores; on failure,
        a dict with "error", "cv_folds" and "n_samples".

    Raises:
        ValueError: If model_type is not 'regression' or 'classification'.
    """
    if model_type == 'regression':
        scoring = {
            'mae': 'neg_mean_absolute_error',
            'mse': 'neg_mean_squared_error',
            'r2': 'r2'
        }
    elif model_type == 'classification':
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1': 'f1',
            'roc_auc': 'roc_auc'
        }
    else:
        raise ValueError("model_type debe ser 'regression' o 'classification'")
    try:
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring,
                                    return_train_score=True, n_jobs=-1)
        results = {
            'cv_folds': cv,
            'n_samples': len(y)
        }
        # Summarize each test-fold metric
        for metric_name, scores in cv_results.items():
            if metric_name.startswith('test_'):
                metric = metric_name.replace('test_', '')
                # sklearn reports error metrics negated; flip sign back
                if metric in ['mae', 'mse']:
                    scores = -scores
                results[f'{metric}_mean'] = float(np.mean(scores))
                results[f'{metric}_std'] = float(np.std(scores))
                results[f'{metric}_scores'] = scores.tolist()
        # Derive RMSE from the per-fold MSE scores. Note: sqrt(mean(mse))
        # and sqrt(std(mse)) are NOT the mean/std of RMSE — take the
        # square root fold-by-fold first, then aggregate.
        if model_type == 'regression' and 'mse_scores' in results:
            rmse_scores = np.sqrt(np.asarray(results['mse_scores']))
            results['rmse_mean'] = float(np.mean(rmse_scores))
            results['rmse_std'] = float(np.std(rmse_scores))
            results['rmse_scores'] = rmse_scores.tolist()
        return results
    except Exception as e:
        return {
            'error': str(e),
            'cv_folds': cv,
            'n_samples': len(y)
        }
def print_evaluation_report(metrics: Dict[str, Any], model_type: str = 'regression'):
    """
    Print a formatted report of evaluation metrics.

    Args:
        metrics: Dict of evaluation metrics as produced by the
            evaluate_* functions; may contain an "error" key.
        model_type: 'regression' or 'classification'
    """
    def _fmt(key: str, spec: str) -> str:
        # Format a numeric metric, or print "N/A" when it is missing.
        # (Applying ':.4f' directly to the 'N/A' fallback raises ValueError.)
        value = metrics.get(key)
        return f"{value:{spec}}" if isinstance(value, (int, float)) else "N/A"

    print("=" * 60)
    print(f"Reporte de Evaluación: {metrics.get('model_name', 'Modelo')}")
    print("=" * 60)
    if 'error' in metrics:
        print(f"ERROR: {metrics['error']}")
        return
    print(f"Muestras de prueba: {metrics.get('n_samples', 'N/A')}")
    print("-" * 60)
    if model_type == 'regression':
        print(f"MAE (Error Absoluto Medio): {_fmt('mae', '.4f')}")
        print(f"RMSE (Raíz del Error Cuadrático): {_fmt('rmse', '.4f')}")
        print(f"R² (Coeficiente de Determinación): {_fmt('r2', '.4f')}")
        if metrics.get('mape') is not None:
            print(f"MAPE (Error Porcentual Medio): {_fmt('mape', '.2f')}%")
    elif model_type == 'classification':
        print(f"Exactitud (Accuracy): {_fmt('accuracy', '.4f')}")
        print(f"Precisión (Precision): {_fmt('precision', '.4f')}")
        print(f"Recall: {_fmt('recall', '.4f')}")
        print(f"F1-Score: {_fmt('f1_score', '.4f')}")
        if 'auc_roc' in metrics:
            print(f"AUC-ROC: {_fmt('auc_roc', '.4f')}")
        if 'confusion_matrix' in metrics:
            print("\nMatriz de Confusión:")
            cm = np.array(metrics['confusion_matrix'])
            print(cm)
    print("=" * 60)

View File

@@ -1,35 +1,91 @@
import xgboost as xgb
def train_demand_model(df):
    """
    Train the demand model with improved XGBoost settings.

    Uses a train/validation split with early stopping and regularized
    hyperparameters.

    (Reconstructed: the diff view had lost its +/- markers, leaving old
    and new constructor/fit lines interleaved and syntactically invalid.)

    Args:
        df: DataFrame with columns "h3" (hex string), "week", "dow",
            "hour" and target "total_events".

    Returns:
        Fitted xgb.XGBRegressor.
    """
    from sklearn.model_selection import train_test_split

    # Work on a copy so the caller's DataFrame is not mutated
    df = df.copy()
    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))
    X = df[["h3_int", "week", "dow", "hour"]]
    y = df["total_events"]

    # Hold out a validation set for early stopping to monitor
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = xgb.XGBRegressor(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,            # minimum split-gain regularization
        min_child_weight=3,   # guards against overfitting on tiny leaves
        reg_alpha=0.1,        # L1 regularization
        reg_lambda=1.0,       # L2 regularization
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    )
    # NOTE(review): early_stopping_rounds as a fit() kwarg was removed in
    # xgboost 2.0 (moved to the estimator constructor) — confirm the
    # installed xgboost version supports this call.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=False,
    )
    return model
def train_nulo_model(df):
    """
    Train the null-rate classifier with improved XGBoost settings.

    Handles class imbalance via scale_pos_weight and uses a stratified
    train/validation split with early stopping.

    (Reconstructed: the diff view had lost its +/- markers, leaving old
    and new hyperparameter/fit lines interleaved and syntactically invalid.)

    Args:
        df: DataFrame with columns "h3" (hex string), "week", "dow",
            "hour" and "nulo_rate".

    Returns:
        Fitted xgb.XGBClassifier predicting whether nulo_rate > 0.
    """
    from sklearn.model_selection import train_test_split

    # Work on a copy so the caller's DataFrame is not mutated
    df = df.copy()
    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))
    X = df[["h3_int", "week", "dow", "hour"]]
    y = (df["nulo_rate"] > 0).astype(int)

    # Stratified split keeps the class ratio in both partitions
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Weight positives to compensate for class imbalance; fall back to 1.0
    # when the training split has no positive samples (avoids ZeroDivisionError)
    positives = int((y_train == 1).sum())
    scale_pos_weight = (y_train == 0).sum() / positives if positives else 1.0

    model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        min_child_weight=3,
        reg_alpha=0.1,        # L1 regularization
        reg_lambda=1.0,       # L2 regularization
        scale_pos_weight=scale_pos_weight,  # class balancing
        eval_metric="logloss",
        use_label_encoder=False,  # NOTE(review): removed in xgboost >= 2.0 — confirm version
        random_state=42,
        n_jobs=-1,
    )
    # NOTE(review): early_stopping_rounds as a fit() kwarg was removed in
    # xgboost 2.0 — confirm the installed xgboost version supports this call.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=30,
        verbose=False,
    )
    return model
def train_legacy_model(df):
@@ -94,4 +150,121 @@ def train_custom_nulo_model(df):
)
model.fit(X, y)
return model
def train_demand_model_advanced(df):
    """
    Advanced demand model with feature engineering and tuned Gradient Boosting.

    Adds cyclic encodings of hour/day-of-week plus weekend and rush-hour
    indicators, then trains a regularized XGBoost regressor with early
    stopping on a held-out validation split.

    Args:
        df: DataFrame with columns "h3" (hex string), "week", "dow",
            "hour" and target "total_events".

    Returns:
        Fitted xgb.XGBRegressor.
    """
    from sklearn.model_selection import train_test_split
    import numpy as np

    # Copy FIRST: the original assigned "h3_int" into df before copying,
    # mutating the caller's DataFrame as a side effect.
    feats = df.copy()
    feats["h3_int"] = feats["h3"].apply(lambda x: int(x, 16))

    # Cyclic encoding keeps hour 23 adjacent to hour 0 (same idea for dow)
    feats['hour_sin'] = np.sin(2 * np.pi * feats['hour'] / 24)
    feats['hour_cos'] = np.cos(2 * np.pi * feats['hour'] / 24)
    feats['dow_sin'] = np.sin(2 * np.pi * feats['dow'] / 7)
    feats['dow_cos'] = np.cos(2 * np.pi * feats['dow'] / 7)
    feats['is_weekend'] = (feats['dow'] >= 5).astype(int)
    feats['is_rush_hour'] = ((feats['hour'] >= 7) & (feats['hour'] <= 9) |
                             (feats['hour'] >= 17) & (feats['hour'] <= 19)).astype(int)

    X = feats[["h3_int", "week", "dow", "hour",
               "hour_sin", "hour_cos", "dow_sin", "dow_cos",
               "is_weekend", "is_rush_hour"]]
    y = feats["total_events"]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = xgb.XGBRegressor(
        n_estimators=1000,
        max_depth=10,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        gamma=0.2,
        min_child_weight=5,
        reg_alpha=0.3,        # stronger L1 regularization
        reg_lambda=1.5,       # stronger L2 regularization
        objective="reg:squarederror",
        tree_method='hist',   # histogram-based split finding (faster)
        random_state=42,
        n_jobs=-1,
    )
    # NOTE(review): early_stopping_rounds as a fit() kwarg was removed in
    # xgboost 2.0 — confirm the installed xgboost version supports this call.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        early_stopping_rounds=100,
        verbose=False,
    )
    return model
def train_nulo_model_advanced(df):
    """
    Advanced null-rate classifier with feature engineering and tuned boosting.

    Adds cyclic encodings of hour/day-of-week plus weekend and rush-hour
    indicators, balances classes via scale_pos_weight, and trains a
    regularized XGBoost classifier with early stopping on a stratified
    validation split.

    Args:
        df: DataFrame with columns "h3" (hex string), "week", "dow",
            "hour" and "nulo_rate".

    Returns:
        Fitted xgb.XGBClassifier predicting whether nulo_rate > 0.
    """
    from sklearn.model_selection import train_test_split
    import numpy as np

    # Copy FIRST: the original assigned "h3_int" into df before copying,
    # mutating the caller's DataFrame as a side effect.
    feats = df.copy()
    feats["h3_int"] = feats["h3"].apply(lambda x: int(x, 16))

    # Cyclic encoding keeps hour 23 adjacent to hour 0 (same idea for dow)
    feats['hour_sin'] = np.sin(2 * np.pi * feats['hour'] / 24)
    feats['hour_cos'] = np.cos(2 * np.pi * feats['hour'] / 24)
    feats['dow_sin'] = np.sin(2 * np.pi * feats['dow'] / 7)
    feats['dow_cos'] = np.cos(2 * np.pi * feats['dow'] / 7)
    feats['is_weekend'] = (feats['dow'] >= 5).astype(int)
    feats['is_rush_hour'] = ((feats['hour'] >= 7) & (feats['hour'] <= 9) |
                             (feats['hour'] >= 17) & (feats['hour'] <= 19)).astype(int)

    X = feats[["h3_int", "week", "dow", "hour",
               "hour_sin", "hour_cos", "dow_sin", "dow_cos",
               "is_weekend", "is_rush_hour"]]
    y = (feats["nulo_rate"] > 0).astype(int)

    # Stratified split keeps the class ratio in both partitions
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Weight positives to compensate for class imbalance; fall back to 1.0
    # when the training split has no positive samples (avoids ZeroDivisionError)
    positives = int((y_train == 1).sum())
    scale_pos_weight = (y_train == 0).sum() / positives if positives else 1.0

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        gamma=0.2,
        min_child_weight=5,
        reg_alpha=0.3,
        reg_lambda=1.5,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        use_label_encoder=False,  # NOTE(review): removed in xgboost >= 2.0 — confirm version
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
    )
    # NOTE(review): early_stopping_rounds as a fit() kwarg was removed in
    # xgboost 2.0 — confirm the installed xgboost version supports this call.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        early_stopping_rounds=50,
        verbose=False,
    )
    return model