From e5cfa721e853effa9954c5a776dbe4beda43e06f Mon Sep 17 00:00:00 2001 From: dfcarvajal Date: Tue, 6 Jan 2026 22:41:08 +0100 Subject: [PATCH] FT: Generar modelos con Gradient Boosting mejorando greedy --- src/config/models_config.py | 14 ++ src/core/model_evaluation.py | 271 +++++++++++++++++++++++++++++++++++ src/models_train.py | 195 +++++++++++++++++++++++-- 3 files changed, 469 insertions(+), 11 deletions(-) create mode 100644 src/core/model_evaluation.py diff --git a/src/config/models_config.py b/src/config/models_config.py index 80bda9a..80d10d4 100644 --- a/src/config/models_config.py +++ b/src/config/models_config.py @@ -42,6 +42,20 @@ class ModelConfig: "data_function": "fetch_data_legacy", "description": "Modelo legacy compatible", "required_params": ["hour", "dow", "total_events"] + }, + "demand_advanced": { + "module": "models_train", + "train_function": "train_demand_model_advanced", + "data_function": "fetch_data", + "description": "Modelo avanzado de demanda con feature engineering y Gradient Boosting optimizado", + "required_params": ["h3", "week", "dow", "hour"] + }, + "nulo_advanced": { + "module": "models_train", + "train_function": "train_nulo_model_advanced", + "data_function": "fetch_data", + "description": "Modelo avanzado de nulos con feature engineering y mejor manejo de desbalance", + "required_params": ["h3", "week", "dow", "hour"] } } } diff --git a/src/core/model_evaluation.py b/src/core/model_evaluation.py new file mode 100644 index 0000000..840d8e9 --- /dev/null +++ b/src/core/model_evaluation.py @@ -0,0 +1,271 @@ +""" +Módulo de evaluación de modelos de Machine Learning. + +Proporciona funciones para evaluar modelos de regresión y clasificación, +comparar múltiples modelos y realizar validación cruzada. 
def evaluate_regression_model(model, X_test, y_test, model_name: str = "Model") -> Dict[str, Any]:
    """
    Evaluate a regression model with multiple metrics.

    Args:
        model: Trained model exposing a ``predict()`` method.
        X_test: Test features.
        y_test: Test target values.
        model_name: Model name used in the report.

    Returns:
        Dictionary with ``mae``, ``mse``, ``rmse``, ``r2``, ``mape`` and
        ``n_samples``, or ``{"model_name", "error"}`` if evaluation fails.
    """
    try:
        # Normalize to numpy arrays so plain lists work with boolean masking.
        y_pred = np.asarray(model.predict(X_test))
        y_true = np.asarray(y_test)

        # Guard: metrics such as R^2 are undefined on an empty test set.
        if y_true.size == 0:
            return {"model_name": model_name, "error": "empty test set"}

        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, y_pred)

        # MAPE: skip zero targets to avoid division by zero.
        mask = y_true != 0
        mape = (np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
                if mask.any() else np.inf)

        return {
            "model_name": model_name,
            "mae": float(mae),
            "mse": float(mse),
            "rmse": float(rmse),
            "r2": float(r2),
            "mape": float(mape) if np.isfinite(mape) else None,
            "n_samples": int(y_true.shape[0]),
        }

    except Exception as e:
        # Module convention: never raise, return an error record instead.
        return {
            "model_name": model_name,
            "error": str(e)
        }
def evaluate_classification_model(model, X_test, y_test, model_name: str = "Model",
                                  average: str = 'binary') -> Dict[str, Any]:
    """
    Evaluate a classification model with multiple metrics.

    Args:
        model: Trained model exposing ``predict()`` (and optionally
            ``predict_proba()``).
        X_test: Test features.
        y_test: Test labels.
        model_name: Model name used in the report.
        average: Averaging mode for precision/recall/F1
            ('binary', 'macro', 'weighted').

    Returns:
        Dictionary with evaluation metrics (plus ``auc_roc`` and
        ``confusion_matrix`` when computable), or
        ``{"model_name", "error"}`` on failure.
    """
    try:
        y_pred = model.predict(X_test)

        results = {
            "model_name": model_name,
            "accuracy": float(accuracy_score(y_test, y_pred)),
            "precision": float(precision_score(y_test, y_pred, average=average, zero_division=0)),
            "recall": float(recall_score(y_test, y_pred, average=average, zero_division=0)),
            "f1_score": float(f1_score(y_test, y_pred, average=average, zero_division=0)),
            "n_samples": len(y_test),
        }

        # AUC-ROC requires probability estimates.
        if hasattr(model, 'predict_proba'):
            try:
                y_proba = model.predict_proba(X_test)
                if y_proba.shape[1] == 2:
                    # Binary case: score of the positive class.
                    results["auc_roc"] = float(roc_auc_score(y_test, y_proba[:, 1]))
                else:
                    # Generalization: one-vs-rest AUC for multiclass
                    # (previously the metric was silently skipped).
                    results["auc_roc"] = float(
                        roc_auc_score(y_test, y_proba, multi_class='ovr')
                    )
            except Exception as e:
                warnings.warn(f"No se pudo calcular AUC-ROC: {str(e)}")

        results["confusion_matrix"] = confusion_matrix(y_test, y_pred).tolist()

        return results

    except Exception as e:
        return {
            "model_name": model_name,
            "error": str(e)
        }
def compare_models(models: Dict[str, Any], X_test, y_test,
                   model_type: str = 'regression') -> pd.DataFrame:
    """
    Compare several trained models on the same test set.

    Args:
        models: Mapping ``{model_name: trained_model}``.
        X_test: Test features.
        y_test: Test targets/labels.
        model_type: 'regression' or 'classification'.

    Returns:
        DataFrame with one row per model, sorted by the primary metric
        (RMSE ascending for regression, F1 descending for classification).

    Raises:
        ValueError: If ``model_type`` is not 'regression' or 'classification'.
    """
    # Validate up front so an invalid model_type fails even with an empty dict.
    if model_type not in ('regression', 'classification'):
        raise ValueError(f"model_type debe ser 'regression' o 'classification', recibido: {model_type}")

    evaluate = (evaluate_regression_model if model_type == 'regression'
                else evaluate_classification_model)
    results = [evaluate(model, X_test, y_test, name) for name, model in models.items()]

    df = pd.DataFrame(results)

    # BUG FIX: when every model returned an error record the metric column
    # does not exist and sort_values raised KeyError; only sort if present.
    sort_col, ascending = (('rmse', True) if model_type == 'regression'
                           else ('f1_score', False))
    if sort_col in df.columns:
        df = df.sort_values(sort_col, ascending=ascending)

    return df
def cross_validate_model(model, X, y, cv: int = 5,
                         model_type: str = 'regression') -> Dict[str, Any]:
    """
    Run k-fold cross-validation with multiple metrics.

    Args:
        model: Estimator to evaluate.
        X: Full feature matrix.
        y: Full targets/labels.
        cv: Number of cross-validation folds.
        model_type: 'regression' or 'classification'.

    Returns:
        Dictionary with ``<metric>_mean`` / ``<metric>_std`` /
        ``<metric>_scores`` per metric, or an ``error`` entry on failure.

    Raises:
        ValueError: If ``model_type`` is not 'regression' or 'classification'.
    """
    if model_type == 'regression':
        scoring = {
            'mae': 'neg_mean_absolute_error',
            'mse': 'neg_mean_squared_error',
            'r2': 'r2'
        }
    elif model_type == 'classification':
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1': 'f1',
            'roc_auc': 'roc_auc'
        }
    else:
        raise ValueError(f"model_type debe ser 'regression' o 'classification'")

    try:
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring,
                                    return_train_score=True, n_jobs=-1)

        results = {
            'cv_folds': cv,
            'n_samples': len(y)
        }

        for metric_name, scores in cv_results.items():
            if not metric_name.startswith('test_'):
                continue
            metric = metric_name.replace('test_', '')
            # sklearn reports error metrics as negated scores; flip the sign.
            if metric in ('mae', 'mse'):
                scores = -scores

            results[f'{metric}_mean'] = float(np.mean(scores))
            results[f'{metric}_std'] = float(np.std(scores))
            results[f'{metric}_scores'] = scores.tolist()

        # BUG FIX: take the square root PER FOLD, then aggregate.
        # The previous sqrt(std(MSE)) is not the std of the RMSE scores.
        if model_type == 'regression' and 'mse_scores' in results:
            rmse_scores = np.sqrt(np.asarray(results['mse_scores']))
            results['rmse_mean'] = float(np.mean(rmse_scores))
            results['rmse_std'] = float(np.std(rmse_scores))
            results['rmse_scores'] = rmse_scores.tolist()

        return results

    except Exception as e:
        return {
            'error': str(e),
            'cv_folds': cv,
            'n_samples': len(y)
        }
def print_evaluation_report(metrics: Dict[str, Any], model_type: str = 'regression'):
    """
    Print a formatted report of evaluation metrics.

    Args:
        metrics: Dictionary produced by the ``evaluate_*`` functions.
        model_type: 'regression' or 'classification'.
    """
    def _fmt(key: str, spec: str) -> str:
        # BUG FIX: the previous version applied a float format spec to the
        # 'N/A' fallback string, raising TypeError whenever a key was missing.
        value = metrics.get(key)
        return format(value, spec) if isinstance(value, (int, float)) else "N/A"

    print("=" * 60)
    print(f"Reporte de Evaluación: {metrics.get('model_name', 'Modelo')}")
    print("=" * 60)

    if 'error' in metrics:
        print(f"ERROR: {metrics['error']}")
        return

    print(f"Muestras de prueba: {metrics.get('n_samples', 'N/A')}")
    print("-" * 60)

    if model_type == 'regression':
        print(f"MAE (Error Absoluto Medio): {_fmt('mae', '.4f')}")
        print(f"RMSE (Raíz del Error Cuadrático): {_fmt('rmse', '.4f')}")
        print(f"R² (Coeficiente de Determinación): {_fmt('r2', '.4f')}")
        if metrics.get('mape') is not None:
            print(f"MAPE (Error Porcentual Medio): {_fmt('mape', '.2f')}%")

    elif model_type == 'classification':
        print(f"Exactitud (Accuracy): {_fmt('accuracy', '.4f')}")
        print(f"Precisión (Precision): {_fmt('precision', '.4f')}")
        print(f"Recall: {_fmt('recall', '.4f')}")
        print(f"F1-Score: {_fmt('f1_score', '.4f')}")
        if 'auc_roc' in metrics:
            print(f"AUC-ROC: {_fmt('auc_roc', '.4f')}")

        if 'confusion_matrix' in metrics:
            print("\nMatriz de Confusión:")
            cm = np.array(metrics['confusion_matrix'])
            print(cm)

    print("=" * 60)
def train_demand_model(df):
    """
    Train the demand model with tuned XGBoost.

    Uses a train/validation split with early stopping and regularized
    hyperparameters to curb overfitting.

    Args:
        df: DataFrame with columns 'h3' (hex string), 'week', 'dow',
            'hour' and target 'total_events'.

    Returns:
        Fitted ``xgb.XGBRegressor``.
    """
    from sklearn.model_selection import train_test_split

    # H3 cell ids are hex strings; encode them as integers for XGBoost.
    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))
    X = df[["h3_int", "week", "dow", "hour"]]
    y = df["total_events"]

    # Hold out a validation split to drive early stopping.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = xgb.XGBRegressor(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,                 # minimum split gain (regularization)
        min_child_weight=3,        # guards against overfitting
        reg_alpha=0.1,             # L1 regularization
        reg_lambda=1.0,            # L2 regularization
        objective="reg:squarederror",
        # BUG FIX: since XGBoost 2.0, early_stopping_rounds must be a
        # constructor parameter; passing it to fit() raises TypeError.
        early_stopping_rounds=50,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    return model
def train_nulo_model(df):
    """
    Train the null-rate classifier with tuned XGBoost.

    Predicts whether a cell/hour combination has any nulls
    (``nulo_rate > 0``). Handles class imbalance via ``scale_pos_weight``
    and uses early stopping on a stratified validation split.

    Args:
        df: DataFrame with columns 'h3', 'week', 'dow', 'hour' and 'nulo_rate'.

    Returns:
        Fitted ``xgb.XGBClassifier``.
    """
    from sklearn.model_selection import train_test_split

    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))
    X = df[["h3_int", "week", "dow", "hour"]]
    y = (df["nulo_rate"] > 0).astype(int)

    # Stratify so both splits keep the class ratio.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Weight positives by the negative/positive ratio; guard against a
    # train split with no positives (previously a ZeroDivisionError).
    n_pos = int((y_train == 1).sum())
    scale_pos_weight = (y_train == 0).sum() / n_pos if n_pos else 1.0

    model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        min_child_weight=3,
        reg_alpha=0.1,               # L1 regularization
        reg_lambda=1.0,              # L2 regularization
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        # BUG FIX: early_stopping_rounds is a constructor parameter since
        # XGBoost 2.0; the removed use_label_encoder flag is dropped.
        early_stopping_rounds=30,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    return model
def train_demand_model_advanced(df):
    """
    Advanced demand model: feature engineering plus tuned gradient boosting.

    Adds cyclical encodings of hour/day-of-week and weekend/rush-hour flags
    on top of the base features, then trains a heavily regularized XGBoost
    regressor with early stopping.

    Args:
        df: DataFrame with columns 'h3', 'week', 'dow', 'hour' and 'total_events'.

    Returns:
        Fitted ``xgb.XGBRegressor``.
    """
    from sklearn.model_selection import train_test_split
    import numpy as np

    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))

    # Feature engineering: cyclical time encodings + calendar flags.
    df_features = df.copy()
    df_features['hour_sin'] = np.sin(2 * np.pi * df_features['hour'] / 24)
    df_features['hour_cos'] = np.cos(2 * np.pi * df_features['hour'] / 24)
    df_features['dow_sin'] = np.sin(2 * np.pi * df_features['dow'] / 7)
    df_features['dow_cos'] = np.cos(2 * np.pi * df_features['dow'] / 7)
    df_features['is_weekend'] = (df_features['dow'] >= 5).astype(int)
    # Morning (7-9) or evening (17-19) rush hours.
    df_features['is_rush_hour'] = ((df_features['hour'] >= 7) & (df_features['hour'] <= 9) |
                                   (df_features['hour'] >= 17) & (df_features['hour'] <= 19)).astype(int)

    X = df_features[["h3_int", "week", "dow", "hour",
                     "hour_sin", "hour_cos", "dow_sin", "dow_cos",
                     "is_weekend", "is_rush_hour"]]
    y = df_features["total_events"]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = xgb.XGBRegressor(
        n_estimators=1000,
        max_depth=10,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        gamma=0.2,
        min_child_weight=5,
        reg_alpha=0.3,             # stronger L1 regularization
        reg_lambda=1.5,            # stronger L2 regularization
        objective="reg:squarederror",
        tree_method='hist',        # histogram-based, faster on tabular data
        # BUG FIX: since XGBoost 2.0, early_stopping_rounds must be passed
        # to the constructor, not to fit().
        early_stopping_rounds=100,
        random_state=42,
        n_jobs=-1,
    )

    # Early stopping monitors the LAST eval set (the validation split).
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=False,
    )

    return model
def train_nulo_model_advanced(df):
    """
    Advanced null-rate classifier: feature engineering plus tuned boosting.

    Adds cyclical encodings of hour/day-of-week and weekend/rush-hour flags,
    balances classes via ``scale_pos_weight`` and trains a regularized
    XGBoost classifier with early stopping on a stratified split.

    Args:
        df: DataFrame with columns 'h3', 'week', 'dow', 'hour' and 'nulo_rate'.

    Returns:
        Fitted ``xgb.XGBClassifier``.
    """
    from sklearn.model_selection import train_test_split
    import numpy as np

    df["h3_int"] = df["h3"].apply(lambda x: int(x, 16))

    # Feature engineering: cyclical time encodings + calendar flags.
    df_features = df.copy()
    df_features['hour_sin'] = np.sin(2 * np.pi * df_features['hour'] / 24)
    df_features['hour_cos'] = np.cos(2 * np.pi * df_features['hour'] / 24)
    df_features['dow_sin'] = np.sin(2 * np.pi * df_features['dow'] / 7)
    df_features['dow_cos'] = np.cos(2 * np.pi * df_features['dow'] / 7)
    df_features['is_weekend'] = (df_features['dow'] >= 5).astype(int)
    # Morning (7-9) or evening (17-19) rush hours.
    df_features['is_rush_hour'] = ((df_features['hour'] >= 7) & (df_features['hour'] <= 9) |
                                   (df_features['hour'] >= 17) & (df_features['hour'] <= 19)).astype(int)

    X = df_features[["h3_int", "week", "dow", "hour",
                     "hour_sin", "hour_cos", "dow_sin", "dow_cos",
                     "is_weekend", "is_rush_hour"]]
    y = (df_features["nulo_rate"] > 0).astype(int)

    # Stratify so both splits keep the class ratio.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Weight positives by the negative/positive ratio; guard against a
    # train split with no positives (previously a ZeroDivisionError).
    n_pos = int((y_train == 1).sum())
    scale_pos_weight = (y_train == 0).sum() / n_pos if n_pos else 1.0

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        gamma=0.2,
        min_child_weight=5,
        reg_alpha=0.3,
        reg_lambda=1.5,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        tree_method='hist',
        # BUG FIX: early_stopping_rounds is a constructor parameter since
        # XGBoost 2.0; the removed use_label_encoder flag is dropped.
        early_stopping_rounds=50,
        random_state=42,
        n_jobs=-1,
    )

    # Early stopping monitors the LAST eval set (the validation split).
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=False,
    )

    return model