from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score,
                             precision_score, recall_score, classification_report)
import lightgbm as lgb
import numpy as np
import joblib
import os

from data_processing import data_processing_pipeline, preprocess_data


class ModelTrainer:
    """Train, evaluate, persist, and compare binary-classification models.

    Trained estimators are kept in ``self.models`` and their evaluation
    metrics in ``self.metrics``, both keyed by model name ("logreg",
    "lightgbm"). Fitted models are also serialized to ``models/<name>_model.pkl``.
    """

    def __init__(self):
        self.models = {}   # model name -> fitted estimator
        self.metrics = {}  # model name -> dict of evaluation metrics
        # Make sure the directory for serialized models exists.
        os.makedirs("models", exist_ok=True)

    @staticmethod
    def _split(X, y):
        """Stratified 80/20 train/test split shared by both trainers.

        Fixed random_state keeps the split reproducible across runs.
        """
        return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    def train_logreg(self, X, y):
        """Grid-search a LogisticRegression (F1-scored, 5-fold CV), evaluate
        on the held-out split, persist the best estimator, and record metrics.

        Returns:
            (best_estimator, metrics_dict)
        """
        print("训练Logistic Regression模型...")
        X_train, X_test, y_train, y_test = self._split(X, y)

        # Hyperparameter grid: only C varies; solver/max_iter are pinned.
        param_grid = {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [1000],
            'solver': ['lbfgs'],
        }
        logreg = LogisticRegression(random_state=42)
        grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid,
                                   cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_logreg = grid_search.best_estimator_

        # Evaluate on the held-out test split.
        y_pred = best_logreg.predict(X_test)
        y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Persist and register the fitted model.
        joblib.dump(best_logreg, "models/logreg_model.pkl")
        self.models["logreg"] = best_logreg
        self.metrics["logreg"] = metrics

        print("Logistic Regression模型训练完成!")
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return best_logreg, metrics

    def train_lightgbm(self, X, y):
        """Fit a LightGBM classifier with fixed parameters (no grid search,
        to keep training fast), evaluate, persist, and record metrics.

        Returns:
            (fitted_estimator, metrics_dict)
        """
        print("\n训练LightGBM模型...")
        X_train, X_test, y_train, y_test = self._split(X, y)

        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1,
        }
        # Use the sklearn-compatible wrapper from the module-level import;
        # fit() returns the estimator itself.
        best_lgbm = lgb.LGBMClassifier(**params).fit(X_train, y_train)

        # Evaluate on the held-out test split.
        y_pred_proba = best_lgbm.predict_proba(X_test)[:, 1]
        y_pred = best_lgbm.predict(X_test)
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Persist and register the fitted model.
        joblib.dump(best_lgbm, "models/lightgbm_model.pkl")
        self.models["lightgbm"] = best_lgbm
        self.metrics["lightgbm"] = metrics

        print("LightGBM模型训练完成!")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return best_lgbm, metrics

    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Compute standard binary-classification metrics.

        Args:
            y_true: ground-truth labels.
            y_pred: hard class predictions.
            y_pred_proba: predicted probability of the positive class
                (used only for ROC-AUC).
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred_proba),
        }

    def compare_models(self):
        """Print a per-model metrics report and return the name of the
        model with the highest F1 score."""
        print("\n" + "="*50)
        print("模型性能对比")
        print("="*50)
        for model_name, metrics in self.metrics.items():
            print(f"\n{model_name.upper()} 性能:")
            print(f"  Accuracy:  {metrics['accuracy']:.4f}")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall:    {metrics['recall']:.4f}")
            print(f"  F1 Score:  {metrics['f1']:.4f}")
            print(f"  ROC-AUC:   {metrics['roc_auc']:.4f}")

        # Model selection criterion: best F1 on the held-out split.
        best_model = max(self.metrics.keys(), key=lambda x: self.metrics[x]['f1'])
        print(f"\n最佳模型: {best_model.upper()}")
        print(f"最佳F1分数: {self.metrics[best_model]['f1']:.4f}")
        return best_model

    def predict(self, model_name, X):
        """Predict with a trained model, lazily loading it from disk if it
        is not already in memory.

        Returns:
            (y_pred, y_pred_proba) — hard labels thresholded at 0.5 and
            positive-class probabilities.

        Raises:
            ValueError: if the model is neither in memory nor on disk.
        """
        if model_name not in self.models:
            # Fall back to the serialized copy written by the trainers.
            try:
                model = joblib.load(f"models/{model_name}_model.pkl")
                self.models[model_name] = model
            except FileNotFoundError:
                raise ValueError(f"Model {model_name} not found. Please train the model first.")

        model = self.models[model_name]
        y_pred_proba = model.predict_proba(X)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)
        return y_pred, y_pred_proba


if __name__ == "__main__":
    # 1. Data processing.
    print("正在处理数据...")
    X, y, df = data_processing_pipeline("data/Telco-Customer-Churn.csv")
    X_np, y_np = preprocess_data(X, y)

    # 2. Model training and evaluation.
    trainer = ModelTrainer()
    logreg_model, logreg_metrics = trainer.train_logreg(X_np, y_np)
    lgbm_model, lgbm_metrics = trainer.train_lightgbm(X_np, y_np)
    best_model = trainer.compare_models()

    # 3. Check whether the performance target is met
    #    (F1 >= 0.70 OR ROC-AUC >= 0.75).
    print("\n" + "="*50)
    print("模型性能要求检查")
    print("="*50)
    best_f1 = trainer.metrics[best_model]['f1']
    best_roc_auc = trainer.metrics[best_model]['roc_auc']

    if best_f1 >= 0.70 or best_roc_auc >= 0.75:
        print(f"✓ 模型性能达标!最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✓ 满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")
    else:
        print(f"✗ 模型性能未达标!最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✗ 未满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")