akaAKR47/machine_learning.py
akr f47c7d7196 feat: 实现客户流失预测与行动建议闭环系统
添加完整的客户流失预测系统,包括数据处理、模型训练、预测和行动建议功能。主要包含以下模块:
1. 数据预处理流水线(Polars + Pandera)
2. 机器学习模型训练(LightGBM + Logistic Regression)
3. AI Agent预测和建议工具
4. Streamlit交互式Web界面
5. 完整的课程设计报告文档
2026-01-15 15:19:07 +08:00

189 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score,
precision_score, recall_score, classification_report)
import lightgbm as lgb
import numpy as np
import joblib
import os
from data_processing import data_processing_pipeline, preprocess_data
# Model training and evaluation class.
class ModelTrainer:
    """Train, evaluate, compare, and serve churn-prediction models.

    Fitted estimators are cached in ``self.models`` keyed by a short name
    ("logreg", "lightgbm") and persisted with joblib to
    ``models/<name>_model.pkl``.  Their evaluation metrics (accuracy,
    precision, recall, f1, roc_auc) are kept in ``self.metrics`` under the
    same keys.
    """

    def __init__(self):
        # name -> fitted estimator
        self.models = {}
        # name -> metrics dict produced by calculate_metrics()
        self.metrics = {}
        # Make sure the output directory exists before any joblib.dump.
        os.makedirs("models", exist_ok=True)

    def train_logreg(self, X, y):
        """Grid-search a LogisticRegression on a stratified 80/20 split.

        Parameters
        ----------
        X, y : array-like
            Feature matrix and binary labels.

        Returns
        -------
        tuple
            ``(best_estimator, metrics_dict)``; the estimator is also
            cached in ``self.models["logreg"]`` and saved to disk.
        """
        print("训练Logistic Regression模型...")
        # Stratified split keeps the churn/no-churn ratio in both halves.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # Hyper-parameter grid: only C varies; solver/max_iter are fixed.
        param_grid = {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [1000],
            'solver': ['lbfgs'],
        }
        # 5-fold CV tuned on F1 (the course's primary target metric).
        logreg = LogisticRegression(random_state=42)
        grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid,
                                   cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_logreg = grid_search.best_estimator_
        # Evaluate on the held-out split.
        y_pred = best_logreg.predict(X_test)
        y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)
        # Persist and register the tuned model.
        joblib.dump(best_logreg, "models/logreg_model.pkl")
        self.models["logreg"] = best_logreg
        self.metrics["logreg"] = metrics
        print("Logistic Regression模型训练完成")
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return best_logreg, metrics

    def train_lightgbm(self, X, y):
        """Train a LightGBM classifier on a stratified 80/20 split.

        No hyper-parameter search is performed (deliberately simplified
        versus :meth:`train_logreg`).

        Returns
        -------
        tuple
            ``(fitted_estimator, metrics_dict)``; the estimator is also
            cached in ``self.models["lightgbm"]`` and saved to disk.
        """
        print("\n训练LightGBM模型...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # sklearn-compatible LightGBM wrapper (local import mirrors the
        # original code; the top-level `import lightgbm as lgb` stays).
        from lightgbm import LGBMClassifier
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1,
        }
        # fit() returns self, so one variable is enough — the original's
        # best_lgbm/lgbm pair referred to the same object.
        lgbm = LGBMClassifier(**params).fit(X_train, y_train)
        # Evaluate on the held-out split.
        y_pred_proba = lgbm.predict_proba(X_test)[:, 1]
        y_pred = lgbm.predict(X_test)
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)
        # Persist and register the model.
        joblib.dump(lgbm, "models/lightgbm_model.pkl")
        self.models["lightgbm"] = lgbm
        self.metrics["lightgbm"] = metrics
        print("LightGBM模型训练完成")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return lgbm, metrics

    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Return the standard binary-classification metric dict.

        ``roc_auc`` is computed from probabilities; the rest from the
        hard predictions.
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred_proba),
        }

    def compare_models(self):
        """Print a metric table for every trained model and return the
        name of the best one (highest F1).

        Raises
        ------
        ValueError
            If no model has been trained yet (clearer than the bare
            ``max()`` error the empty dict would otherwise produce).
        """
        if not self.metrics:
            raise ValueError("No trained models to compare; train a model first.")
        print("\n" + "="*50)
        print("模型性能对比")
        print("="*50)
        for model_name, metrics in self.metrics.items():
            print(f"\n{model_name.upper()} 性能:")
            print(f" Accuracy: {metrics['accuracy']:.4f}")
            print(f" Precision: {metrics['precision']:.4f}")
            print(f" Recall: {metrics['recall']:.4f}")
            print(f" F1 Score: {metrics['f1']:.4f}")
            print(f" ROC-AUC: {metrics['roc_auc']:.4f}")
        # Pick the winner by F1 score.
        best_model = max(self.metrics.keys(), key=lambda x: self.metrics[x]['f1'])
        print(f"\n最佳模型: {best_model.upper()}")
        print(f"最佳F1分数: {self.metrics[best_model]['f1']:.4f}")
        return best_model

    def predict(self, model_name, X, threshold=0.5):
        """Predict labels and churn probabilities with a cached model.

        Falls back to loading ``models/<model_name>_model.pkl`` from disk
        when the model is not in memory.

        Parameters
        ----------
        model_name : str
            Key of the model ("logreg" or "lightgbm").
        X : array-like
            Feature matrix.
        threshold : float, default 0.5
            Decision threshold on the positive-class probability
            (generalizes the previously hard-coded 0.5; the default
            preserves the old behavior).

        Returns
        -------
        tuple
            ``(y_pred, y_pred_proba)`` as numpy arrays.

        Raises
        ------
        ValueError
            If the model is neither cached nor found on disk.
        """
        if model_name not in self.models:
            # Lazy-load a previously persisted model.
            try:
                model = joblib.load(f"models/{model_name}_model.pkl")
                self.models[model_name] = model
            except FileNotFoundError:
                raise ValueError(f"Model {model_name} not found. Please train the model first.")
        model = self.models[model_name]
        y_pred_proba = model.predict_proba(X)[:, 1]
        y_pred = (y_pred_proba >= threshold).astype(int)
        return y_pred, y_pred_proba
def main():
    """End-to-end pipeline: process the data, train both models, compare
    them, and check whether the best one meets the course targets
    (F1 >= 0.70 or ROC-AUC >= 0.75).
    """
    # 1. Data processing (project-local pipeline; path is the Telco
    # churn CSV expected under data/).
    print("正在处理数据...")
    X, y, df = data_processing_pipeline("data/Telco-Customer-Churn.csv")
    X_np, y_np = preprocess_data(X, y)
    # 2. Model training and evaluation.
    trainer = ModelTrainer()
    logreg_model, logreg_metrics = trainer.train_logreg(X_np, y_np)
    lgbm_model, lgbm_metrics = trainer.train_lightgbm(X_np, y_np)
    # Compare the trained models and pick the best by F1.
    best_model = trainer.compare_models()
    # 3. Verify the performance requirement.
    print("\n" + "="*50)
    print("模型性能要求检查")
    print("="*50)
    best_f1 = trainer.metrics[best_model]['f1']
    best_roc_auc = trainer.metrics[best_model]['roc_auc']
    if best_f1 >= 0.70 or best_roc_auc >= 0.75:
        print(f"✓ 模型性能达标最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✓ 满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")
    else:
        print(f"✗ 模型性能未达标最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✗ 未满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")


if __name__ == "__main__":
    main()