from pathlib import Path
from typing import Tuple, Dict, Any, Optional

import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
import joblib

from config import settings


class SpamClassifier:
    """Spam SMS classifier backed by a persisted model and vectorizer."""

    def __init__(self, model_name: str = "lightgbm") -> None:
        """Initialise the classifier and eagerly load its artifacts.

        Args:
            model_name: Prefix of the ``*_model.joblib`` and
                ``*_vectorizer.joblib`` files under
                ``settings.model_save_path``.
        """
        self.model_name = model_name
        self.model = None
        self.vectorizer = None
        self.load_model()

    def load_model(self) -> None:
        """Load the model and vectorizer for ``self.model_name``.

        Delegates to the module-level ``load_model`` function so that the
        file-naming scheme is defined in exactly one place.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        self.model, self.vectorizer = load_model(self.model_name)

    def classify(self, message: str) -> Dict[str, Any]:
        """Classify a single SMS message.

        Args:
            message: Raw message text.

        Returns:
            A dict with ``"label"`` (``"spam"`` or ``"ham"``) and
            ``"confidence"`` (predicted probability of that label).
        """
        # Turn the message into a feature vector.
        message_vector = self.vectorizer.transform([message])

        # Predict the label and the per-class probabilities.
        label = self.model.predict(message_vector)[0]
        probabilities = self.model.predict_proba(message_vector)[0]

        # predict_proba columns follow ``classes_`` order, so look the
        # predicted label up there instead of assuming label == column index.
        classes = getattr(self.model, "classes_", None)
        column = label if classes is None else list(classes).index(label)
        confidence = float(probabilities[column])

        # Map the numeric label to its text form.
        label_text = "spam" if label == 1 else "ham"

        return {
            "label": label_text,
            "confidence": confidence
        }


def extract_features(
    X_train: pl.Series,
    X_test: pl.Series,
    max_features: int = 1000
) -> Tuple[Any, Any, TfidfVectorizer]:
    """
    Extract text features with TF-IDF.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        max_features: Maximum vocabulary size.

    Returns:
        Training features, test features, and the fitted TF-IDF vectorizer.
    """
    # Convert the Polars Series to Pandas Series.
    X_train_pd = X_train.to_pandas()
    X_test_pd = X_test.to_pandas()

    # Configure the TF-IDF vectorizer (unigrams and bigrams, English
    # stop words removed).
    tfidf = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",
        ngram_range=(1, 2)
    )

    # Fit on the training set and transform it.
    X_train_tfidf = tfidf.fit_transform(X_train_pd)

    # Transform the test set with the already-fitted vocabulary.
    X_test_tfidf = tfidf.transform(X_test_pd)

    return X_train_tfidf, X_test_tfidf, tfidf


def train_logistic_regression(
    X_train: Any,
    y_train: pl.Series
) -> LogisticRegression:
    """
    Train a Logistic Regression model.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted Logistic Regression model.
    """
    # Convert the Polars Series to a Pandas Series.
    y_train_pd = y_train.to_pandas()

    # Configure the Logistic Regression model.
    log_reg = LogisticRegression(
        random_state=settings.random_state,
        max_iter=1000,
        class_weight="balanced"
    )

    # Fit the model.
    log_reg.fit(X_train, y_train_pd)

    return log_reg


def train_lightgbm(
    X_train: Any,
    y_train: pl.Series
) -> lgb.LGBMClassifier:
    """
    Train a LightGBM model.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted LightGBM classifier.
    """
    # Convert the Polars Series to a Pandas Series.
    y_train_pd = y_train.to_pandas()

    # Configure the LightGBM classifier.
    lgb_clf = lgb.LGBMClassifier(
        random_state=settings.random_state,
        class_weight="balanced",
        n_estimators=1000,
        learning_rate=0.1,
        num_leaves=31
    )

    # Fit the model.
    lgb_clf.fit(X_train, y_train_pd)

    return lgb_clf


def evaluate_model(
    model: Any,
    X_test: Any,
    y_test: pl.Series
) -> Dict[str, float]:
    """
    Evaluate a trained model on the test set.

    Also prints the classification report and the confusion matrix.

    Args:
        model: Trained model.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, plus
        ``roc_auc`` when the model exposes ``predict_proba``.
    """
    # Convert the Polars Series to a Pandas Series.
    y_test_pd = y_test.to_pandas()

    # Predict labels and, when supported, the probability of column 1.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Compute the evaluation metrics.
    metrics = {
        "accuracy": accuracy_score(y_test_pd, y_pred),
        "precision": precision_score(y_test_pd, y_pred),
        "recall": recall_score(y_test_pd, y_pred),
        "f1": f1_score(y_test_pd, y_pred)
    }

    # Compute ROC-AUC only if the model supports probability predictions.
    if y_pred_proba is not None:
        metrics["roc_auc"] = roc_auc_score(y_test_pd, y_pred_proba)

    # Print the classification report and confusion matrix.
    print("分类报告:")
    print(classification_report(y_test_pd, y_pred))

    print("混淆矩阵:")
    print(confusion_matrix(y_test_pd, y_pred))

    return metrics


def save_model(
    model: Any,
    model_name: str,
    vectorizer: Any = None
) -> None:
    """
    Persist a model and, optionally, its vectorizer.

    Args:
        model: Trained model.
        model_name: Prefix used for the saved file names.
        vectorizer: TF-IDF vectorizer; skipped when ``None``.
    """
    # Create the save directory, including any missing parent directories,
    # so that a nested ``settings.model_save_path`` does not fail.
    model_dir = Path(settings.model_save_path)
    model_dir.mkdir(parents=True, exist_ok=True)

    # Save the model.
    model_path = model_dir / f"{model_name}_model.joblib"
    joblib.dump(model, model_path)
    print(f"模型已保存到: {model_path}")

    # Save the vectorizer, if one was provided.
    if vectorizer is not None:
        vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
        joblib.dump(vectorizer, vectorizer_path)
        print(f"向量器已保存到: {vectorizer_path}")


def load_model(
    model_name: str
) -> Tuple[Any, Any]:
    """
    Load a persisted model and its vectorizer.

    Args:
        model_name: Prefix used for the saved file names.

    Returns:
        The loaded model and vectorizer.

    Raises:
        FileNotFoundError: If either joblib file is missing, which includes
            a model that was saved without a vectorizer.
    """
    # Resolve the directory the artifacts were saved to (nothing is created).
    model_dir = Path(settings.model_save_path)

    # Load the model.
    model_path = model_dir / f"{model_name}_model.joblib"
    model = joblib.load(model_path)
    print(f"模型已从: {model_path} 加载")

    # Load the vectorizer.
    vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
    vectorizer = joblib.load(vectorizer_path)
    print(f"向量器已从: {vectorizer_path} 加载")

    return model, vectorizer


def main():
    """Run the full training pipeline: load, clean, split, train, evaluate, save."""
    # 1. Load the dataset (path is relative to the current working directory).
    print("正在加载数据集...")
    df = pl.read_csv("../spam.csv", encoding="latin-1", ignore_errors=True)

    # 2. Clean the dataset.
    print("正在清洗数据集...")
    # Keep only the label and message columns by name rather than dropping
    # "the last three columns", which depends on the exact CSV layout.
    df = df.select(["v1", "v2"])
    df = df.rename({"v1": "label", "v2": "message"})
    # Encode the label: "spam" -> 1, anything else -> 0.
    df = df.with_columns(
        pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
    )

    # 3. Separate features and labels.
    X = df["message"]
    y = df["label"]

    # 4. Split into training and test sets, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=settings.test_size,
        random_state=settings.random_state,
        stratify=y
    )

    print(f"训练集大小: {len(X_train)}")
    print(f"测试集大小: {len(X_test)}")

    # 5. Extract features.
    print("正在提取特征...")
    X_train_tfidf, X_test_tfidf, tfidf = extract_features(X_train, X_test)

    # 6. Train the Logistic Regression model.
    print("\n正在训练Logistic Regression模型...")
    log_reg_model = train_logistic_regression(X_train_tfidf, y_train)

    # 7. Evaluate the Logistic Regression model.
    print("\n评估Logistic Regression模型:")
    log_reg_metrics = evaluate_model(log_reg_model, X_test_tfidf, y_test)
    print(f"Logistic Regression指标: {log_reg_metrics}")

    # 8. Train the LightGBM model.
    print("\n正在训练LightGBM模型...")
    lgb_model = train_lightgbm(X_train_tfidf, y_train)

    # 9. Evaluate the LightGBM model.
    print("\n评估LightGBM模型:")
    lgb_metrics = evaluate_model(lgb_model, X_test_tfidf, y_test)
    print(f"LightGBM指标: {lgb_metrics}")

    # 10. Save both models; each gets its own copy of the shared vectorizer.
    print("\n正在保存模型...")
    save_model(log_reg_model, "logistic_regression", tfidf)
    save_model(lgb_model, "lightgbm", tfidf)

    print("\n机器学习流程完成!")


if __name__ == "__main__":
    main()