123/src/machine_learning.py
朱指乐 aa10e463b4 feat: 初始化垃圾短信分类项目基础结构
添加项目核心文件结构,包括:
- 配置文件和环境变量管理
- 数据处理和翻译模块
- 机器学习模型训练和评估
- 基于LLM的智能分析Agent
- 测试脚本和项目文档
2026-01-14 00:18:34 +08:00

317 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, classification_report, confusion_matrix
)
import joblib
from pathlib import Path
from typing import Tuple, Dict, Any, Optional
from config import settings
class SpamClassifier:
    """Spam SMS classifier backed by a persisted model and text vectorizer.

    Both artifacts are read with ``joblib`` from ``settings.model_save_path``
    as soon as an instance is constructed.
    """

    def __init__(self, model_name: str = "lightgbm"):
        """Store the model name and load the matching artifacts.

        Args:
            model_name: File-name prefix of the saved artifacts, i.e.
                ``<model_name>_model.joblib`` and
                ``<model_name>_vectorizer.joblib``.
        """
        self.model_name = model_name
        self.model = None
        self.vectorizer = None
        self.load_model()

    def load_model(self):
        """Load the model and the vectorizer into ``self``.

        Raises whatever ``joblib.load`` raises when a file is missing or
        unreadable; classification needs both artifacts, so nothing is
        swallowed here.
        """
        model_dir = Path(settings.model_save_path)

        # NOTE: joblib.load unpickles the file, so only point
        # settings.model_save_path at artifacts you trust.
        model_path = model_dir / f"{self.model_name}_model.joblib"
        self.model = joblib.load(model_path)
        print(f"模型已从: {model_path} 加载")

        vectorizer_path = model_dir / f"{self.model_name}_vectorizer.joblib"
        self.vectorizer = joblib.load(vectorizer_path)
        print(f"向量器已从: {vectorizer_path} 加载")

    def classify(self, message: str) -> Dict[str, Any]:
        """Classify a single SMS message.

        Args:
            message: Raw message text.

        Returns:
            A dict with two keys: ``"label"`` is ``"spam"`` when the predicted
            label equals 1 and ``"ham"`` otherwise; ``"confidence"`` is the
            probability the model assigns to the predicted label, as a
            built-in ``float``.
        """
        message_vector = self.vectorizer.transform([message])

        label = self.model.predict(message_vector)[0]
        probabilities = self.model.predict_proba(message_vector)[0]

        # predict_proba columns follow the order of model.classes_, so the
        # predicted label must be mapped to its column instead of being used
        # as a positional index (which breaks for labels such as -1/1).
        classes = getattr(self.model, "classes_", None)
        if classes is None:
            column = label
        else:
            column = list(classes).index(label)

        return {
            "label": "spam" if label == 1 else "ham",
            "confidence": float(probabilities[column]),
        }
def extract_features(
    X_train: pl.Series,
    X_test: pl.Series,
    max_features: int = 1000
) -> Tuple[Any, Any, TfidfVectorizer]:
    """
    Turn raw text into TF-IDF features.

    The vectorizer is fitted on the training text only and then applied to
    the test text, so the test split never influences the vocabulary.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        max_features: Upper bound on the vocabulary size.

    Returns:
        Training features, test features, and the fitted TF-IDF vectorizer.
    """
    # Convert both Polars Series to pandas before handing them to scikit-learn.
    train_texts, test_texts = X_train.to_pandas(), X_test.to_pandas()

    # Unigrams and bigrams, with the built-in English stop-word list.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        max_features=max_features,
    )

    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer
def train_logistic_regression(
    X_train: Any,
    y_train: pl.Series
) -> LogisticRegression:
    """
    Fit a Logistic Regression classifier.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted Logistic Regression model.
    """
    # scikit-learn receives the labels as a pandas Series.
    labels = y_train.to_pandas()

    # class_weight="balanced" reweights classes inversely to their frequency.
    classifier = LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=settings.random_state,
    )
    classifier.fit(X_train, labels)
    return classifier
def train_lightgbm(
    X_train: Any,
    y_train: pl.Series
) -> lgb.LGBMClassifier:
    """
    Fit a LightGBM classifier.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted LightGBM model.
    """
    # LightGBM receives the labels as a pandas Series.
    labels = y_train.to_pandas()

    hyperparameters = {
        "random_state": settings.random_state,
        "class_weight": "balanced",
        "n_estimators": 1000,
        "learning_rate": 0.1,
        "num_leaves": 31,
    }
    booster = lgb.LGBMClassifier(**hyperparameters)
    booster.fit(X_train, labels)
    return booster
def evaluate_model(
    model: Any,
    X_test: Any,
    y_test: pl.Series
) -> Dict[str, float]:
    """
    Score a fitted model on the held-out split.

    Besides returning the metrics, the classification report and the
    confusion matrix are printed to stdout.

    Args:
        model: Fitted model exposing ``predict`` (and optionally
            ``predict_proba``).
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Mapping with ``accuracy``, ``precision``, ``recall`` and ``f1``, plus
        ``roc_auc`` when the model supports probability prediction.
    """
    truth = y_test.to_pandas()
    predictions = model.predict(X_test)

    # Column 1 of predict_proba is used as the score for ROC-AUC
    # (presumably the positive/spam class - confirm against model.classes_).
    positive_scores = None
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    metrics = {name: scorer(truth, predictions) for name, scorer in scorers}

    if positive_scores is not None:
        metrics["roc_auc"] = roc_auc_score(truth, positive_scores)

    print("分类报告:")
    print(classification_report(truth, predictions))
    print("混淆矩阵:")
    print(confusion_matrix(truth, predictions))
    return metrics
def save_model(
    model: Any,
    model_name: str,
    vectorizer: Any = None
) -> None:
    """
    Persist a model and, optionally, its vectorizer with joblib.

    Files are written to ``settings.model_save_path`` as
    ``<model_name>_model.joblib`` and ``<model_name>_vectorizer.joblib``.

    Args:
        model: Fitted model to save.
        model_name: File-name prefix for the saved artifacts.
        vectorizer: TF-IDF vectorizer; skipped when ``None``.
    """
    model_dir = Path(settings.model_save_path)
    # parents=True so a nested save path (e.g. "artifacts/models") is created
    # in full instead of failing when intermediate directories are missing.
    model_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / f"{model_name}_model.joblib"
    joblib.dump(model, model_path)
    print(f"模型已保存到: {model_path}")

    # The vectorizer is optional; only write it when one was provided.
    if vectorizer is not None:
        vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
        joblib.dump(vectorizer, vectorizer_path)
        print(f"向量器已保存到: {vectorizer_path}")
def load_model(
    model_name: str
) -> Tuple[Any, Any]:
    """
    Load a model and its vectorizer from ``settings.model_save_path``.

    Args:
        model_name: File-name prefix of the saved artifacts.

    Returns:
        ``(model, vectorizer)``. ``vectorizer`` is ``None`` when no
        ``<model_name>_vectorizer.joblib`` file exists, which is the case
        for models saved without a vectorizer.

    Raises:
        FileNotFoundError: If the model file itself is missing.
    """
    # Resolve the directory the artifacts were saved to (nothing is created).
    model_dir = Path(settings.model_save_path)

    # NOTE: joblib.load unpickles the file, so only load artifacts you trust.
    model_path = model_dir / f"{model_name}_model.joblib"
    model = joblib.load(model_path)
    print(f"模型已从: {model_path} 加载")

    # save_model() writes the vectorizer only when one is supplied, so its
    # absence is a valid state rather than an error.
    vectorizer = None
    vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
    if vectorizer_path.is_file():
        vectorizer = joblib.load(vectorizer_path)
        print(f"向量器已从: {vectorizer_path} 加载")

    return model, vectorizer
def main():
    """Run the full pipeline: load, clean, split, vectorize, train, evaluate, save."""
    # 1. Load the dataset (path is relative to the current working directory).
    print("正在加载数据集...")
    raw = pl.read_csv("../spam.csv", encoding="latin-1", ignore_errors=True)

    # 2. Clean: drop the last three columns, rename v1/v2, and encode the
    #    label as 1 for "spam" and 0 for everything else.
    print("正在清洗数据集...")
    trailing_columns = raw.columns[-3:]
    is_spam = pl.col("label") == "spam"
    data = (
        raw.drop(trailing_columns)
        .rename({"v1": "label", "v2": "message"})
        .with_columns(pl.when(is_spam).then(1).otherwise(0).alias("label"))
    )

    # 3. Separate features and labels.
    messages, labels = data["message"], data["label"]

    # 4. Stratified train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        messages,
        labels,
        test_size=settings.test_size,
        random_state=settings.random_state,
        stratify=labels,
    )
    print(f"训练集大小: {len(X_train)}")
    print(f"测试集大小: {len(X_test)}")

    # 5. TF-IDF feature extraction.
    print("正在提取特征...")
    train_features, test_features, vectorizer = extract_features(X_train, X_test)

    # 6-7. Logistic Regression: train, then evaluate.
    print("\n正在训练Logistic Regression模型...")
    lr_model = train_logistic_regression(train_features, y_train)
    print("\n评估Logistic Regression模型:")
    lr_scores = evaluate_model(lr_model, test_features, y_test)
    print(f"Logistic Regression指标: {lr_scores}")

    # 8-9. LightGBM: train, then evaluate.
    print("\n正在训练LightGBM模型...")
    gbm_model = train_lightgbm(train_features, y_train)
    print("\n评估LightGBM模型:")
    gbm_scores = evaluate_model(gbm_model, test_features, y_test)
    print(f"LightGBM指标: {gbm_scores}")

    # 10. Persist both models, each alongside the shared vectorizer.
    print("\n正在保存模型...")
    for artifact_name, fitted in (
        ("logistic_regression", lr_model),
        ("lightgbm", gbm_model),
    ):
        save_model(fitted, artifact_name, vectorizer)

    print("\n机器学习流程完成!")


if __name__ == "__main__":
    main()