317 lines
8.5 KiB
Python
317 lines
8.5 KiB
Python
|
|
import polars as pl
|
|||
|
|
import pandas as pd
|
|||
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|||
|
|
from sklearn.linear_model import LogisticRegression
|
|||
|
|
from sklearn.ensemble import RandomForestClassifier
|
|||
|
|
import lightgbm as lgb
|
|||
|
|
from sklearn.model_selection import train_test_split, GridSearchCV
|
|||
|
|
from sklearn.metrics import (
|
|||
|
|
accuracy_score, precision_score, recall_score, f1_score,
|
|||
|
|
roc_auc_score, classification_report, confusion_matrix
|
|||
|
|
)
|
|||
|
|
import joblib
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Tuple, Dict, Any, Optional
|
|||
|
|
from config import settings
|
|||
|
|
|
|||
|
|
|
|||
|
|
class SpamClassifier:
    """Spam SMS classifier backed by a persisted model and text vectorizer.

    Both artifacts are read from the directory ``settings.model_save_path``
    using the file names ``<model_name>_model.joblib`` and
    ``<model_name>_vectorizer.joblib``.
    """

    def __init__(self, model_name: str = "lightgbm"):
        """Store the model name and load its artifacts immediately.

        Args:
            model_name: Prefix of the persisted artifact file names.
        """
        self.model_name = model_name
        self.model = None
        self.vectorizer = None
        self.load_model()

    def load_model(self):
        """Load the model and the vectorizer into ``self``.

        Errors raised by ``joblib.load`` (for example for a missing file)
        are not caught here and propagate to the caller.
        """
        # NOTE: joblib files are pickle-based; only load artifacts that this
        # project produced itself.
        model_dir = Path(settings.model_save_path)

        # Load the model.
        model_path = model_dir / f"{self.model_name}_model.joblib"
        self.model = joblib.load(model_path)
        print(f"模型已从: {model_path} 加载")

        # Load the vectorizer.
        vectorizer_path = model_dir / f"{self.model_name}_vectorizer.joblib"
        self.vectorizer = joblib.load(vectorizer_path)
        print(f"向量器已从: {vectorizer_path} 加载")

    def classify(self, message: str) -> Dict[str, Any]:
        """Classify a single SMS message.

        Args:
            message: Raw message text.

        Returns:
            A dict with ``"label"`` (``"spam"`` when the predicted label
            equals 1, otherwise ``"ham"``) and ``"confidence"`` (the
            probability the model assigns to the predicted label, as a
            builtin ``float``).
        """
        # The vectorizer expects an iterable of documents, hence the list.
        features = self.vectorizer.transform([message])

        label = self.model.predict(features)[0]
        probabilities = self.model.predict_proba(features)[0]

        # predict_proba columns follow the order of ``classes_``, so the
        # label value is only a valid column index when classes_ == [0, 1].
        # Resolve the column through classes_ when the model exposes it and
        # fall back to the label value otherwise.
        classes = getattr(self.model, "classes_", None)
        if classes is None:
            column = label
        else:
            column = list(classes).index(label)

        # Convert to a plain float so callers never receive a numpy scalar.
        confidence = float(probabilities[column])

        label_text = "spam" if label == 1 else "ham"

        return {
            "label": label_text,
            "confidence": confidence,
        }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_features(
    X_train: pl.Series,
    X_test: pl.Series,
    max_features: int = 1000
) -> Tuple[Any, Any, TfidfVectorizer]:
    """Turn raw text into TF-IDF features.

    The vectorizer is fitted on the training texts only and then applied
    unchanged to the test texts.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        max_features: Upper bound on the vocabulary size.

    Returns:
        A tuple ``(train_features, test_features, fitted_vectorizer)``.
    """
    # Convert both Polars series to pandas before handing them to sklearn.
    train_docs, test_docs = (series.to_pandas() for series in (X_train, X_test))

    # Unigrams and bigrams, with the built-in English stop-word list removed.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        max_features=max_features,
    )

    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)

    return train_matrix, test_matrix, vectorizer
|
|||
|
|
|
|||
|
|
|
|||
|
|
def train_logistic_regression(
    X_train: Any,
    y_train: pl.Series
) -> LogisticRegression:
    """Fit a Logistic Regression classifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # sklearn receives the labels as a pandas Series.
    labels = y_train.to_pandas()

    # "balanced" re-weights classes inversely to their frequency.
    hyperparams = {
        "random_state": settings.random_state,
        "max_iter": 1000,
        "class_weight": "balanced",
    }
    classifier = LogisticRegression(**hyperparams)

    classifier.fit(X_train, labels)
    return classifier
|
|||
|
|
|
|||
|
|
|
|||
|
|
def train_lightgbm(
    X_train: Any,
    y_train: pl.Series
) -> lgb.LGBMClassifier:
    """Fit a LightGBM classifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``lgb.LGBMClassifier`` estimator.
    """
    # LightGBM receives the labels as a pandas Series.
    labels = y_train.to_pandas()

    # Fixed hyperparameters; no early stopping or tuning is applied here.
    hyperparams = {
        "random_state": settings.random_state,
        "class_weight": "balanced",
        "n_estimators": 1000,
        "learning_rate": 0.1,
        "num_leaves": 31,
    }
    classifier = lgb.LGBMClassifier(**hyperparams)

    classifier.fit(X_train, labels)
    return classifier
|
|||
|
|
|
|||
|
|
|
|||
|
|
def evaluate_model(
    model: Any,
    X_test: Any,
    y_test: pl.Series
) -> Dict[str, float]:
    """Score a fitted model on the test set and print a report.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True test labels.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, plus
        ``roc_auc`` when the model offers ``predict_proba``.
    """
    y_true = y_test.to_pandas()

    predictions = model.predict(X_test)

    # Column 1 of predict_proba is used as the score for ROC-AUC
    # (assumed to be the positive/spam class — confirm against the labels).
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    # Label-based metrics, computed in a fixed order.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    metrics = {name: scorer(y_true, predictions) for name, scorer in scorers}

    # ROC-AUC needs probability scores, so it is optional.
    if positive_scores is not None:
        metrics["roc_auc"] = roc_auc_score(y_true, positive_scores)

    # Human-readable summary on stdout.
    print("分类报告:")
    print(classification_report(y_true, predictions))

    print("混淆矩阵:")
    print(confusion_matrix(y_true, predictions))

    return metrics
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_model(
    model: Any,
    model_name: str,
    vectorizer: Any = None
) -> None:
    """Persist a model and, optionally, its vectorizer with joblib.

    Files are written to ``settings.model_save_path`` as
    ``<model_name>_model.joblib`` and ``<model_name>_vectorizer.joblib``.

    Args:
        model: Fitted model to persist.
        model_name: Prefix used for the artifact file names.
        vectorizer: Fitted vectorizer; skipped when ``None``.
    """
    # Create the target directory. parents=True lets a nested save path
    # (e.g. "artifacts/models") work on the first run instead of raising
    # FileNotFoundError because an intermediate directory is missing.
    model_dir = Path(settings.model_save_path)
    model_dir.mkdir(parents=True, exist_ok=True)

    # Save the model.
    model_path = model_dir / f"{model_name}_model.joblib"
    joblib.dump(model, model_path)
    print(f"模型已保存到: {model_path}")

    # Save the vectorizer, if one was provided.
    if vectorizer is not None:
        vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
        joblib.dump(vectorizer, vectorizer_path)
        print(f"向量器已保存到: {vectorizer_path}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_model(
    model_name: str
) -> Tuple[Any, Any]:
    """Load a persisted model and its vectorizer.

    Files are read from ``settings.model_save_path`` as
    ``<model_name>_model.joblib`` and ``<model_name>_vectorizer.joblib``.

    Args:
        model_name: Prefix used for the artifact file names.

    Returns:
        A tuple ``(model, vectorizer)``. ``vectorizer`` is ``None`` when no
        vectorizer file exists for this model name.

    Raises:
        Whatever ``joblib.load`` raises when the model file is missing or
        unreadable; that error is not caught here.
    """
    # NOTE: joblib files are pickle-based; only load artifacts that this
    # project produced itself.
    model_dir = Path(settings.model_save_path)

    # Load the model.
    model_path = model_dir / f"{model_name}_model.joblib"
    model = joblib.load(model_path)
    print(f"模型已从: {model_path} 加载")

    # save_model() writes the vectorizer only when one is supplied, so a
    # missing vectorizer file is a valid state rather than an error.
    vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
    if not vectorizer_path.is_file():
        print(f"未找到向量器: {vectorizer_path}")
        return model, None

    # Load the vectorizer.
    vectorizer = joblib.load(vectorizer_path)
    print(f"向量器已从: {vectorizer_path} 加载")

    return model, vectorizer
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run the whole pipeline: load, clean, split, vectorize, train, evaluate, save."""
    # Step 1: read the raw CSV (path is relative to the working directory).
    print("正在加载数据集...")
    raw = pl.read_csv("../spam.csv", encoding="latin-1", ignore_errors=True)

    # Step 2: drop the last three columns, rename v1/v2 and encode the label
    # as 1 for "spam" and 0 for everything else.
    print("正在清洗数据集...")
    is_spam = pl.col("label") == "spam"
    df = (
        raw.drop(raw.columns[-3:])
        .rename({"v1": "label", "v2": "message"})
        .with_columns(pl.when(is_spam).then(1).otherwise(0).alias("label"))
    )

    # Step 3: separate features and labels.
    messages, labels = df["message"], df["label"]

    # Step 4: stratified train/test split driven by the project settings.
    split = train_test_split(
        messages,
        labels,
        test_size=settings.test_size,
        random_state=settings.random_state,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = split

    print(f"训练集大小: {len(X_train)}")
    print(f"测试集大小: {len(X_test)}")

    # Step 5: TF-IDF features shared by both models.
    print("正在提取特征...")
    features = extract_features(X_train, X_test)
    X_train_tfidf, X_test_tfidf, tfidf = features

    # Steps 6-7: Logistic Regression — train, then evaluate.
    print("\n正在训练Logistic Regression模型...")
    log_reg_model = train_logistic_regression(X_train_tfidf, y_train)

    print("\n评估Logistic Regression模型:")
    log_reg_metrics = evaluate_model(log_reg_model, X_test_tfidf, y_test)
    print(f"Logistic Regression指标: {log_reg_metrics}")

    # Steps 8-9: LightGBM — train, then evaluate.
    print("\n正在训练LightGBM模型...")
    lgb_model = train_lightgbm(X_train_tfidf, y_train)

    print("\n评估LightGBM模型:")
    lgb_metrics = evaluate_model(lgb_model, X_test_tfidf, y_test)
    print(f"LightGBM指标: {lgb_metrics}")

    # Step 10: persist both models, each with the same fitted vectorizer.
    print("\n正在保存模型...")
    save_model(log_reg_model, "logistic_regression", tfidf)
    save_model(lgb_model, "lightgbm", tfidf)

    print("\n机器学习流程完成!")


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|