feat: initialize the basic structure of the spam message classification project

Add the core project file structure, including:
- configuration files and environment variable management
- data processing and translation modules
- machine learning model training and evaluation
- an LLM-based intelligent analysis agent
- test scripts and project documentation
parent d597ddd2ff
commit aa10e463b4
6 .env.example Normal file
@@ -0,0 +1,6 @@
# DeepSeek API Configuration
DEEPSEEK_API_KEY="your-deepseek-api-key-here"

# Project Configuration
MODEL_SAVE_PATH="./models"
DATA_PATH="./data"
54 .gitignore vendored Normal file
@@ -0,0 +1,54 @@
# Python
__pycache__/
*.py[cod]
*$py.class

# Environment
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Dependencies
.venv/
venv/
env/

# Data
data/
*.csv
*.parquet
*.h5

# Models
models/
*.joblib
*.pkl
*.model
*.txt

# Logs
logs/
*.log

# Build
dist/
build/
*.egg-info/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Testing
.pytest_cache/
.coverage
htmlcov/

# OS
.DS_Store
Thumbs.db
49 .trae/documents/垃圾短信分类项目实现计划.md Normal file
@@ -0,0 +1,49 @@
# Spam Message Classification Project Implementation Plan

## 1. Project Structure
- Create the project directory structure, including `src`, `data`, `models`, etc.
- Initialize project dependencies, managed with uv
- Create configuration files and environment variable management

## 2. Data Processing
- Load and clean the spam.csv dataset with Polars
- Translate the English messages into Chinese using the DeepSeek API
- Define a data schema with Pandera for validation (see the sketch after this plan)
- Data preprocessing and feature engineering

## 3. Machine Learning Models
- Implement at least two models: Logistic Regression as the baseline, LightGBM as the strong model
- Model training, validation, and evaluation
- Model saving and loading
- Reach a performance target of F1 ≥ 0.70 or ROC-AUC ≥ 0.75

## 4. LLM Integration
- Use the DeepSeek API to explain and attribute message content
- Generate structured action recommendations
- Ensure the output is traceable and reproducible

## 5. Agent Framework
- Build an Agent with structured output using pydantic-ai
- Implement at least two tools: an ML prediction tool and an evaluation tool
- Build the complete tool-calling flow

## 6. Testing and Deployment
- Write unit and integration tests
- Make sure the project runs on the instructor's machine
- Prepare the project presentation materials

## Tech Stack
- Python 3.12
- uv for project management
- Polars + Pandas for data processing
- Pandera for data validation
- Scikit-learn + LightGBM for machine learning
- pydantic-ai as the Agent framework
- DeepSeek API as the LLM provider

## Expected Deliverables
- A complete spam message classification system
- A Chinese-translated dataset
- Reproducible machine learning models
- LLM-based intelligent recommendation generation
- Structured, traceable output
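The Pandera validation step named in section 2 of the plan is not implemented anywhere in this commit. A minimal sketch of what that schema could look like, assuming the column layout produced by data_processing.clean_data (a string label in {ham, spam} and a non-null message column); the checks are illustrative assumptions, not shipped code:

import pandera as pa

from data_processing import clean_data, load_data

# Illustrative schema derived from the plan, validating the cleaned dataset
schema = pa.DataFrameSchema({
    "label": pa.Column(str, pa.Check.isin(["ham", "spam"])),
    "message": pa.Column(str, nullable=False),
})

df_cleaned = clean_data(load_data("./spam.csv"))
schema.validate(df_cleaned.to_pandas())  # raises pandera.errors.SchemaError on bad rows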
41 pyproject.toml Normal file
@@ -0,0 +1,41 @@
[tool.uv]
index-url = "https://mirrors.aliyun.com/pypi/simple/"

[project]
name = "spam-classification"
version = "0.1.0"
authors = [{ name = "Your Name", email = "your.email@example.com" }]
description = "Spam message classification with ML and LLM integration"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "pandas>=2.2",
    "polars>=0.20",
    "pandera>=0.18",
    "scikit-learn>=1.4",
    "lightgbm>=4.3",
    "pydantic>=2.5",
    "pydantic-ai>=0.3",
    "pydantic-settings>=2.0",
    "python-dotenv>=1.0",
    "requests>=2.31",
    "joblib>=1.3",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4",
    "ruff>=0.2",
]

[build-system]
requires = ["uv_build>=0.7"]
build-backend = "uv_build"

[tool.ruff]
line-length = 88

[tool.ruff.lint]
select = ["E", "F", "W"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = "test_*.py"
python_classes = "Test*"
python_functions = "test_*"
50 simple_test.py Normal file
@@ -0,0 +1,50 @@
import os

import requests
from dotenv import load_dotenv

load_dotenv()


# Call the DeepSeek API directly
def test_deepseek_api():
    # Read the key from the environment instead of hardcoding a secret
    api_key = os.environ["DEEPSEEK_API_KEY"]
    api_base = "https://api.deepseek.com"
    model = "deepseek-chat"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "You are a professional translator. Translate the following text to Chinese. Keep the original meaning and tone. Do not add any additional information."
            },
            {
                "role": "user",
                "content": "Hello, how are you?"
            }
        ],
        "max_tokens": 1000,
        "temperature": 0.1
    }

    try:
        response = requests.post(
            f"{api_base}/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )
        response.raise_for_status()

        result = response.json()
        print("API response:", result)
        translated_text = result["choices"][0]["message"]["content"].strip()
        print(f"Translated text: {translated_text}")
        return translated_text
    except requests.exceptions.RequestException as e:
        print(f"Translation failed: {e}")
        return None


if __name__ == "__main__":
    test_deepseek_api()
250 src/agent.py Normal file
@@ -0,0 +1,250 @@
import asyncio
from pathlib import Path
from typing import Any, Dict, List

import joblib
from pydantic import BaseModel, Field
from pydantic_ai import Agent, Tool
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider

from config import settings
from translation import translate_text


class Message(BaseModel):
    """SMS message model."""
    content: str = Field(..., description="Message text")
    is_english: bool = Field(default=True, description="Whether the message is in English")


class ClassificationResult(BaseModel):
    """Classification result model."""
    label: str = Field(..., description="Predicted label, ham or spam")
    confidence: float = Field(..., description="Prediction confidence")


class Explanation(BaseModel):
    """Explanation model."""
    key_words: List[str] = Field(..., description="Key feature words")
    reason: str = Field(..., description="Reason for the classification")
    suggestion: str = Field(..., description="Recommended action")


class AnalysisResult(BaseModel):
    """Analysis result model."""
    message: str = Field(..., description="Original message")
    message_zh: str = Field(..., description="Chinese translation")
    classification: ClassificationResult = Field(..., description="Classification result")
    explanation: Explanation = Field(..., description="Explanation and suggestion")


class SpamClassifier:
    """Spam message classifier."""

    def __init__(self, model_name: str = "lightgbm"):
        """Initialize the classifier."""
        self.model_name = model_name
        self.model = None
        self.vectorizer = None
        self.load_model()

    def load_model(self):
        """Load the model and vectorizer."""
        model_dir = Path(settings.model_save_path)

        # Load the model
        model_path = model_dir / f"{self.model_name}_model.joblib"
        self.model = joblib.load(model_path)
        print(f"Model loaded from: {model_path}")

        # Load the vectorizer
        vectorizer_path = model_dir / f"{self.model_name}_vectorizer.joblib"
        self.vectorizer = joblib.load(vectorizer_path)
        print(f"Vectorizer loaded from: {vectorizer_path}")

    def classify(self, message: str) -> Dict[str, Any]:
        """Classify a single message."""
        # Vectorize the message
        message_vector = self.vectorizer.transform([message])

        # Predict the label and its confidence
        label = int(self.model.predict(message_vector)[0])
        confidence = float(self.model.predict_proba(message_vector)[0][label])

        # Convert the numeric label to text
        label_text = "spam" if label == 1 else "ham"

        return {
            "label": label_text,
            "confidence": confidence
        }


class SpamAnalysisTool:
    """Spam message analysis tool."""

    name = "spam_analysis_tool"
    description = "Analyze whether a message is spam and provide an explanation and a suggestion"

    def __init__(self, classifier: SpamClassifier):
        self.classifier = classifier

    async def __call__(self, message: str, is_english: bool = True) -> AnalysisResult:
        """Analyze a message."""
        # Translate to Chinese if the message is in English
        message_zh = translate_text(message, "zh-CN") if is_english else message

        # Classify the message
        classification = self.classifier.classify(message)

        # Generate an explanation and a suggestion
        explanation = self.generate_explanation(message, classification["label"])

        return AnalysisResult(
            message=message,
            message_zh=message_zh,
            classification=ClassificationResult(
                label=classification["label"],
                confidence=classification["confidence"]
            ),
            explanation=explanation
        )

    def generate_explanation(self, message: str, label: str) -> Explanation:
        """Generate an explanation and a suggestion."""
        # Simple keyword extraction (a real project could use a more sophisticated method)
        key_words = self.extract_keywords(message)

        # Generate the reason and the suggestion
        if label == "spam":
            reason = f"The message contains typical spam keywords: {', '.join(key_words)}"
            suggestion = "Delete the message immediately; do not click any links or reply, to avoid being scammed"
        else:
            reason = f"The message is legitimate and contains common vocabulary: {', '.join(key_words)}"
            suggestion = "The message can be replied to and handled normally"

        return Explanation(
            key_words=key_words,
            reason=reason,
            suggestion=suggestion
        )

    def extract_keywords(self, message: str, top_n: int = 5) -> List[str]:
        """Extract keywords."""
        words = message.lower().split()

        # Filter stop words using the fitted vectorizer's stop word list
        stop_words = set(self.classifier.vectorizer.get_stop_words() or [])
        keywords = [word for word in words if word not in stop_words and len(word) > 2]

        # Return only the first top_n keywords
        return keywords[:top_n]


class ModelEvaluationTool:
    """Model evaluation tool."""

    name = "model_evaluation_tool"
    description = "Evaluate model performance on a given dataset"

    def __init__(self, classifier: SpamClassifier):
        self.classifier = classifier

    async def __call__(self, test_data: List[str], labels: List[str]) -> Dict[str, float]:
        """Evaluate model performance."""
        # Reuse the already-fitted vectorizer to transform the test data
        test_vectors = self.classifier.vectorizer.transform(test_data)

        # Predict
        predictions = self.classifier.model.predict(test_vectors)
        predictions_proba = self.classifier.model.predict_proba(test_vectors)[:, 1]

        # Convert the labels to numeric values
        y_true = [1 if label == "spam" else 0 for label in labels]

        # Compute the evaluation metrics
        from sklearn.metrics import (
            accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
        )

        metrics = {
            "accuracy": accuracy_score(y_true, predictions),
            "precision": precision_score(y_true, predictions),
            "recall": recall_score(y_true, predictions),
            "f1": f1_score(y_true, predictions),
            "roc_auc": roc_auc_score(y_true, predictions_proba)
        }

        return metrics


class SpamAnalysisAgent:
    """Spam analysis agent."""

    def __init__(self, model_name: str = "lightgbm"):
        """Initialize the agent."""
        # Create the classifier
        self.classifier = SpamClassifier(model_name)

        # Wrap the callables as pydantic-ai tools
        analysis = SpamAnalysisTool(self.classifier)
        evaluation = ModelEvaluationTool(self.classifier)
        self.tools = [
            Tool(analysis.__call__, name=analysis.name, description=analysis.description),
            Tool(evaluation.__call__, name=evaluation.name, description=evaluation.description)
        ]

        # DeepSeek exposes an OpenAI-compatible API, so use the OpenAI model class
        model = OpenAIModel(
            settings.deepseek_model,
            provider=OpenAIProvider(
                base_url=settings.deepseek_api_base,
                api_key=settings.deepseek_api_key
            )
        )
        self.agent = Agent(model, output_type=AnalysisResult, tools=self.tools)

    async def analyze_message(self, message: str, is_english: bool = True) -> AnalysisResult:
        """Analyze a single message."""
        result = await self.agent.run(f"Analyze the following message: {message}")
        return result.output

    async def batch_analyze(self, messages: List[str], is_english: bool = True) -> List[AnalysisResult]:
        """Analyze messages in batch."""
        results = []
        for message in messages:
            result = await self.analyze_message(message, is_english)
            results.append(result)

        return results


async def main():
    """Agent entry point."""
    # Create the agent instance
    agent = SpamAnalysisAgent()

    # Test messages
    test_messages = [
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        "Ok lar... Joking wif u oni...",
        "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
    ]

    # Analyze the messages
    for message in test_messages:
        print("\n=== Message analysis ===")
        print(f"Original message: {message}")
        result = await agent.analyze_message(message)
        print(f"Classification: {result.classification.label} (confidence: {result.classification.confidence:.2f})")
        print(f"Chinese translation: {result.message_zh}")
        print(f"Key words: {', '.join(result.explanation.key_words)}")
        print(f"Reason: {result.explanation.reason}")
        print(f"Suggestion: {result.explanation.suggestion}")


if __name__ == "__main__":
    asyncio.run(main())
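Because the tools above are plain async callables before being wrapped for pydantic-ai, they can also be exercised directly, which is handy for checking the plan's F1 ≥ 0.70 / ROC-AUC ≥ 0.75 target without an LLM call. A minimal sketch, assuming models/ already holds the artifacts produced by machine_learning.main(); the two sample messages and labels are placeholders:

import asyncio

from agent import ModelEvaluationTool, SpamClassifier


async def check_metrics():
    # Invoke the evaluation tool directly, bypassing the agent loop
    tool = ModelEvaluationTool(SpamClassifier("lightgbm"))
    metrics = await tool(
        ["WINNER!! You have been selected for a prize", "See you at dinner tonight"],
        ["spam", "ham"],
    )
    print(metrics)  # accuracy / precision / recall / f1 / roc_auc


asyncio.run(check_metrics())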
29 src/config.py Normal file
@@ -0,0 +1,29 @@
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Project settings."""

    model_config = SettingsConfigDict(
        env_file=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".env"),
        env_file_encoding="utf-8",
    )

    # DeepSeek API configuration
    deepseek_api_key: str
    deepseek_api_base: str = "https://api.deepseek.com"
    deepseek_model: str = "deepseek-chat"

    # Project path configuration
    model_save_path: str = "./models"
    data_path: str = "./data"

    # Model configuration
    random_state: int = 42
    test_size: float = 0.2


# Global settings instance
settings = Settings()
76 src/data_processing.py Normal file
@@ -0,0 +1,76 @@
from typing import Tuple

import polars as pl


def load_data(file_path: str) -> pl.DataFrame:
    """Load the dataset with Polars."""
    # Load the csv file, handling the non-UTF-8 encoding
    df = pl.read_csv(
        file_path,
        encoding="latin-1",
        ignore_errors=True,
        has_header=True
    )
    return df


def clean_data(df: pl.DataFrame) -> pl.DataFrame:
    """Clean the dataset."""
    # Inspect the raw dataset
    print("Raw dataset shape:", df.shape)
    print("Raw dataset columns:", df.columns)

    # Drop the unnecessary columns (the last three are empty)
    df = df.drop(df.columns[-3:])

    # Rename the columns
    df = df.rename({
        "v1": "label",
        "v2": "message"
    })

    # Inspect the cleaned dataset
    print("Cleaned dataset shape:", df.shape)
    print("Cleaned dataset columns:", df.columns)
    print("Label distribution:", df["label"].value_counts())

    return df


def preprocess_data(df: pl.DataFrame) -> Tuple[pl.DataFrame, pl.Series]:
    """Preprocess the data for model training."""
    # Convert the labels to numeric values (ham=0, spam=1)
    df = df.with_columns(
        pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
    )

    # Separate features and labels
    X = df.drop("label")
    y = df["label"]

    return X, y


def save_data(df: pl.DataFrame, file_path: str) -> None:
    """Save the processed dataset."""
    df.write_csv(file_path)
    print(f"Dataset saved to: {file_path}")


if __name__ == "__main__":
    # Smoke-test the data processing pipeline
    import os

    file_path = "../spam.csv"
    # Fall back if the file is not one directory up
    if not os.path.exists(file_path):
        file_path = "./spam.csv"
    df = load_data(file_path)
    df_cleaned = clean_data(df)
    X, y = preprocess_data(df_cleaned)

    print("Feature data shape:", X.shape)
    print("Label data shape:", y.shape)
    print("First 5 rows:")
    print(df_cleaned.head())
316 src/machine_learning.py Normal file
@@ -0,0 +1,316 @@
from pathlib import Path
from typing import Any, Dict, Tuple

import joblib
import lightgbm as lgb
import polars as pl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)
from sklearn.model_selection import train_test_split

from config import settings


class SpamClassifier:
    """Spam message classifier."""
    def __init__(self, model_name: str = "lightgbm"):
        """Initialize the classifier."""
        self.model_name = model_name
        self.model = None
        self.vectorizer = None
        self.load_model()

    def load_model(self):
        """Load the model and vectorizer."""
        model_dir = Path(settings.model_save_path)

        # Load the model
        model_path = model_dir / f"{self.model_name}_model.joblib"
        self.model = joblib.load(model_path)
        print(f"Model loaded from: {model_path}")

        # Load the vectorizer
        vectorizer_path = model_dir / f"{self.model_name}_vectorizer.joblib"
        self.vectorizer = joblib.load(vectorizer_path)
        print(f"Vectorizer loaded from: {vectorizer_path}")

    def classify(self, message: str) -> Dict[str, Any]:
        """Classify a single message."""
        # Vectorize the message
        message_vector = self.vectorizer.transform([message])

        # Predict the label and its confidence
        label = int(self.model.predict(message_vector)[0])
        confidence = float(self.model.predict_proba(message_vector)[0][label])

        # Convert the numeric label to text
        label_text = "spam" if label == 1 else "ham"

        return {
            "label": label_text,
            "confidence": confidence
        }


def extract_features(
    X_train: pl.Series,
    X_test: pl.Series,
    max_features: int = 1000
) -> Tuple[Any, Any, TfidfVectorizer]:
    """
    Extract text features with TF-IDF.

    Args:
        X_train: training set texts
        X_test: test set texts
        max_features: maximum number of features

    Returns:
        Training set features, test set features, and the TF-IDF vectorizer
    """
    # Convert the Polars Series to Pandas Series
    X_train_pd = X_train.to_pandas()
    X_test_pd = X_test.to_pandas()

    # Initialize the TF-IDF vectorizer
    tfidf = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",
        ngram_range=(1, 2)
    )

    # Fit and transform the training set
    X_train_tfidf = tfidf.fit_transform(X_train_pd)

    # Transform the test set
    X_test_tfidf = tfidf.transform(X_test_pd)

    return X_train_tfidf, X_test_tfidf, tfidf


def train_logistic_regression(
    X_train: Any,
    y_train: pl.Series
) -> LogisticRegression:
    """
    Train a Logistic Regression model.

    Args:
        X_train: training set features
        y_train: training set labels

    Returns:
        The trained Logistic Regression model
    """
    # Convert the Polars Series to a Pandas Series
    y_train_pd = y_train.to_pandas()

    # Initialize the Logistic Regression model
    log_reg = LogisticRegression(
        random_state=settings.random_state,
        max_iter=1000,
        class_weight="balanced"
    )

    # Train the model
    log_reg.fit(X_train, y_train_pd)

    return log_reg


def train_lightgbm(
    X_train: Any,
    y_train: pl.Series
) -> lgb.LGBMClassifier:
    """
    Train a LightGBM model.

    Args:
        X_train: training set features
        y_train: training set labels

    Returns:
        The trained LightGBM model
    """
    # Convert the Polars Series to a Pandas Series
    y_train_pd = y_train.to_pandas()

    # Initialize the LightGBM model
    lgb_clf = lgb.LGBMClassifier(
        random_state=settings.random_state,
        class_weight="balanced",
        n_estimators=1000,
        learning_rate=0.1,
        num_leaves=31
    )

    # Train the model
    lgb_clf.fit(X_train, y_train_pd)

    return lgb_clf


def evaluate_model(
    model: Any,
    X_test: Any,
    y_test: pl.Series
) -> Dict[str, float]:
    """
    Evaluate model performance.

    Args:
        model: the trained model
        X_test: test set features
        y_test: test set labels

    Returns:
        Model evaluation metrics
    """
    # Convert the Polars Series to a Pandas Series
    y_test_pd = y_test.to_pandas()

    # Predict
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Compute the evaluation metrics
    metrics = {
        "accuracy": accuracy_score(y_test_pd, y_pred),
        "precision": precision_score(y_test_pd, y_pred),
        "recall": recall_score(y_test_pd, y_pred),
        "f1": f1_score(y_test_pd, y_pred)
    }

    # Compute ROC-AUC (if the model supports probability predictions)
    if y_pred_proba is not None:
        metrics["roc_auc"] = roc_auc_score(y_test_pd, y_pred_proba)

    # Print the classification report and the confusion matrix
    print("Classification report:")
    print(classification_report(y_test_pd, y_pred))

    print("Confusion matrix:")
    print(confusion_matrix(y_test_pd, y_pred))

    return metrics


def save_model(
    model: Any,
    model_name: str,
    vectorizer: Any = None
) -> None:
    """
    Save the model and vectorizer.

    Args:
        model: the trained model
        model_name: the model name
        vectorizer: the TF-IDF vectorizer
    """
    # Create the model directory
    model_dir = Path(settings.model_save_path)
    model_dir.mkdir(exist_ok=True)

    # Save the model
    model_path = model_dir / f"{model_name}_model.joblib"
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Save the vectorizer (if provided)
    if vectorizer is not None:
        vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
        joblib.dump(vectorizer, vectorizer_path)
        print(f"Vectorizer saved to: {vectorizer_path}")


def load_model(
    model_name: str
) -> Tuple[Any, Any]:
    """
    Load the model and vectorizer.

    Args:
        model_name: the model name

    Returns:
        The loaded model and vectorizer
    """
    model_dir = Path(settings.model_save_path)

    # Load the model
    model_path = model_dir / f"{model_name}_model.joblib"
    model = joblib.load(model_path)
    print(f"Model loaded from: {model_path}")

    # Load the vectorizer
    vectorizer_path = model_dir / f"{model_name}_vectorizer.joblib"
    vectorizer = joblib.load(vectorizer_path)
    print(f"Vectorizer loaded from: {vectorizer_path}")

    return model, vectorizer


def main():
    """Machine learning entry point."""
    # 1. Load the dataset
    print("Loading the dataset...")
    df = pl.read_csv("../spam.csv", encoding="latin-1", ignore_errors=True)

    # 2. Clean the dataset
    print("Cleaning the dataset...")
    df = df.drop(df.columns[-3:])
    df = df.rename({"v1": "label", "v2": "message"})
    df = df.with_columns(
        pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
    )

    # 3. Separate features and labels
    X = df["message"]
    y = df["label"]

    # 4. Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=settings.test_size, random_state=settings.random_state, stratify=y
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # 5. Feature extraction
    print("Extracting features...")
    X_train_tfidf, X_test_tfidf, tfidf = extract_features(X_train, X_test)

    # 6. Train the Logistic Regression model
    print("\nTraining the Logistic Regression model...")
    log_reg_model = train_logistic_regression(X_train_tfidf, y_train)

    # 7. Evaluate the Logistic Regression model
    print("\nEvaluating the Logistic Regression model:")
    log_reg_metrics = evaluate_model(log_reg_model, X_test_tfidf, y_test)
    print(f"Logistic Regression metrics: {log_reg_metrics}")

    # 8. Train the LightGBM model
    print("\nTraining the LightGBM model...")
    lgb_model = train_lightgbm(X_train_tfidf, y_train)

    # 9. Evaluate the LightGBM model
    print("\nEvaluating the LightGBM model:")
    lgb_metrics = evaluate_model(lgb_model, X_test_tfidf, y_test)
    print(f"LightGBM metrics: {lgb_metrics}")

    # 10. Save the models
    print("\nSaving the models...")
    save_model(log_reg_model, "logistic_regression", tfidf)
    save_model(lgb_model, "lightgbm", tfidf)

    print("\nMachine learning pipeline finished!")


if __name__ == "__main__":
    main()
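A minimal usage sketch of the saved artifacts, assuming main() has already run and written lightgbm_model.joblib and lightgbm_vectorizer.joblib under settings.model_save_path; the sample text is a placeholder:

from machine_learning import load_model

# Reload the persisted model and vectorizer, then score one message
model, vectorizer = load_model("lightgbm")
vec = vectorizer.transform(["Free entry in 2 a wkly comp to win FA Cup final tkts"])
print("spam" if int(model.predict(vec)[0]) == 1 else "ham")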
24 src/main.py Normal file
@@ -0,0 +1,24 @@
from data_processing import load_data, clean_data
from translation import translate_dataset


def main():
    """Entry point."""
    # 1. Load the dataset
    print("Loading the dataset...")
    df = load_data("../spam.csv")

    # 2. Clean the dataset
    print("\nCleaning the dataset...")
    df_cleaned = clean_data(df)

    # 3. Translate only the first 10 messages as a test
    print("\nTranslating the first 10 messages as a test...")
    df_test = df_cleaned.head(10)
    translated_path = translate_dataset(df_test)

    print(f"\nTest finished! The translated test dataset is saved to: {translated_path}")


if __name__ == "__main__":
    main()
150 src/simple_agent.py Normal file
@@ -0,0 +1,150 @@
from typing import Any, Dict, List, Tuple

import requests

from config import settings
from machine_learning import SpamClassifier
from translation import translate_text


class SimpleSpamAnalysis:
    """Simple spam message analysis system."""

    def __init__(self, model_name: str = "lightgbm"):
        """Initialize the analysis system."""
        self.classifier = SpamClassifier(model_name)

    def analyze(self, message: str, is_english: bool = True) -> Dict[str, Any]:
        """Analyze a single message."""
        # 1. Translate the message
        message_zh = translate_text(message, "zh-CN") if is_english else message

        # 2. Classify the message
        classification = self.classifier.classify(message)

        # 3. Extract keywords
        key_words = self.extract_keywords(message)

        # 4. Generate a basic explanation and suggestion
        reason, suggestion = self.generate_explanation(key_words, classification["label"])

        # 5. Generate a more detailed explanation with the DeepSeek API
        detailed_explanation = self.generate_detailed_explanation(
            message, message_zh, classification["label"], key_words
        )

        return {
            "original_message": message,
            "translated_message": message_zh,
            "classification": classification,
            "key_words": key_words,
            "reason": reason,
            "suggestion": suggestion,
            "detailed_explanation": detailed_explanation
        }

    def extract_keywords(self, message: str, top_n: int = 5) -> List[str]:
        """Extract keywords."""
        words = message.lower().split()
        stop_words = set([
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
            "with", "by", "from", "up", "down", "about", "above", "below", "of",
            "is", "are", "was", "were", "be", "been", "being", "have", "has",
            "had", "do", "does", "did", "will", "would", "shall", "should",
            "may", "might", "must", "can", "could", "not", "no", "yes", "if",
            "then", "than", "so", "because", "as", "when", "where", "who", "which",
            "that", "this", "these", "those", "i", "me", "my", "mine", "you",
            "your", "yours", "he", "him", "his", "she", "her", "hers", "it",
            "its", "we", "us", "our", "ours", "they", "them", "their", "theirs"
        ])

        keywords = [word for word in words if word not in stop_words and len(word) > 2]
        return keywords[:top_n]

    def generate_explanation(self, key_words: List[str], label: str) -> Tuple[str, str]:
        """Generate a basic explanation and suggestion."""
        if label == "spam":
            reason = f"The message contains typical spam keywords: {', '.join(key_words)}"
            suggestion = "Delete the message immediately; do not click any links or reply, to avoid being scammed"
        else:
            reason = f"The message is legitimate and contains common vocabulary: {', '.join(key_words)}"
            suggestion = "The message can be replied to and handled normally"
        return reason, suggestion

    def generate_detailed_explanation(self, message: str, message_zh: str, label: str, key_words: List[str]) -> str:
        """Generate a detailed explanation with the DeepSeek API."""
        headers = {
            "Authorization": f"Bearer {settings.deepseek_api_key}",
            "Content-Type": "application/json"
        }

        prompt = f"""
Analyze the following message:
English: {message}
Chinese: {message_zh}
Classification: {label}
Keywords: {', '.join(key_words)}

Please provide:
1. A detailed reason for the classification
2. The main characteristics of the message
3. Concrete advice for handling this message
4. How to recognize similar messages

Answer in Chinese and keep it concise.
"""

        payload = {
            "model": settings.deepseek_model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional spam message analyst. Analyze the given information in detail."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 500,
            "temperature": 0.1
        }

        try:
            response = requests.post(
                f"{settings.deepseek_api_base}/chat/completions",
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()

            result = response.json()
            explanation = result["choices"][0]["message"]["content"].strip()
            return explanation
        except requests.exceptions.RequestException as e:
            print(f"Failed to generate the detailed explanation: {e}")
            return "Could not generate a detailed explanation; check the API connection."


if __name__ == "__main__":
    # Initialize the analysis system
    analyzer = SimpleSpamAnalysis()

    # Test messages
    test_messages = [
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        "Ok lar... Joking wif u oni...",
        "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
    ]

    # Analyze the messages
    for i, message in enumerate(test_messages):
        print(f"\n=== Message analysis result {i+1} ===")
        result = analyzer.analyze(message)

        print(f"Original message: {result['original_message']}")
        print(f"Chinese translation: {result['translated_message']}")
        print(f"Classification: {result['classification']['label']} (confidence: {result['classification']['confidence']:.2f})")
        print(f"Keywords: {', '.join(result['key_words'])}")
        print(f"Reason: {result['reason']}")
        print(f"Suggestion: {result['suggestion']}")
        print(f"Detailed explanation: {result['detailed_explanation']}")
130 src/translation.py Normal file
@@ -0,0 +1,130 @@
import time
from typing import List

import requests

from config import settings


def translate_text(text: str, target_lang: str = "zh-CN") -> str:
    """
    Translate text to the target language with the DeepSeek API.

    Args:
        text: the text to translate
        target_lang: the target language, Chinese (zh-CN) by default

    Returns:
        The translated text
    """
    headers = {
        "Authorization": f"Bearer {settings.deepseek_api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": settings.deepseek_model,
        "messages": [
            {
                "role": "system",
                "content": f"You are a professional translator. Translate the following text to {target_lang}. Keep the original meaning and tone. Do not add any additional information."
            },
            {
                "role": "user",
                "content": text
            }
        ],
        "max_tokens": 1000,
        "temperature": 0.1
    }

    try:
        response = requests.post(
            f"{settings.deepseek_api_base}/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )
        response.raise_for_status()

        result = response.json()
        translated_text = result["choices"][0]["message"]["content"].strip()
        return translated_text
    except requests.exceptions.RequestException as e:
        print(f"Translation failed: {e}")
        return text


def translate_batch(texts: List[str], target_lang: str = "zh-CN", batch_size: int = 10) -> List[str]:
    """
    Translate texts in batches.

    Args:
        texts: the texts to translate
        target_lang: the target language, Chinese (zh-CN) by default
        batch_size: the batch size, 10 by default

    Returns:
        The translated texts
    """
    translated_texts = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_translated = []

        for text in batch:
            translated = translate_text(text, target_lang)
            batch_translated.append(translated)
            # Sleep briefly to avoid API rate limits
            time.sleep(0.5)

        translated_texts.extend(batch_translated)
        print(f"Translated {min(i+batch_size, len(texts))}/{len(texts)} texts")

    return translated_texts


def translate_dataset(df, text_column: str = "message", target_column: str = "message_zh") -> str:
    """
    Translate a text column of a dataset.

    Args:
        df: a Polars DataFrame
        text_column: the column to translate
        target_column: the column for the translated text

    Returns:
        The file path of the translated dataset
    """
    import os

    import polars as pl

    # Create the data directory if it does not exist
    os.makedirs(settings.data_path, exist_ok=True)

    # Extract the texts
    texts = df[text_column].to_list()

    # Translate the texts
    print(f"Translating {len(texts)} texts...")
    translated_texts = translate_batch(texts)

    # Add the translated column to the dataset
    df = df.with_columns(
        pl.Series(target_column, translated_texts)
    )

    # Save the translated dataset
    output_path = f"{settings.data_path}/spam_zh.csv"
    df.write_csv(output_path)

    print(f"The translated dataset is saved to: {output_path}")
    print(f"Translation finished! Translated {len(texts)} texts in total")
    return output_path


if __name__ == "__main__":
    # Test the translation function
    test_text = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
    translated = translate_text(test_text)
    print(f"Source: {test_text}")
    print(f"Translation: {translated}")
31 test_analysis.py Normal file
@@ -0,0 +1,31 @@
import os
import sys

# Add the src directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from simple_agent import SimpleSpamAnalysis


# Test messages
test_messages = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "Ok lar... Joking wif u oni...",
    "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
]

# Initialize the analysis system
analyzer = SimpleSpamAnalysis()

# Analyze the messages
for i, message in enumerate(test_messages):
    print(f"\n=== Message analysis result {i+1} ===")
    result = analyzer.analyze(message)

    print(f"Original message: {result['original_message'][:100]}...")
    print(f"Chinese translation: {result['translated_message'][:100]}...")
    print(f"Classification: {result['classification']['label']} (confidence: {result['classification']['confidence']:.2f})")
    print(f"Keywords: {', '.join(result['key_words'])}")
    print(f"Reason: {result['reason']}")
    print(f"Suggestion: {result['suggestion']}")
    print(f"Detailed explanation: {result['detailed_explanation'][:200]}...")
7 test_translation.py Normal file
@@ -0,0 +1,7 @@
import os
import sys

# Add the src directory to the Python path so that `from config import settings` resolves inside translation.py
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from translation import translate_text


# Test the single-text translation function
test_text = "Hello, how are you?"
print(f"Source: {test_text}")
translated = translate_text(test_text)
print(f"Translation: {translated}")