chore: 初始化项目结构与基线模型

2026-01-12 15:43:32 +08:00 · 2026-01-12 15:43:32 +08:00 · b077bae62d
commit b077bae62d
17 changed files with 287552 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1 @@
 DEEPSEEK_API_KEY=your-key-here
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
 .env
 .venv/
 __pycache__/
 .ipynb_checkpoints/
 .DS_Store
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
 3.12
--- a/.~完成记录.docx
+++ b/.~完成记录.docx
--- a/.~课设要求.docx
+++ b/.~课设要求.docx
--- a/README.md
+++ b/README.md
--- a/creditcard.csv
+++ b/creditcard.csv
--- a/main.py
+++ b/main.py
@ -0,0 +1,6 @@
 def main():
    """Print the project greeting to standard output."""
    greeting = "Hello from ml-course-project!"
    print(greeting)
 if __name__ == "__main__":
    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,17 @@
 [project]
 name = "ml-course-project"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "jupyter>=1.1.1",
    "matplotlib>=3.10.8",
    "numpy>=2.4.1",
    "openai>=2.15.0",
    "pandas>=2.3.3",
    "python-dotenv>=1.2.1",
    "scikit-learn>=1.8.0",
    "seaborn>=0.13.2",
    "streamlit>=1.52.2",
 ]
--- a/src/init.py
+++ b/src/init.py
@ -0,0 +1 @@
 __all__ = []
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@ -0,0 +1,31 @@
 import pandas as pd
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 class CleanTransformer(BaseEstimator, TransformerMixin):
    """Stateless cleaning step that removes duplicate rows from its input.
    NOTE(review): ``transform`` can return fewer rows than it receives, so
    it presumably must run before features and labels are separated.
    """
    def __init__(self):
        # There are no hyperparameters to store.
        pass
    def fit(self, X, y=None):
        """Learn nothing; return this instance unchanged."""
        return self
    def transform(self, X):
        """Return the result of ``X.drop_duplicates()``.
        Assumes ``X`` is a pandas object exposing ``drop_duplicates``.
        """
        deduplicated = X.drop_duplicates()
        return deduplicated
 def build_preprocess(columns, target):
    """Build the preprocessing transformer for every non-target column.
    Each column in ``columns`` other than ``target`` is routed through
    median imputation followed by standard scaling; any other column is
    dropped by the transformer.
    Returns a ``(ColumnTransformer, feature_names)`` tuple.
    """
    feature_names = [name for name in columns if name != target]
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    numeric_pipeline = Pipeline(steps=numeric_steps)
    transformer = ColumnTransformer(
        [("num", numeric_pipeline, feature_names)],
        remainder="drop",
    )
    return transformer, feature_names
 def load_data(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` defaults and return it."""
    return pd.read_csv(path)
 def split_Xy(df, target):
    """Split ``df`` into a feature frame and an integer label series.
    Returns ``(features, labels)``: ``features`` is ``df`` without the
    ``target`` column, and ``labels`` is that column cast to ``int``.
    """
    features = df.drop(columns=[target])
    labels = df[target].astype(int)
    return features, labels
--- a/src/streamlit_app.py
+++ b/src/streamlit_app.py
@ -0,0 +1,53 @@
 import streamlit as st
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
 # Load variables from a local .env file into the process environment.
 load_dotenv()
 # Read the DeepSeek API key from the environment.
 api_key = os.environ.get("DEEPSEEK_API_KEY")
 st.title("DeepSeek Chat Demo")
 # Refuse to continue when the key is unset, empty, or still the placeholder.
 key_is_placeholder = api_key == "your-key-here"
 if key_is_placeholder or not api_key:
    st.error("请在 .env 文件中配置 DEEPSEEK_API_KEY")
    st.stop()
 # DeepSeek is reached through the OpenAI client pointed at its base URL.
 client = OpenAI(base_url="https://api.deepseek.com", api_key=api_key)
 # Create the chat history when it is not yet present in session state.
 if "messages" not in st.session_state:
    st.session_state.messages = []
 # Render the stored conversation.
 for message in st.session_state.messages:
    role, content = message["role"], message["content"]
    with st.chat_message(role):
        st.markdown(content)
 # Handle a newly submitted prompt, if there is one.
 prompt = st.chat_input("What is up?")
 if prompt:
    # Record the user's message in the history and display it.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    # Request a streamed completion over the whole history and render it.
    with st.chat_message("assistant"):
        history = [
            {"role": m["role"], "content": m["content"]}
            for m in st.session_state.messages
        ]
        stream = client.chat.completions.create(
            model="deepseek-chat",
            messages=history,
            stream=True,
        )
        response = st.write_stream(stream)
    # Append the assistant's reply to the history.
    st.session_state.messages.append({"role": "assistant", "content": response})
--- a/src/train_baseline.py
+++ b/src/train_baseline.py
@ -0,0 +1,28 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report, roc_auc_score
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import Pipeline
 from src.data_pipeline import load_data, CleanTransformer, build_preprocess, split_Xy
 def main(data_path="creditcard.csv", target="Class"):
    """Train and evaluate the logistic-regression baseline.
    Loads the CSV, drops duplicate rows, makes a stratified 80/20
    train/test split, fits preprocessing plus a class-balanced logistic
    regression, and prints the classification report and ROC-AUC for the
    test split.
    Args:
        data_path: CSV file to load. Defaults to ``"creditcard.csv"``, the
            path this script originally hard-coded.
        target: Name of the label column. Defaults to ``"Class"``.
    """
    df = load_data(data_path)
    # De-duplicate the full frame (features and label together) before
    # splitting, so features and labels stay aligned.
    df = CleanTransformer().transform(df)
    # Only the transformer is needed here; the feature-name list is unused.
    ct, _ = build_preprocess(df.columns.tolist(), target)
    X, y = split_Xy(df, target)
    # Stratify on the label so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    clf = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=1)
    pipe = Pipeline(steps=[("preprocess", ct), ("clf", clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Column 1 of predict_proba is the score passed to ROC-AUC.
    y_proba = pipe.predict_proba(X_test)[:, 1]
    report = classification_report(y_test, y_pred, digits=4)
    auc = roc_auc_score(y_test, y_proba)
    print("Classification Report")
    print(report)
    print("ROC-AUC", round(auc, 4))
 if __name__ == "__main__":
    main()
--- a/uv.lock
+++ b/uv.lock
--- a/uv.toml
+++ b/uv.toml
@ -0,0 +1,3 @@
 [[index]]
 url = "https://mirrors.aliyun.com/pypi/simple/"
 default = true
--- a/完成记录.docx
+++ b/完成记录.docx
--- a/课设要求.docx
+++ b/课设要求.docx
		`@ -0,0 +1 @@`
							`DEEPSEEK_API_KEY=your-key-here`