chore: 初始化项目结构与基线模型

2026-01-12 15:43:32 +08:00 · 2026-01-12 15:43:32 +08:00 · b077bae62d
commit b077bae62d
17 changed files with 287552 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1 @@
+DEEPSEEK_API_KEY=your-key-here
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+.env
+.venv/
+__pycache__/
+.ipynb_checkpoints/
+.DS_Store
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
+3.12
--- a/.~完成记录.docx
+++ b/.~完成记录.docx
--- a/.~课设要求.docx
+++ b/.~课设要求.docx
--- a/README.md
+++ b/README.md
--- a/creditcard.csv
+++ b/creditcard.csv
--- a/main.py
+++ b/main.py
@ -0,0 +1,6 @@
+def main():
+    """Print the project's greeting to standard output."""
+    greeting = "Hello from ml-course-project!"
+    print(greeting)
+
+
+if __name__ == "__main__":
+    # Run the greeting only when executed as a script, not on import.
+    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,17 @@
+[project]
+name = "ml-course-project"
+version = "0.1.0"
+description = "ML course project: baseline classifier on creditcard.csv and a DeepSeek chat demo"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "jupyter>=1.1.1",
+    "matplotlib>=3.10.8",
+    "numpy>=2.4.1",
+    "openai>=2.15.0",
+    "pandas>=2.3.3",
+    "python-dotenv>=1.2.1",
+    "scikit-learn>=1.8.0",
+    "seaborn>=0.13.2",
+    "streamlit>=1.52.2",
+]
--- a/src/__init__.py
+++ b/src/__init__.py
@ -0,0 +1 @@
+__all__ = []
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@ -0,0 +1,31 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+class CleanTransformer(BaseEstimator, TransformerMixin):
+    """Stateless transformer that removes duplicate rows from its input.
+
+    NOTE(review): ``transform`` can return fewer rows than it receives and
+    only ever sees ``X``; labels held separately are not filtered here.
+    """
+
+    def __init__(self):
+        """Create the transformer; it takes no parameters."""
+
+    def fit(self, X, y=None):
+        """Learn nothing and return ``self`` unchanged."""
+        return self
+
+    def transform(self, X):
+        """Return ``X`` with duplicate rows dropped.
+
+        Assumes ``X`` exposes ``drop_duplicates()`` (e.g. a pandas
+        DataFrame); the input object itself is not modified.
+        """
+        return X.drop_duplicates()
+
+def build_preprocess(columns, target):
+    """Build the numeric preprocessing step for every non-target column.
+
+    Args:
+        columns: Iterable of all column names, target included.
+        target: Name of the label column to exclude from the features.
+
+    Returns:
+        A ``(transformer, feature_columns)`` tuple: a ``ColumnTransformer``
+        that median-imputes and then standard-scales the feature columns
+        (dropping any other column), and the list of those feature columns.
+    """
+    feature_cols = [name for name in columns if name != target]
+    numeric_steps = [
+        ("imputer", SimpleImputer(strategy="median")),
+        ("scaler", StandardScaler()),
+    ]
+    numeric_pipeline = Pipeline(steps=numeric_steps)
+    preprocessor = ColumnTransformer(
+        [("num", numeric_pipeline, feature_cols)],
+        remainder="drop",
+    )
+    return preprocessor, feature_cols
+
+def load_data(path):
+    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
+    return pd.read_csv(path)
+
+def split_Xy(df, target):
+    """Split a DataFrame into a feature frame and an integer label series.
+
+    Args:
+        df: DataFrame containing the feature columns and the ``target`` column.
+        target: Name of the label column.
+
+    Returns:
+        ``(features, labels)``: ``df`` without the ``target`` column, and the
+        ``target`` column cast to ``int``. ``df`` itself is left untouched.
+    """
+    features = df.drop(columns=[target])
+    labels = df[target].astype(int)
+    return features, labels
--- a/src/streamlit_app.py
+++ b/src/streamlit_app.py
@ -0,0 +1,53 @@
+import streamlit as st
+import os
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# Load environment variables (the error below points users at the .env file).
+load_dotenv()
+
+# Read the API key from the environment.
+api_key = os.getenv("DEEPSEEK_API_KEY")
+
+st.title("DeepSeek Chat Demo")
+
+# Refuse to continue when the key is missing, empty, or still the
+# "your-key-here" placeholder value.
+if not api_key or api_key == "your-key-here":
+    st.error("请在 .env 文件中配置 DEEPSEEK_API_KEY")
+    st.stop()
+
+# Initialize the DeepSeek client (OpenAI SDK pointed at the DeepSeek base URL).
+client = OpenAI(
+    api_key=api_key,
+    base_url="https://api.deepseek.com"
+)
+
+# Initialize the chat history on first use of this session.
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Render the chat history accumulated so far.
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Accept user input.
+if prompt := st.chat_input("What is up?"):
+    # Append the user message to the history and display it.
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+
+    # Request a streamed reply, sending the whole history as context.
+    with st.chat_message("assistant"):
+        stream = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=[
+                {"role": m["role"], "content": m["content"]}
+                for m in st.session_state.messages
+            ],
+            stream=True,
+        )
+        response = st.write_stream(stream)
+    
+    # Append the assistant message (the value returned by st.write_stream) to the history.
+    st.session_state.messages.append({"role": "assistant", "content": response})
--- a/src/train_baseline.py
+++ b/src/train_baseline.py
@ -0,0 +1,28 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, roc_auc_score
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from src.data_pipeline import load_data, CleanTransformer, build_preprocess, split_Xy
+
+def main():
+    """Train and evaluate the logistic-regression baseline on ``creditcard.csv``.
+
+    Reads ``creditcard.csv`` (path relative to the current working
+    directory), drops duplicate rows, holds out a stratified 20% test split,
+    fits a preprocessing + logistic-regression pipeline, and prints the
+    classification report and the ROC-AUC of the held-out predictions.
+    """
+    target = "Class"
+    raw = load_data("creditcard.csv")
+    # Deduplicate the full frame (features and label together) before the
+    # feature/label split, so both stay row-aligned.
+    data = CleanTransformer().transform(raw)
+    preprocessor, _ = build_preprocess(data.columns.tolist(), target)
+    X, y = split_Xy(data, target)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=0.2,
+        random_state=42,
+        stratify=y,
+    )
+    classifier = LogisticRegression(
+        max_iter=1000,
+        class_weight="balanced",
+        n_jobs=1,
+    )
+    model = Pipeline(steps=[("preprocess", preprocessor), ("clf", classifier)])
+    model.fit(X_train, y_train)
+    predictions = model.predict(X_test)
+    # Column 1 of predict_proba is the score used for ROC-AUC.
+    positive_scores = model.predict_proba(X_test)[:, 1]
+    # Compute both metrics before printing anything.
+    summary = classification_report(y_test, predictions, digits=4)
+    auc_value = roc_auc_score(y_test, positive_scores)
+    print("Classification Report")
+    print(summary)
+    print("ROC-AUC", round(auc_value, 4))
+
+
+if __name__ == "__main__":
+    # Run training only when executed as a script, not on import.
+    main()
--- a/uv.lock
+++ b/uv.lock
--- a/uv.toml
+++ b/uv.toml
@ -0,0 +1,3 @@
+[[index]]
+url = "https://mirrors.aliyun.com/pypi/simple/"
+default = true
--- a/完成记录.docx
+++ b/完成记录.docx
--- a/课设要求.docx
+++ b/课设要求.docx
				`@ -0,0 +1 @@`
				`DEEPSEEK_API_KEY=your-key-here`