feat: 数据清洗流水线模块

2026-01-12 16:21:42 +08:00 · 2026-01-12 16:21:42 +08:00 · aabcd6a80a
commit aabcd6a80a
3 changed files with 37 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+.env
+.venv/
+__pycache__/
+.ipynb_checkpoints/
+.DS_Store
--- a/src/init.py
+++ b/src/init.py
@ -0,0 +1 @@
+# Empty export list: a star-import of this module brings in no names.
+__all__ = []
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@ -0,0 +1,31 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+class CleanTransformer(BaseEstimator, TransformerMixin):
+    """Stateless transformer that removes duplicate rows from its input.
+
+    NOTE(review): ``transform`` can return fewer rows than it received. If this
+    step is used where a separate target vector is carried alongside ``X``, the
+    two may no longer line up -- confirm how callers use it.
+    """
+
+    def __init__(self):
+        """Create the transformer; it takes no hyperparameters."""
+
+    def fit(self, X, y=None):
+        """Learn nothing and return ``self`` so calls can be chained.
+
+        Args:
+            X: Input data (ignored).
+            y: Optional target (ignored).
+
+        Returns:
+            This transformer instance.
+        """
+        return self
+
+    def transform(self, X):
+        """Return ``X`` with duplicate rows removed.
+
+        Args:
+            X: Object exposing ``drop_duplicates()``; presumably a pandas
+                DataFrame -- TODO confirm against callers.
+
+        Returns:
+            The result of ``X.drop_duplicates()``.
+        """
+        return X.drop_duplicates()
+
+def build_preprocess(columns, target):
+    """Build an unfitted preprocessor for every column except ``target``.
+
+    Each selected column is median-imputed and then standardized; columns not
+    selected are dropped by the transformer.
+
+    Args:
+        columns: Iterable of column names to consider.
+        target: Column name to exclude from the feature list.
+
+    Returns:
+        A ``(transformer, feature_names)`` tuple: the ``ColumnTransformer`` and
+        the list of column names it is applied to.
+    """
+    # All non-target columns go through the numeric branch; no dtype check is
+    # made here, so callers are expected to pass numeric columns only.
+    feature_names = list(filter(lambda name: name != target, columns))
+    steps = [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
+    numeric_branch = Pipeline(steps=steps)
+    transformer = ColumnTransformer(
+        [("num", numeric_branch, feature_names)],
+        remainder="drop",
+    )
+    return transformer, feature_names
+
+def load_data(path):
+    """Read a CSV into a DataFrame using pandas' default parsing options.
+
+    Args:
+        path: Anything ``pd.read_csv`` accepts as its first argument.
+
+    Returns:
+        The parsed ``pandas.DataFrame``.
+    """
+    return pd.read_csv(path)
+
+def split_Xy(df, target):
+    """Separate a DataFrame into a feature frame and an integer target.
+
+    Args:
+        df: Source DataFrame containing the ``target`` column.
+        target: Name of the column to split off.
+
+    Returns:
+        A ``(X, y)`` tuple: ``df`` without the ``target`` column, and that
+        column cast with ``astype(int)``.
+    """
+    features = df.drop(columns=[target])
+    labels = df[target].astype(int)
+    return features, labels