commit aabcd6a80aec9d0836589b287902759ac8dc6675
Author: 邢可易 <13816688325@163.com>
Date:   Mon Jan 12 16:21:42 2026 +0800

    feat: 数据清洗流水线模块

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..36fbda3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.env
+.venv/
+__pycache__/
+.ipynb_checkpoints/
+.DS_Store
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..a9a2c5b
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+__all__ = []
diff --git a/src/data_pipeline.py b/src/data_pipeline.py
new file mode 100644
index 0000000..7399662
--- /dev/null
+++ b/src/data_pipeline.py
@@ -0,0 +1,84 @@
+"""Data-cleaning and preprocessing helpers for tabular data."""
+
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+
+class CleanTransformer(BaseEstimator, TransformerMixin):
+    """Stateless transformer that removes duplicate rows.
+
+    NOTE(review): ``transform`` can return fewer rows than it receives and
+    never sees ``y``, so rows of ``X`` may stop lining up with ``y`` when
+    this runs as a step of a supervised pipeline -- confirm with callers.
+    """
+
+    def fit(self, X, y=None):
+        """Learn nothing and return ``self``; ``X`` and ``y`` are unused."""
+        return self
+
+    def transform(self, X):
+        """Return ``X.drop_duplicates()``; ``X`` itself is left untouched."""
+        return X.drop_duplicates()
+
+
+def build_preprocess(columns, target):
+    """Build the preprocessing transformer for every non-target column.
+
+    Every entry of ``columns`` other than ``target`` is median-imputed and
+    then standardized; every other column is dropped (``remainder="drop"``).
+
+    Args:
+        columns: Iterable of column names present in the data.
+        target: Name of the label column to leave out.
+
+    Returns:
+        A ``(transformer, feature_columns)`` tuple: the unfitted
+        ``ColumnTransformer`` and the list of columns it is applied to.
+    """
+    num_cols = [c for c in columns if c != target]
+    numeric = Pipeline(
+        steps=[
+            ("imputer", SimpleImputer(strategy="median")),
+            ("scaler", StandardScaler()),
+        ]
+    )
+    ct = ColumnTransformer([("num", numeric, num_cols)], remainder="drop")
+    return ct, num_cols
+
+
+def load_data(path):
+    """Read the CSV file at ``path`` and return it as a ``DataFrame``."""
+    return pd.read_csv(path)
+
+
+def split_Xy(df, target):
+    """Split ``df`` into features and an integer-typed target.
+
+    Args:
+        df: ``DataFrame`` holding both the features and the target column.
+        target: Name of the target column.
+
+    Returns:
+        A ``(X, y)`` tuple: ``df`` without the target column, and the
+        target column cast to ``int``.
+
+    Raises:
+        KeyError: If ``target`` is not a column of ``df``.
+        ValueError: If the target column contains missing values.
+    """
+    X = df.drop(columns=[target])
+    y = df[target]
+    # Fail early, naming the column, instead of letting the integer cast
+    # reject the missing values with a generic conversion error.
+    n_missing = int(y.isna().to_numpy().sum())
+    if n_missing:
+        raise ValueError(
+            f"target column {target!r} has {n_missing} missing value(s); "
+            "cannot cast to int"
+        )
+    return X, y.astype(int)