feat: 数据清洗流水线模块

This commit is contained in:
邢可易 2026-01-12 16:21:42 +08:00
commit aabcd6a80a
3 changed files with 37 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.env
.venv/
__pycache__/
.ipynb_checkpoints/
.DS_Store

1
src/__init__.py Normal file
View File

@ -0,0 +1 @@
# Explicitly empty export list: a star-import of this package binds no names.
__all__ = []

31
src/data_pipeline.py Normal file
View File

@ -0,0 +1,31 @@
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
class CleanTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes exact duplicate rows.

    ``fit`` learns nothing; ``transform`` delegates to the input's
    ``drop_duplicates()`` method, so the input must provide that method
    (for example a pandas DataFrame).

    NOTE(review): ``transform`` can return fewer rows than it received.
    If the caller keeps a separate target vector, confirm it stays aligned
    with the de-duplicated rows.
    """

    def __init__(self):
        """Create the transformer; there are no hyper-parameters."""

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return the result of ``X.drop_duplicates()``; ``X`` is not reassigned in place."""
        return X.drop_duplicates()
def build_preprocess(columns, target):
    """Build the preprocessing transformer for every non-target column.

    Every name in ``columns`` that is not equal to ``target`` is routed
    through a median imputer followed by a standard scaler. Column dtypes
    are not inspected here; the caller is responsible for passing only
    columns this numeric pipeline can handle.

    Args:
        columns: Iterable of column names.
        target: Name of the target column, excluded from the features.

    Returns:
        A ``(transformer, feature_columns)`` tuple: the unfitted
        ``ColumnTransformer`` (columns not listed are dropped) and the list
        of column names it operates on.
    """
    feature_cols = [name for name in columns if name != target]

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    numeric_pipeline = Pipeline(steps=numeric_steps)

    transformer = ColumnTransformer(
        [("num", numeric_pipeline, feature_cols)],
        remainder="drop",
    )
    return transformer, feature_cols
def load_data(path):
    """Read the CSV file at ``path`` with pandas defaults.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(path)
def split_Xy(df, target):
    """Split a DataFrame into a feature frame and an integer target series.

    Args:
        df: Source DataFrame; it is not modified.
        target: Name of the column to use as the target.

    Returns:
        A ``(features, labels)`` tuple: ``df`` without the ``target``
        column, and the ``target`` column cast with ``astype(int)``.
    """
    # Drop first so a missing target surfaces from ``drop`` before the cast.
    features = df.drop(columns=[target])
    labels = df[target].astype(int)
    return features, labels