feat: 数据清洗流水线模块
This commit is contained in:
commit
aabcd6a80a
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
.env
|
||||
.venv/
|
||||
__pycache__/
|
||||
.ipynb_checkpoints/
|
||||
.DS_Store
|
||||
1
src/__init__.py
Normal file
1
src/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# The package exports no public names through ``from src import *``.
__all__ = []
|
||||
31
src/data_pipeline.py
Normal file
31
src/data_pipeline.py
Normal file
@ -0,0 +1,31 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
class CleanTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes duplicate rows from its input.

    ``fit`` learns nothing and ``transform`` delegates to the input's
    ``drop_duplicates`` method, so the input must provide that method
    (e.g. a pandas DataFrame).
    """

    def __init__(self):
        """Take no configuration; the transformer has no parameters."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return ``X`` with duplicate rows removed.

        NOTE(review): the result can have fewer rows than ``X``, so labels
        held separately may no longer line up with it -- confirm callers
        account for this.
        """
        return X.drop_duplicates()
|
||||
|
||||
def build_preprocess(columns, target):
    """Build the preprocessing transformer for every non-target column.

    Each entry of ``columns`` that is not equal to ``target`` is routed
    through a two-step pipeline: median imputation followed by standard
    scaling. No dtype check is performed here, so the caller is responsible
    for passing columns that these steps can handle. Columns not listed are
    dropped by the transformer.

    Args:
        columns: Iterable of column names available in the data.
        target: Name of the label column to exclude from preprocessing.

    Returns:
        A ``(transformer, feature_columns)`` tuple: the unfitted
        ``ColumnTransformer`` and the list of column names it operates on.
    """
    feature_cols = [name for name in columns if name != target]

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    numeric_pipeline = Pipeline(steps=numeric_steps)

    preprocessor = ColumnTransformer(
        [("num", numeric_pipeline, feature_cols)],
        remainder="drop",
    )
    return preprocessor, feature_cols
|
||||
|
||||
def load_data(path):
    """Read a CSV source into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (for example a file path or a file-like object).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)
|
||||
|
||||
def split_Xy(df, target):
    """Separate a DataFrame into a feature frame and an integer label series.

    Args:
        df: Source DataFrame; it is not modified.
        target: Name of the label column.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df`` without the ``target``
        column and ``y`` is the ``target`` column cast with ``astype(int)``.
    """
    features = df.drop(columns=[target])
    labels = df[target].astype(int)
    return features, labels
|
||||
Loading…
Reference in New Issue
Block a user