feat: 数据清洗流水线模块
This commit is contained in:
commit
aabcd6a80a
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
.env
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
.DS_Store
|
||||||
1
src/__init__.py
Normal file
1
src/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
# With an empty __all__, a star-import of this package exports no names.
__all__ = []
|
||||||
31
src/data_pipeline.py
Normal file
31
src/data_pipeline.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.compose import ColumnTransformer
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
class CleanTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes duplicated rows from its input.

    ``fit`` learns nothing and ``transform`` returns the result of
    ``X.drop_duplicates()``.

    NOTE(review): ``transform`` can return fewer rows than it received;
    confirm that callers keep any separately-held target aligned with the
    de-duplicated rows.
    """

    def __init__(self):
        """Create the transformer; it takes no hyper-parameters."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return ``X`` with duplicated rows removed.

        Assumes ``X`` exposes ``drop_duplicates`` (e.g. a pandas DataFrame)
        -- TODO confirm against callers.
        """
        return X.drop_duplicates()
|
||||||
|
|
||||||
|
def build_preprocess(columns, target):
    """Build the preprocessing ``ColumnTransformer`` for the feature columns.

    Every entry of ``columns`` that is not equal to ``target`` is routed
    through a median imputer followed by a standard scaler. Columns not
    selected are dropped by the transformer (``remainder="drop"``). No dtype
    check is performed here; all non-target columns are treated as numeric.

    Args:
        columns: Iterable of column names.
        target: Name of the target column to exclude from the features.

    Returns:
        A ``(transformer, feature_columns)`` tuple, where ``feature_columns``
        is the list of column names handed to the numeric pipeline.
    """
    feature_columns = [name for name in columns if name != target]
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    numeric_pipeline = Pipeline(steps=numeric_steps)
    transformer = ColumnTransformer(
        [("num", numeric_pipeline, feature_columns)],
        remainder="drop",
    )
    return transformer, feature_columns
|
||||||
|
|
||||||
|
def load_data(path):
    """Read the CSV at ``path`` with ``pandas.read_csv`` default settings.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path)
|
||||||
|
|
||||||
|
def split_Xy(df, target):
    """Split ``df`` into a feature frame and an integer target series.

    Args:
        df: DataFrame that contains the ``target`` column.
        target: Name of the column to separate out as the label.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` without the ``target`` column and
        ``y`` is the ``target`` column cast with ``astype(int)``.
    """
    # The tuple is evaluated left to right: drop the column first, then cast.
    return df.drop(columns=[target]), df[target].astype(int)
|
||||||
Loading…
Reference in New Issue
Block a user