
Feature Selection :: Recursive Feature Elimination (RFE)

Dan-k 2023. 9. 21. 09:00

Recursive Feature Elimination

- A feature selection method that starts from the full feature set and progressively eliminates features to find both the optimal number of features and which features to keep (a backward approach)

- Using too many features leads to overfitting, the curse of dimensionality, and other problems that make a model unsuitable for a production service, so it is important to select an appropriate number of core features

 

The RFE Procedure

  1. Choose the model you want to train
  2. Compute the model's feature importances
  3. Remove features one at a time, starting with the lowest feature importance
  4. Repeat steps 1-3 (a minimal sketch of this loop follows below)
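
Below is a minimal, illustrative sketch of the backward-elimination loop described above, assuming a DecisionTreeClassifier as the scoring model and a hypothetical target of 5 surviving features; in practice scikit-learn's RFE (used in the Code section) automates this loop for you.

# illustrative sketch of the backward-elimination loop above
# (sklearn's RFE automates exactly this)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=10, random_state=1)
remaining = list(range(X.shape[1]))  # indices of surviving features
n_features_to_keep = 5               # hypothetical target count

while len(remaining) > n_features_to_keep:
    # 1. fit the chosen model on the surviving features
    model = DecisionTreeClassifier(random_state=1).fit(X[:, remaining], y)
    # 2. read off the model's feature importances
    importances = model.feature_importances_
    # 3. drop the feature with the lowest importance
    remaining.pop(int(np.argmin(importances)))
    # 4. the while loop repeats steps 1-3 until the target count remains

print("Selected feature indices:", remaining)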

Code

# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
    return X, y

# get a list of models to evaluate, one per candidate number of features
def get_models():
    models = dict()
    for i in range(2, 10):
        rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        model = DecisionTreeClassifier()
        models[str(i)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

# evaluate a given model using repeated stratified k-fold cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
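
The search over the number of features in the script above can also be delegated to scikit-learn's RFECV, which picks the feature count by cross-validation (the scikit-learn link in the References demonstrates this pattern). A minimal sketch under the same dataset settings:

# let RFECV choose the number of features via cross-validation
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
rfecv = RFECV(estimator=DecisionTreeClassifier(), min_features_to_select=2, cv=cv, scoring='accuracy')
rfecv.fit(X, y)
print('optimal number of features:', rfecv.n_features_)
print('selected feature mask:', rfecv.support_)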
# explore the algorithm wrapped by RFE
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
    return X, y

# get a list of models to evaluate, each wrapping a different estimator in RFE
def get_models():
    models = dict()
    # lr
    rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['lr'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # perceptron
    rfe = RFE(estimator=Perceptron(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['per'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # cart
    rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['cart'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # rf
    rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['rf'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # gbm
    rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['gbm'] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

# evaluate a given model using repeated stratified k-fold cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
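
After fitting any of these pipelines, the RFE step can be pulled out via the pipeline's named_steps to see which features it kept; a brief sketch, assuming the script above has just run and using its 'cart' pipeline:

# inspect the features the RFE step selected inside a fitted pipeline
pipeline = models['cart']
pipeline.fit(X, y)
rfe_step = pipeline.named_steps['s']
print('selected feature mask:', rfe_step.support_)  # True = kept
print('feature ranking:', rfe_step.ranking_)        # 1 = selected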
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# generate a synthetic classification dataset
X, y = make_classification(n_samples=100, n_features=10, random_state=42)

# run RFE with a logistic regression model
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)  # keep 5 features (change as desired)

# apply RFE to select features
X_rfe = rfe.fit_transform(X, y)

# check the ranking of each feature (1 = selected)
print("Feature ranking:", rfe.ranking_)

# check the indices of the selected features (X is a NumPy array, so use column indices)
print("Selected feature indices:", np.arange(X.shape[1])[rfe.support_])

References

https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py

https://machinelearningmastery.com/rfe-feature-selection-in-python/
