Feature Selection :: Recursive Feature Elimination (RFE)
Dan-k 2023. 9. 21. 09:00
Recursive Feature Elimination
- A feature-selection method that starts from the full feature set and removes features step by step to find the optimal set and number of features (a backward approach)
- The more features a model uses, the greater the risk of overfitting and the curse of dimensionality, which make it unsuitable for production use, so selecting an appropriate number of key features is important
How RFE works

1. Choose the model you want to train
2. Derive the model's feature importances
3. Remove the feature with the lowest importance
4. Repeat steps 1-3 until the target number of features remains (a minimal sketch of this loop follows below)
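The loop above can be written by hand in a few lines. Below is a minimal sketch of manual backward elimination, assuming a DecisionTreeClassifier supplies the importances and that 5 features are kept (both choices are illustrative, not from the post); sklearn's RFE, used in the next section, implements the same idea.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# illustrative data and target size (assumptions for this sketch)
X, y = make_classification(n_samples=1000, n_features=10, random_state=1)
n_features_to_keep = 5
remaining = list(range(X.shape[1]))  # indices of features still in play

while len(remaining) > n_features_to_keep:
    # steps 1-2: fit the model and get its feature importances
    model = DecisionTreeClassifier(random_state=1).fit(X[:, remaining], y)
    # step 3: drop the feature with the lowest importance
    worst = int(np.argmin(model.feature_importances_))
    remaining.pop(worst)

print("selected feature indices:", sorted(remaining))
```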
Code
```python
# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
    return X, y

# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(2, 10):
        rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        model = DecisionTreeClassifier()
        models[str(i)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
```
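Instead of sweeping `n_features_to_select` by hand as above, scikit-learn's RFECV can choose the number of features automatically via cross-validation. A minimal sketch on the same synthetic dataset:

```python
# let cross-validation pick how many features to keep
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
rfecv = RFECV(estimator=DecisionTreeClassifier(), cv=cv, scoring='accuracy')
rfecv.fit(X, y)
print("optimal number of features:", rfecv.n_features_)
print("feature mask:", rfecv.support_)
```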
```python
# explore the algorithm wrapped by RFE
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
    return X, y

# get a list of models to evaluate
def get_models():
    models = dict()
    # lr
    rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['lr'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # perceptron
    rfe = RFE(estimator=Perceptron(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['per'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # cart
    rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['cart'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # rf
    rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['rf'] = Pipeline(steps=[('s', rfe), ('m', model)])
    # gbm
    rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=5)
    model = DecisionTreeClassifier()
    models['gbm'] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
```
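One caveat: the estimator wrapped by RFE must expose `coef_` or `feature_importances_` (all five wrappers above do). After fitting one of these pipelines, the RFE step's selection can be inspected; a short sketch reusing `models`, `X`, and `y` from the listing above:

```python
# inspect which features the RFE step of one pipeline selected
pipeline = models['lr']
pipeline.fit(X, y)
rfe_step = pipeline.named_steps['s']
print("selected mask:", rfe_step.support_)  # True for the 5 kept features
print("ranking:", rfe_step.ranking_)        # 1 = selected; larger = eliminated earlier
```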
A simpler standalone example:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# generate a synthetic classification dataset
X, y = make_classification(n_samples=100, n_features=10, random_state=42)

# run RFE with a logistic regression model
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)  # number of features to keep (adjust as needed)

# apply RFE to select features
X_rfe = rfe.fit_transform(X, y)

# ranking of each feature (1 = selected)
print("feature ranking:", rfe.ranking_)

# indices of the selected features (X is a NumPy array, so it has no column names)
print("selected feature indices:", np.arange(X.shape[1])[rfe.support_])
```
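Printing feature *names* only makes sense when `X` is a pandas DataFrame with named columns. A hedged sketch, with hypothetical column names (`feat_0` ... `feat_9` are made up for illustration):

```python
# hypothetical: wrap X in a DataFrame to recover selected feature names
import pandas as pd

X_df = pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])])  # illustrative names
rfe_df = RFE(LogisticRegression(), n_features_to_select=5).fit(X_df, y)
print("selected feature names:", list(X_df.columns[rfe_df.support_]))
```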
References
https://machinelearningmastery.com/rfe-feature-selection-in-python/