我试图了解如何获得已分解为虚拟变量的分类变量的功能重要性。我正在使用scikit-learn,它不像R或h2o那样为您处理分类变量。
如果将分类变量分解为虚拟变量,则该变量中每个类的功能重要性都不同。
我的问题是,将这些虚拟变量的重要性通过简单地求和重新组合为分类变量的重要性值是否有意义?
从《统计学习的要素》第368页开始:
变量的平方相对重要性是在所有的内部节点,例如平方改进它为之选择作为分割变量的总和
这使我认为,由于重要性值已经通过在每个节点上选择一个变量的总和来创建,因此我应该能够组合虚拟变量的变量重要性值以“恢复”分类变量的重要性。当然,我不希望它是完全正确的,但是无论如何这些值实际上都是准确的值,因为它们是通过随机过程找到的。
我已经编写了以下python代码(以jupyter格式)作为调查:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier
import re
#%matplotlib inline
from IPython.display import HTML
from IPython.display import set_matplotlib_formats
plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 14
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
# Get some data, I could not easily find a free data set with actual categorical variables, so I just created some from continuous variables
data = load_diabetes()
df = pd.DataFrame(data.data, columns=[data.feature_names])
df = df.assign(target=pd.Series(data.target))
# Functions to plot the variable importances
def autolabel(rects, ax):
"""
Attach a text label above each bar displaying its height
"""
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2.,
1.05*height,
f'{round(height,3)}',
ha='center',
va='bottom')
def plot_feature_importance(X,y,dummy_prefixes=None, ax=None, feats_to_highlight=None):
# Find the feature importances by fitting a random forest
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X,y)
importances_dummy = forest.feature_importances_
# If there are specified dummy variables, combing them into a single categorical
# variable by summing the importances. This code assumes the dummy variables were
# created using pandas get_dummies() method names the dummy variables as
# featurename_categoryvalue
if dummy_prefixes is None:
importances_categorical = importances_dummy
labels = X.columns
else:
dummy_idx = np.repeat(False,len(X.columns))
importances_categorical = []
labels = []
for feat in dummy_prefixes:
feat_idx = np.array([re.match(f'^{feat}_', col) is not None for col in X.columns])
importances_categorical = np.append(importances_categorical,
sum(importances_dummy[feat_idx]))
labels = np.append(labels,feat)
dummy_idx = dummy_idx | feat_idx
importances_categorical = np.concatenate((importances_dummy[~dummy_idx],
importances_categorical))
labels = np.concatenate((X.columns[~dummy_idx], labels))
importances_categorical /= max(importances_categorical)
indices = np.argsort(importances_categorical)[::-1]
# Plotting
if ax is None:
fig, ax = plt.subplots()
plt.title("Feature importances")
rects = ax.bar(range(len(importances_categorical)),
importances_categorical[indices],
tick_label=labels[indices],
align="center")
autolabel(rects, ax)
if feats_to_highlight is not None:
highlight = [feat in feats_to_highlight for feat in labels[indices]]
rects2 = ax.bar(range(len(importances_categorical)),
importances_categorical[indices]*highlight,
tick_label=labels[indices],
color='r',
align="center")
rects = [rects,rects2]
plt.xlim([-0.6, len(importances_categorical)-0.4])
ax.set_ylim((0, 1.125))
return rects
# Create importance plots leaving everything as categorical variables. I'm highlighting bmi and age as I will convert those into categorical variables later
X = df.drop('target',axis=1)
y = df['target'] > 140.5
plot_feature_importance(X,y, feats_to_highlight=['bmi', 'age'])
plt.title('Feature importance with bmi and age left as continuous variables')
#Create an animation of what happens to variable importance when I split bmi and age into n (n equals 2 - 25) different classes
# %%capture
fig, ax = plt.subplots()
def animate(i):
ax.clear()
# Split one of the continuous variables up into a categorical variable with i balanced classes
X_test = X.copy()
n_categories = i+2
X_test['bmi'] = pd.cut(X_test['bmi'],
np.percentile(X['bmi'], np.linspace(0,100,n_categories+1)),
labels=[chr(num+65) for num in range(n_categories)])
X_test['age'] = pd.cut(X_test['age'],
np.percentile(X['age'], np.linspace(0,100,n_categories+1)),
labels=[chr(num+65) for num in range(n_categories)])
X_test = pd.get_dummies(X_test, drop_first=True)
# Plot the feature importances
rects = plot_feature_importance(X_test,y,dummy_prefixes=['bmi', 'age'],ax=ax, feats_to_highlight=['bmi', 'age'])
plt.title(f'Feature importances for {n_categories} bmi and age categories')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
return [rects,]
anim = animation.FuncAnimation(fig, animate, frames=24, interval=1000)
HTML(anim.to_html5_video())
以下是一些结果:
我们可以观察到,变量的重要性主要取决于类别的数量,这使我对这些图表的实用性产生了普遍的质疑。尤其是age
要获得比连续不断更高的价值更高的价值。
最后,是一个示例,如果我将其保留为伪变量(仅bmi):
# Split one of the continuous variables up into a categorical variable with i balanced classes
X_test = X.copy()
n_categories = 5
X_test['bmi'] = pd.cut(X_test['bmi'],
np.percentile(X['bmi'], np.linspace(0,100,n_categories+1)),
labels=[chr(num+65) for num in range(n_categories)])
X_test = pd.get_dummies(X_test, drop_first=True)
# Plot the feature importances
rects = plot_feature_importance(X_test,y, feats_to_highlight=['bmi_B','bmi_C','bmi_D', 'bmi_E'])
plt.title(f"Feature importances for {n_categories} bmi categories")