可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I would like to get the feature names of a data set after it has been transformed by SKLearn OneHotEncoder.
The discussion of the active_features_ attribute in OneHotEncoder gives a very good explanation of how the attributes n_values_, feature_indices_ and active_features_ get filled after transform() was executed.
My question is:
For e.g. DataFrame based input data:
data = pd.DataFrame({"a": [0, 1, 2,0], "b": [0,1,4, 5], "c":[0,1,4, 5]}).as_matrix()
What does the code look like to get from the original feature names a, b and c to a list of the transformed feature names
(e.g. a-0, a-1, a-2, b-0, b-1, b-2, b-3, c-0, c-1, c-2, c-3
or a-0, a-1, a-2, b-0, b-1, b-2, b-3, b-4, b-5, b-6, b-7, b-8,
or anything that helps to see the assignment of encoded columns to the original columns)?
Background: I would like to see the feature importances of some of the algorithms to get a feeling for which features have the most effect on the algorithm used.
回答1:
You can use pd.get_dummies():
pd.get_dummies(data["a"],prefix="a")
will give you:
a_0 a_1 a_2
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
which automatically generates the column names. You can apply this to all your columns and then get the column names. No need to convert them to a numpy matrix.
So with:
df = pd.DataFrame({"a": [0, 1, 2,0], "b": [0,1,4, 5], "c":[0,1,4, 5]})
data = df.as_matrix()
the solution looks like:
columns = df.columns
my_result = pd.DataFrame()
temp = pd.DataFrame()
for runner in columns:
    temp = pd.get_dummies(df[runner], prefix=runner)
    my_result[temp.columns] = temp
print(my_result.columns)
>>Index(['a_0', 'a_1', 'a_2', 'b_0', 'b_1', 'b_4', 'b_5', 'c_0', 'c_1', 'c_4',
'c_5'],
dtype='object')
回答2:
If I understand correctly, you can use feature_indices_ to identify which columns correspond to which feature.
e.g.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
data = pd.DataFrame({"a": [0, 1, 2,0], "b": [0,1,4, 5], "c":[0,1,4, 5]}).as_matrix()
ohe = OneHotEncoder(sparse=False)
ohe_fitted = ohe.fit_transform(data)
print(ohe_fitted)
print(ohe.feature_indices_) # [ 0 3 9 15]
From the above feature_indices_ we know that if we sliced the OneHotEncoded data from 0:3 we would get the features corresponding to the first column in data, like so:
print(ohe_fitted[:,0:3])
Each column in the sliced data represents a value in the first feature. The first column is 0, the second 1 and the third column is 2. To illustrate this on the sliced data, the column labels would look like:
a_0 a_1 a_2
[[ 1. 0. 0.]
[ 0. 1. 0.]
[ 0. 0. 1.]
[ 1. 0. 0.]]
Note that features are sorted first before they are encoded.
回答3:
You can do that with the open source package feature-engine:
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
# load titanic data from openML
data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
# divide into train and test
X_train, X_test, y_train, y_test = train_test_split(
data[['sex', 'embarked']], # predictors for this example
data['survived'], # target
test_size=0.3, # percentage of obs in test set
random_state=0) # seed to ensure reproducibility
ohe_enc = OneHotCategoricalEncoder(
top_categories=None,
variables=['sex', 'embarked'],
drop_last=True)
ohe_enc.fit(X_train)
X_train = ohe_enc.transform(X_train)
X_test = ohe_enc.transform(X_test)
X_train.head()
You should see this output returned:
sex_female embarked_S embarked_C embarked_Q
501 1 1 0 0
588 1 1 0 0
402 1 0 1 0
1193 0 0 0 1
686 1 0 0 1
More details about feature engine here:
https://www.trainindata.com/feature-engine
https://github.com/solegalli/feature_engine
https://feature-engine.readthedocs.io/en/latest/
回答4:
There is a OneHotEncoder that does all the work for you.
Package sksurv has a OneHotEncoder that will return a pandas DataFrame with all the column names set up for you. Check it out. Make sure you set up a separate environment to play with the encoder to ensure it doesn't break your current environment. This encoder saved me a lot of time and effort.
scikit-survival GitHub
OneHotEncoder Documentation
回答5:
OneHotEncoder now has a method get_feature_names (renamed get_feature_names_out in scikit-learn 1.0 and later). You can use input_features=data.columns to match to the training data.