Impute categorical missing values in scikit-learn

Posted 2019-01-16 01:47

I've got a pandas DataFrame with some columns of text type, and some of these text columns contain NaN values. What I'm trying to do is impute those NaNs with sklearn.preprocessing.Imputer (replacing NaN with the most frequent value). The problem is the implementation. Suppose there is a pandas DataFrame df with 30 columns, 10 of which are categorical. Once I run:

from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(df) 

Python generates an error: 'could not convert string to float: 'run1'', where 'run1' is an ordinary (non-missing) value from the first column with categorical data.

Any help would be very welcome.
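
A minimal sketch that reproduces the error (the frame and column names here are made up for illustration):

import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer

# one categorical column and one numeric column, both containing NaNs
df = pd.DataFrame({'run_id': ['run1', np.nan, 'run1', 'run2'],
                   'speed':  [3.0, 4.0, np.nan, 3.0]})

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(df)  # raises ValueError: could not convert string to float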

8 Answers
萌系小妹纸
#2 · 2019-01-16 01:54
  • With scikit-learn's Imputer, strategy='most_frequent' can be used only on quantitative features, not on qualitative ones. The custom imputer below handles both. Also, the scikit-learn Imputer can either be applied to the whole data frame (if all features are quantitative) or used in a for loop over a list of columns of the same type (see the example below), whereas the custom imputer works with any combination of column types.

        from sklearn.preprocessing import Imputer
        impute = Imputer(strategy='mean')
        for cols in ['quantitative_column', 'quant']:  # both columns are quantitative features
            xx[cols] = impute.fit_transform(xx[[cols]])
    
  • Custom imputer:

       import numpy as np
       import pandas as pd
       from sklearn.preprocessing import Imputer
       from sklearn.base import TransformerMixin

       class CustomImputer(TransformerMixin):
           def __init__(self, cols=None, strategy='mean'):
               self.cols = cols
               self.strategy = strategy

           def transform(self, df):
               X = df.copy()
               impute = Imputer(strategy=self.strategy)
               if self.cols is None:
                   self.cols = list(X.columns)
               for col in self.cols:
                   if X[col].dtype == np.dtype('O'):
                       # object (categorical) column: fill with the most frequent value
                       X[col].fillna(X[col].value_counts().index[0], inplace=True)
                   else:
                       # numeric column: delegate to the scikit-learn Imputer
                       X[col] = impute.fit_transform(X[[col]])
               return X

           def fit(self, *_):
               return self
    
  • Dataframe:

          X = pd.DataFrame({'city': ['tokyo', np.NaN, 'london', 'seattle',
                                     'san francisco', 'tokyo'],
                            'boolean': ['yes', 'no', np.NaN, 'no', 'no', 'yes'],
                            'ordinal_column': ['somewhat like', 'like', 'somewhat like',
                                               'like', 'somewhat like', 'dislike'],
                            'quantitative_column': [1, 11, -.5, 10, np.NaN, 20]})
    
    
                city              boolean   ordinal_column  quantitative_column
            0   tokyo             yes       somewhat like   1.0
            1   NaN               no        like            11.0
            2   london            NaN       somewhat like   -0.5
            3   seattle           no        like            10.0
            4   san francisco     no        somewhat like   NaN
            5   tokyo             yes       dislike         20.0
    
  • 1) Can be used with a list of columns of the same type:

     cci = CustomImputer(cols=['city', 'boolean'])  # object columns are filled with their most frequent value
     cci.fit_transform(X)
    
  • 2) Can be used with strategy='median':

     sd = CustomImputer(['quantitative_column'], strategy = 'median')
     sd.fit_transform(X)
    
  • 3) Can be used on the whole data frame. By default it uses the mean for quantitative features (this can be changed to median); qualitative features are filled with their most frequent value. A downstream-use sketch follows after this list.

     call = CustomImputer()
     call.fit_transform(X)   
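
  • Downstream use (a sketch that is not part of the original answer; the get_dummies step is just one illustrative follow-up):

     # impute the whole frame first, then one-hot encode the categorical columns with pandas
     imputed = CustomImputer(strategy='median').fit_transform(X)
     encoded = pd.get_dummies(imputed, columns=['city', 'boolean', 'ordinal_column'])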
    
我想做一个坏孩纸
#3 · 2019-01-16 01:57

Copying and modifying sveitser's answer, I made an imputer for a pandas.Series object:

import numpy
import pandas 

from sklearn.base import TransformerMixin

class SeriesImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  

        """
    def fit(self, X, y=None):
        if   X.dtype == numpy.dtype('O'): self.fill = X.value_counts().index[0]
        else                            : self.fill = X.mean()
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

To use it you would do:

# Make a series
s1 = pandas.Series(['k', 'i', 't', 't', 'e', numpy.NaN])


a  = SeriesImputer()   # Initialize the imputer
a.fit(s1)              # Fit the imputer
s2 = a.transform(s1)   # Get a new series
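
Since the imputer works on one Series at a time, here is a follow-up sketch (not part of the original answer; the frame below is made up) for running it over every column of a DataFrame:

# fit and transform each column independently with its own SeriesImputer
df = pandas.DataFrame({'run_id': ['run1', numpy.nan, 'run1', 'run2'],
                       'speed':  [3.0, 4.0, numpy.nan, 3.0]})
df_imputed = df.apply(lambda col: SeriesImputer().fit_transform(col))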
小情绪 Triste *
#4 · 2019-01-16 01:59

Similar idea: modify Imputer to handle strategy='most_frequent':

import pandas as pd
from sklearn.preprocessing import Imputer

class GeneralImputer(Imputer):
    def __init__(self, **kwargs):
        Imputer.__init__(self, **kwargs)

    def fit(self, X, y=None):
        if self.strategy == 'most_frequent':
            # compute each column's mode and keep it for transform()
            self.fills = pd.DataFrame(X).mode(axis=0).squeeze()
            self.statistics_ = self.fills.values
            return self
        else:
            # every other strategy is delegated to the parent Imputer
            return Imputer.fit(self, X, y=y)

    def transform(self, X):
        if hasattr(self, 'fills'):
            # fill missing values with the stored modes; note the cast to str
            return pd.DataFrame(X).fillna(self.fills).values.astype(str)
        else:
            return Imputer.transform(self, X)

where pandas.DataFrame.mode() finds the most frequent value for each column and pandas.DataFrame.fillna() fills the missing values with them. Other strategy values are still handled the same way by the parent Imputer.
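
A usage sketch (not from the original answer; the frame and column names are made up):

import numpy as np
import pandas as pd

# one categorical column and one numeric column, both containing NaNs
df = pd.DataFrame({'run_id': ['run1', np.nan, 'run1', 'run2'],
                   'speed':  [3.0, 4.0, np.nan, 3.0]})

imp = GeneralImputer(strategy='most_frequent')
imp.fit(df)
filled = imp.transform(df)  # NaNs replaced by each column's mode; note the values are cast to str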

Viruses.
#5 · 2019-01-16 02:00

There is a package, sklearn-pandas, which has an option for imputing categorical variables: https://github.com/scikit-learn-contrib/sklearn-pandas#categoricalimputer

>>> import numpy as np
>>> from sklearn_pandas import CategoricalImputer
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
>>> imputer = CategoricalImputer()
>>> imputer.fit_transform(data)
array(['a', 'b', 'b', 'b'], dtype=object)
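
To impute a single categorical column of a DataFrame, a sketch (the column name is made up; not from the original answer):

>>> import pandas as pd
>>> df = pd.DataFrame({'run_id': ['run1', np.nan, 'run1', 'run2']})
>>> df['run_id'] = CategoricalImputer().fit_transform(df['run_id'].values)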
干净又极端
#6 · 2019-01-16 02:05

Inspired by the answers here, and for want of a go-to imputer for all use cases, I ended up writing this. It supports four imputation strategies (mean, mode, median, fill) and works on both pd.DataFrame and pd.Series.

mean and median work only for numeric data; mode and fill work for both numeric and categorical data.

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='mean', filler='NA'):
        self.strategy = strategy
        self.fill = filler

    def fit(self, X, y=None):
        if self.strategy in ['mean', 'median']:
            if not all(X.dtypes == np.number):
                raise ValueError('dtypes mismatch: np.number dtype is '
                                 'required for ' + self.strategy)
        if self.strategy == 'mean':
            self.fill = X.mean()
        elif self.strategy == 'median':
            self.fill = X.median()
        elif self.strategy == 'mode':
            self.fill = X.mode().iloc[0]
        elif self.strategy == 'fill':
            # map a list of fill values onto the DataFrame's columns
            if type(self.fill) is list and type(X) is pd.DataFrame:
                self.fill = dict([(cname, v) for cname, v in zip(X.columns, self.fill)])
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

Usage:

>> df   
    MasVnrArea  FireplaceQu
Id  
1   196.0   NaN
974 196.0   NaN
21  380.0   Gd
5   350.0   TA
651 NaN     Gd


>> CustomImputer(strategy='mode').fit_transform(df)
MasVnrArea  FireplaceQu
Id      
1   196.0   Gd
974 196.0   Gd
21  380.0   Gd
5   350.0   TA
651 196.0   Gd

>> CustomImputer(strategy='fill', filler=[0, 'NA']).fit_transform(df)
MasVnrArea  FireplaceQu
Id      
1   196.0   NA
974 196.0   NA
21  380.0   Gd
5   350.0   TA
651 0.0     Gd 
疯言疯语
#7 · 2019-01-16 02:07

This code fills in a series with the most frequent category:

import pandas as pd
import numpy as np

# create fake data 
m = pd.Series(list('abca'))
m.iloc[1] = np.nan #artificially introduce nan

print('m = ')
print(m)

#make dummy variables, count and sort descending:
most_common = pd.get_dummies(m).sum().sort_values(ascending=False).index[0] 

def replace_most_common(x):
    if pd.isnull(x):
        return most_common
    else:
        return x

new_m = m.map(replace_most_common) #apply function to original data

print('new_m = ')
print(new_m)

Outputs:

m =
0      a
1    NaN
2      c
3      a
dtype: object

new_m =
0    a
1    a
2    c
3    a
dtype: object
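
As a follow-up (not part of the original answer), the same most-frequent fill can be written with pandas alone:

# fill NaNs with the Series' most frequent value directly
new_m = m.fillna(m.mode()[0])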