I am learning the book "Hands On Machine Learning" and writing some code of transformation pipelines to clean up my data and find the output of the same pipeline method varies according to the size of the dataframe I chose to input. Here is the code:
from sklearn.base import BaseEstimator,TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attribute_names):
self.attribute_names =attribute_names
def fit(self,X,y=None):
return self
def transform(self,X):
return X[self.attribute_names].values
from sklearn.pipeline import FeatureUnion
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
def __init__(self, sparse_output=False):
self.sparse_output = sparse_output
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
enc = LabelBinarizer(sparse_output=self.sparse_output)
return enc.fit_transform(X)
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scalar', StandardScaler())
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', CustomLabelBinarizer())
])
full_pipeline = FeatureUnion(transformer_list=[
('num_pipeline', num_pipeline),
('cat_pipeline', cat_pipeline)
])
housing_prepared = full_pipeline.fit_transform(housing)
data_prepared = full_pipeline.transform(housing.iloc[:5])
data_prepared1 = full_pipeline.transform(housing.iloc[:1000])
data_prepared2 = full_pipeline.transform(housing.iloc[:10000])
print(data_prepared.shape)
print(data_prepared1.shape)
print(data_prepared2.shape)
The output of these three print will be (5, 14) (1000, 15) (10000, 16) Could anyone help me explain this?