I want to run several regression types (Lasso, Ridge, ElasticNet and SVR) on a dataset with around 5,000 rows and 6 features. Linear regression. Use GridSearchCV for cross validation. The code is extensive but here are some critical parts:
def splitTrainTestAdv(df):
y = df.iloc[:,-5:] # last 5 columns
X = df.iloc[:,:-5] # Except for last 5 columns
#Scaling and Sampling
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
return X_train, X_test, y_train, y_test
def performSVR(x_train, y_train, X_test, parameter):
C = parameter[0]
epsilon = parameter[1]
kernel = parameter[2]
model = svm.SVR(C = C, epsilon = epsilon, kernel = kernel)
model.fit(x_train, y_train)
return model.predict(X_test) #prediction for the test
def performRidge(X_train, y_train, X_test, parameter):
alpha = parameter[0]
model = linear_model.Ridge(alpha=alpha, normalize=True)
model.fit(X_train, y_train)
return model.predict(X_test) #prediction for the test
MODELS = {
'lasso': (
linear_model.Lasso(),
{'alpha': [0.95]}
),
'ridge': (
linear_model.Ridge(),
{'alpha': [0.01]}
),
)
}
def performParameterSelection(model_name, feature, X_test, y_test, X_train, y_train):
print("# Tuning hyper-parameters for %s" % feature)
print()
model, param_grid = MODELS[model_name]
gs = GridSearchCV(model, param_grid, n_jobs= 1, cv=5, verbose=1, scoring='%s_weighted' % feature)
gs.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(gs.best_params_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in gs.grid_scores_:
print("%0.3f (+/-%0.03f) for %r"
% (mean_score, scores.std() * 2, params))
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
soil = pd.read_csv('C:/training.csv', index_col=0)
soil = getDummiedSoilDepth(soil)
np.random.seed(2015)
soil = shuffleData(soil)
soil = soil.drop('Depth', 1)
X_train, X_test, y_train, y_test = splitTrainTestAdv(soil)
scores = ['precision', 'recall']
for score in scores:
for model in MODELS.keys():
print '####################'
print model, score
print '####################'
performParameterSelection(model, score, X_test, y_test, X_train, y_train)
You can assume that all required imports are done
I am getting this error and do not know why:
ValueError Traceback (most recent call last)
in () 18 print model, score 19 print '####################' ---> 20 performParameterSelection(model, score, X_test, y_test, X_train, y_train) 21
<ipython-input-27-304555776e21> in performParameterSelection(model_name, feature, X_test, y_test, X_train, y_train)
12 # cv=5 - constant; verbose - keep writing
13
---> 14 gs.fit(X_train, y_train) # Will get grid scores with outputs from ALL models described above
15
16 #pprint(sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score))
C:\Users\Tony\Anaconda\lib\site-packages\sklearn\grid_search.pyc in fit(self, X, y)
C:\Users\Tony\Anaconda\lib\site-packages\sklearn\metrics\classification.pyc in _check_targets(y_true, y_pred)
90 if (y_type not in ["binary", "multiclass", "multilabel-indicator",
91 "multilabel-sequences"]):
---> 92 raise ValueError("{0} is not supported".format(y_type))
93
94 if y_type in ["binary", "multiclass"]:
ValueError: continuous-multioutput is not supported
I am still very new to Python and this error puzzles me. This should not because I have 6 features, of course. I tried to follow standard buil-in functions.
Please, help