I have a dataset that contains 510 samples for training and 127 samples for testing; each sample has 7680 features. I want to design a model to predict the height (cm) label from the training data. Currently I use SVM, but it gives very bad results. Could you look at my code and give me some comments? You can try it on your machine with the dataset and the runnable code below:
import numpy as np
from sklearn.svm import SVR

# Training data
train_X = np.loadtxt('trainX.txt')  # 510 x 7680
train_Y = np.loadtxt('trainY.txt')  # 510 x 1
# Test data
test_X = np.loadtxt('testX.txt')    # 127 x 7680
test_Y = np.loadtxt('testY.txt')    # 127 x 1

# RBF-kernel SVR fitted on the raw (unscaled) features
my_svr = SVR(C=1000, epsilon=0.2)
my_svr.fit(train_X, train_Y)
p_regression = my_svr.predict(test_X)

print(p_regression)
print(test_Y)
Some results:
p_regression
[15.67367165 16.35094166 13.10510262 14.03943211 12.7116549 11.45071423
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821 12.42556347 14.54712287 12.25965911 9.04101931 14.03604126
12.41237627 13.51951317 10.36302674 9.86389635 11.41448842 15.67146184
14.74764672 11.22794536 12.04429175 12.48199183 14.29790809 16.21724184
10.94478135 9.68210872 14.8663311 8.62974573 15.17281425 12.97230127
9.46515876 16.24388177 10.35742683 15.65336366 11.04652502 16.35094166
14.03943211 10.29066405 13.27225207 9.44959181 10.45775627 13.23953143
14.95568324 11.35994414 10.69531821 12.42556347 14.54712287 12.25965911
9.04101931 14.03604126 12.41237627 13.51951317 10.36302674 9.86389635
11.41448842 15.67146184 14.74764672 11.22794536 12.04429175 12.48199183
14.29790809 16.21724184 10.94478135 9.68210872 14.8663311 8.62974573
15.17281425 12.97230127 9.46515876 16.24388177 10.35742683 15.65336366
11.04652502 16.35094166 14.03943211 10.29066405 13.27225207 9.44959181
10.45775627 13.23953143 14.95568324 11.35994414 10.69531821 12.42556347
14.54712287 12.25965911 9.04101931 14.03604126 12.41237627 13.51951317
10.36302674 9.86389635 11.41448842 15.67146184 14.74764672 11.22794536
12.04429175 12.48199183 14.29790809 16.21724184 10.94478135 9.68210872
14.8663311 8.62974573 15.17281425 12.97230127 9.46515876 16.24388177
10.35742683 15.65336366 11.04652502 16.35094166 14.03943211 10.29066405
13.27225207 9.44959181 10.45775627 13.23953143 14.95568324 11.35994414
10.69531821]
test_Y
[13. 14. 13. 15. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6. 6. 8.
9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12. 15. 15.
16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3. 4. 6.
6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14. 13. 12.
15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6. 4. 3.
4. 6. 6. 8. 9. 18. 3. 6. 4. 6. 7. 8. 11. 11. 13. 12. 12. 14.
13. 12. 15. 15. 16. 15. 17. 18. 17. 14. 15. 17. 13. 17. 16. 12. 17. 6.
4.]
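To quantify how far off these are, the two arrays can be scored directly (a small sketch reusing p_regression and test_Y from the snippet above):

from sklearn.metrics import mean_squared_error, r2_score

# mean squared error of the predictions above
print(mean_squared_error(test_Y, p_regression))
# an R^2 near or below zero means the model does no better than
# always predicting the mean of test_Y
print(r2_score(test_Y, p_regression))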
Here is a similar approach. We will split the data into train and test sets. The train set will be used for tuning hyperparameters and for fitting different models; then we will choose the best model (in terms of MSE) and predict values for the test set. All trained (fitted) models will be saved as pickle files, so they can be loaded later using the joblib.load() method.
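For example, a saved model can be restored in a later session roughly like this (a minimal sketch, assuming the SVR_rbf.pkl file written by the script below and a feature matrix with the same 7680 columns):

import joblib

# restore the fitted GridSearchCV object (pipeline + best hyperparameters)
model = joblib.load('SVR_rbf.pkl')
print(model.predict(X_test))  # X_test: any matrix with 7680 features per row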
Output:
----------------------------- [SVR_rbf] ------------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
---------------------------- [SVR_linear] ----------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits
------------------------------ [Ridge] -------------------------------
Fitting 3 folds for each of 7 candidates, totalling 21 fits
------------------------------ [Lasso] -------------------------------
Fitting 3 folds for each of 6 candidates, totalling 18 fits
--------------------------- [RandomForest] ---------------------------
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------------------------- [SVR_rbf] ------------------------------
Score: 44.88%
Parameters: {'SVR_rbf__C': 10, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 33.40%
Parameters: {'SVR_linear__C': 0.01, 'SVR_linear__max_iter': 1000}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 34.83%
Parameters: {'Ridge__alpha': 500, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 22.90%
Parameters: {'Lasso__alpha': 0.1, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 36.87%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 250}
**********************************************************************
Mean Squared Error: {'SVR_rbf': 5.375, 'SVR_linear': 7.036, 'Ridge': 7.02, 'Lasso': 8.108, 'RandomForest': 9.475}
Code:
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # 'sklearn.externals.joblib' is deprecated in newer scikit-learn
def get_data(path='.'):
p = Path(path)
kwargs = dict(delim_whitespace=True, header=None)
X_train = pd.read_csv(list(p.glob('trainX.txt*'))[0], **kwargs)
y_train = pd.read_csv(list(p.glob('trainY.txt*'))[0], **kwargs)
X_test = pd.read_csv(list(p.glob('testX.txt*'))[0], **kwargs)
y_test = pd.read_csv(list(p.glob('testY.txt*'))[0], **kwargs)
return (pd.concat([X_train, X_test], ignore_index=True),
pd.concat([y_train, y_test], ignore_index=True)[0])
def get_data_split(path='.', test_size=0.25):
X, y = get_data(path)
return train_test_split(X, y, test_size=test_size)
def tune_models_hyperparams(X, y, models, **common_grid_kwargs):
grids = {}
for model in models:
print('{:-^70}'.format(' [' + model['name'] + '] '))
pipe = Pipeline([
("scale", StandardScaler()),
(model['name'], model['model']) ])
grids[model['name']] = (GridSearchCV(pipe,
param_grid=model['param_grid'],
**common_grid_kwargs)
.fit(X, y))
# saving single trained model ...
joblib.dump(grids[model['name']], './{}.pkl'.format(model['name']))
return grids
def get_best_model(grid, X_test, y_test,
                   metric_func=mean_squared_error):
    res = {name: round(metric_func(y_test, model.predict(X_test)), 3)
           for name, model in grid.items()}
    print('Mean Squared Error:', res)
    # pick the model with the smallest metric value (lower MSE is better)
    best_model_name = min(res, key=res.get)
    return grid[best_model_name]
def test_dataset(grid, X_test, y_test):
res = {}
for name, model in grid.items():
y_pred = model.predict(X_test)
res[name] = {'MSE': mean_squared_error(y_test, y_pred),
'R2': r2_score(y_test, y_pred)
}
return res
def predict(grid, X_test, model_name):
return grid[model_name].predict(X_test)
def print_grid_results(grids):
for name, model in grids.items():
print('{:-^70}'.format(' [' + name + '] '))
print('Score:\t\t{:.2%}'.format(model.best_score_))
print('Parameters:\t{}'.format(model.best_params_))
print('*' * 70)
models = [
{ 'name': 'SVR_rbf',
'model': SVR(),
'title': "SVR_rbf",
'param_grid': {
'SVR_rbf__C': [0.1, 1, 5, 10],
'SVR_rbf__max_iter': [500]
}
},
    { 'name': 'SVR_linear',
      'model': SVR(kernel='linear'),
      'title': "SVR_linear",
      'param_grid': {
          'SVR_linear__C': [0.01, 0.1, 1, 5],
          'SVR_linear__max_iter': [1000]
      }
    },
{ 'name': 'Ridge',
'model': Ridge(),
'title': "Ridge",
'param_grid': {
'Ridge__alpha': [0.1, 0.5, 5, 10, 50, 100, 500],
'Ridge__max_iter': [200]
}
},
{ 'name': 'Lasso',
'model': Lasso(),
'title': "Lasso",
'param_grid': {
'Lasso__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
'Lasso__max_iter': [1000]
}
},
{ 'name': 'RandomForest',
'model': RandomForestRegressor(),
'title': "RandomForest",
'param_grid': {
'RandomForest__n_estimators': [50, 250, 500],
'RandomForest__max_depth': [5],
}
},
]
def main(path):
    os.chdir(str(path))
    # the concatenated data has 510 + 127 = 637 rows, so use 127/637
    # to hold out a test set of the original 127 samples
    X_train, X_test, y_train, y_test = \
        get_data_split(path, test_size=127/637.)
grid = tune_models_hyperparams(X_train, y_train, models, cv=3,
verbose=2, n_jobs=-1)
print_grid_results(grid)
model = get_best_model(grid, X_test, y_test)
df = pd.DataFrame({'predicted': model.predict(X_test)})
df.to_csv('predicted.csv', index=False)
if __name__ == "__main__":
p = Path(__file__).parent.resolve()
main(p)
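The helpers test_dataset() and predict() are not called in main(), but they can be used afterwards, e.g. (a sketch, assuming grid, X_test and y_test from main() are still in scope):

# MSE and R2 for every tuned model on the held-out split,
# e.g. {'SVR_rbf': {'MSE': ..., 'R2': ...}, ...}
print(test_dataset(grid, X_test, y_test))

# predictions of one particular tuned model, chosen by name
print(predict(grid, X_test, 'SVR_rbf')[:5])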
I agree with @George: "there is something 'wrong' with the test set". I got a similar MSE of approximately 21. I also tried putting the train and test datasets together and feeding them to GridSearchCV. Here are the results of those attempts:
In [33]: print_grid_results(grid)
----------------------------- [SVR_rbf] ------------------------------
Score: 48.98%
Parameters: {'SVR_rbf__C': 5, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 64.07%
Parameters: {'SVR_linear__C': 0.1, 'SVR_linear__max_iter': 500}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 63.98%
Parameters: {'Ridge__alpha': 100, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 60.36%
Parameters: {'Lasso__alpha': 0.001, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 44.01%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 100}
**********************************************************************
Also, different splits give very different test scores:
In [43]: clf = grid['SVR_linear']
In [44]: {k:v for k,v in clf.cv_results_.items() if k.endswith('_test_score')}
Out[44]:
{'mean_test_score': array([0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 ]),
'rank_test_score': array([1, 4, 7, 1, 4, 7, 1, 4, 7]),
'split0_test_score': array([0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802]),
'split1_test_score': array([0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475]),
'split2_test_score': array([0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214]),
'std_test_score': array([0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954])}
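This split sensitivity can be reproduced directly by cross-validating a single pipeline and printing the per-fold R^2 scores (a sketch; X and y are the concatenated data from get_data() in the full code below):

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

pipe = make_pipeline(StandardScaler(), SVR(kernel='linear', C=0.1, max_iter=1000))
# cv=3 uses the same unshuffled 3-fold split as GridSearchCV above;
# expect three wildly different R^2 values, as in cv_results_
print(cross_val_score(pipe, X, y, cv=3))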
Here is the full code:
import os
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # 'sklearn.externals.joblib' is deprecated in newer scikit-learn
def get_data_split(path='.'):
p = Path(path)
kwargs = dict(delim_whitespace=True, header=None)
X_train = pd.read_csv(list(p.glob('trainX.txt*'))[0], **kwargs)
y_train = pd.read_csv(list(p.glob('trainY.txt*'))[0], **kwargs)
X_test = pd.read_csv(list(p.glob('testX.txt*'))[0], **kwargs)
y_test = pd.read_csv(list(p.glob('testY.txt*'))[0], **kwargs)
return X_train, y_train[0], X_test, y_test[0]
def get_data(path='.'):
p = Path(path)
kwargs = dict(delim_whitespace=True, header=None)
X_train = pd.read_csv(list(p.glob('trainX.txt*'))[0], **kwargs)
y_train = pd.read_csv(list(p.glob('trainY.txt*'))[0], **kwargs)
X_test = pd.read_csv(list(p.glob('testX.txt*'))[0], **kwargs)
y_test = pd.read_csv(list(p.glob('testY.txt*'))[0], **kwargs)
return (pd.concat([X_train, X_test], ignore_index=True),
pd.concat([y_train, y_test], ignore_index=True)[0])
def fit_all_classifiers_grid(X, y, classifiers, **common_grid_kwargs):
grids = {}
for clf in classifiers:
print('{:-^70}'.format(' [' + clf['name'] + '] '))
pipe = Pipeline([
("scale", StandardScaler()),
(clf['name'], clf['clf']) ])
grids[clf['name']] = (GridSearchCV(pipe,
param_grid=clf['param_grid'],
**common_grid_kwargs)
.fit(X, y))
# saving single trained model ...
joblib.dump(grids[clf['name']], './{}.pkl'.format(clf['name']))
return grids
def test_dataset(grid, X_test, y_test):
res = {}
for name, clf in grid.items():
y_pred = clf.predict(X_test)
res[name] = {'MSE': mean_squared_error(y_test, y_pred),
'R2': r2_score(y_test, y_pred)
}
return res
def print_grid_results(grids):
for name, clf in grids.items():
print('{:-^70}'.format(' [' + name + '] '))
print('Score:\t\t{:.2%}'.format(clf.best_score_))
print('Parameters:\t{}'.format(clf.best_params_))
print('*' * 70)
classifiers = [
{ 'name': 'SVR_rbf',
'clf': SVR(),
'title': "SVR_rbf",
'param_grid': {
'SVR_rbf__C': [0.1, 1, 5],
'SVR_rbf__max_iter': [500, 1000, 5000]
}
},
    { 'name': 'SVR_linear',
      'clf': SVR(kernel='linear'),
      'title': "SVR_linear",
      'param_grid': {
          'SVR_linear__C': [0.1, 1, 5],
          'SVR_linear__max_iter': [500, 1000, 5000]
      }
    },
{ 'name': 'Ridge',
'clf': Ridge(),
'title': "Ridge",
'param_grid': {
'Ridge__alpha': [0.1, 1, 5, 10, 50, 100],
'Ridge__max_iter': [200, 500]
}
},
{ 'name': 'Lasso',
'clf': Lasso(),
'title': "Lasso",
'param_grid': {
'Lasso__alpha': [0.001, 0.01, 0.1, 1, 5, 10],
'Lasso__max_iter': [1000, 5000]
}
},
{ 'name': 'RandomForest',
'clf': RandomForestRegressor(),
'title': "RandomForest",
'param_grid': {
'RandomForest__n_estimators': [10, 100],
'RandomForest__max_depth': [3, 5],
}
},
]
def main(path):
os.chdir(path)
X, y = get_data(path)
grid = fit_all_classifiers_grid(X, y, classifiers, cv=3, verbose=2, n_jobs=-1)
print_grid_results(grid)
#X_train, y_train, X_test, y_test = get_data_split(path)
#grid = fit_all_classifiers_grid(X_train, y_train, classifiers, cv=2, verbose=2, n_jobs=-1)
#res = test_dataset(grid, X_test, y_test)
#print(res)
PS: Sorry for using the name classifier instead of regressor - I just reused my old code, where I was searching for the best classifier...