In our lab, we have NVIDIA Tesla K80 GPU accelerator computing with the following characteristics: Intel(R) Xeon(R) CPU E5-2670 v3 @2.30GHz, 48 CPU processors, 128GB RAM, 12 CPU cores
running under Linux 64-bit.
I am running the following code which does GridSearchCV
after vertically appends different sets of dataframes into a single series of a RandomForestRegressor
model. The two sample datasets I am considering are found in this link
import sys
import imp
import glob
import os
import pandas as pd
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.cross_validation import train_test_split
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv"))), ignore_index=True)
#df = pd.read_csv('cubic31.csv')
for i in range(1,3):
df['X_t'+str(i)] = df['X'].shift(i)
print(df)
df.dropna(inplace=True)
X = (pd.DataFrame({ 'X_%d'%i : df['X'].shift(i) for i in range(3)}).apply(np.nan_to_num, axis=0).values)
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
X_train = X_train.drop('time', axis=1)
X_test = X_test.drop('time', axis=1)
#Fit models with some grid search CV=5 (not to low), use the best model
parameters = {'n_estimators': [10,30,100,500,1000]}
clf_rf = RandomForestRegressor(random_state=1)
clf = GridSearchCV(clf_rf, parameters, cv=5, scoring='neg_mean_squared_error')
model = clf.fit(X_train, y_train)
model.cv_results_['params'][model.best_index_]
math.sqrt(model.best_score_*-1)
model.grid_scores_
#####
print()
print(model.grid_scores_)
print(math.sqrt(model.best_score_*-1))
#reg = RandomForestRegressor(criterion='mse')
clf_rf.fit(X_train,y_train)
modelPrediction = clf_rf.predict(X_test)
print(modelPrediction)
print("Number of predictions:",len(modelPrediction))
meanSquaredError=mean_squared_error(y_test, modelPrediction)
print("Mean Square Error (MSE):", meanSquaredError)
rootMeanSquaredError = sqrt(meanSquaredError)
print("Root-Mean-Square Error (RMSE):", rootMeanSquaredError)
####### to add the trendline
fig, ax = plt.subplots()
#df.plot(x='time', y='Y', ax=ax)
ax.plot(df['time'].values, df['Y'].values)
fig, ax = plt.subplots()
index_values=range(0,len(y_test))
y_test.sort_index(inplace=True)
X_test.sort_index(inplace=True)
modelPred_test = clf_rf.predict(X_test)
ax.plot(pd.Series(index_values), y_test.values)
PlotInOne=pd.DataFrame(pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1))
plt.figure(); PlotInOne.plot(); plt.legend(loc='best')
When I run this program for a huge dataset (around 2 million rows), it is taking more than 3 days to do the GridSearchCV
. I was, therefore, wondering if Python
threads can utilize more than one CPU. How can we make this (or other Python
programs) utilize more than one CPU so that it does the task faster in a short period of time? Thank you for any tips!