I am trying to fit an SSVM as shown on the examples page: https://github.com/pystruct/pystruct/blob/master/examples/multi_class_svm.py
The code runs fine with the given example, but not with my own data.
The input dataset is a csv file with the following columns and rows for example:
user_name facility start_date day_of_week monthweek month
TestUserA FacilityA 2/1/2015 1 1 2
... ... ... ... ... ...
out of the above columns, the only predictors I use are 'facility' and day_of_week'
and I generate the labels by concatenating the 3-digit dayOfWeek string with the facility:
for example MONFacilityA
The structure of my data is as follows:
('Predictors shape: ', (518, 2))
('Labels shape: ', (518,))
('X_train type: ', <type 'numpy.ndarray'>)
('X_train shape: ', (440, 2))
('X_test shape: ', (78, 2))
('y_train type: ', <type 'numpy.ndarray'>)
('y_train shape: ', (440,))
('y_test shape: ', (78,))
('X_train example 1st row element type: ', <type 'numpy.int64'>)
('y_train example 1st row element type: ', <type 'numpy.int64'>)
('Unique labels: len(np.unique(y_train)) ', 20)
.. and my labels start from 0...to...n as advied on this post: IndexError when fitting SSVM model in PyStruct
But I get this error:
Traceback (most recent call last):
File "userSchedulePredictor_MultiClassSVM.py", line 151, in <module>
one_slack_svm.fit(X_train_bias, y_train)
File "/usr/local/lib/python2.7/dist-packages/pystruct/learners/one_slack_ssvm.
py", line 455, in fit
X, Y, joint_feature_gt, constraints)
File "/usr/local/lib/python2.7/dist-packages/pystruct/learners/one_slack_ssvm.
py", line 355, in _find_new_constraint
X, Y, self.w, relaxed=True)
File "/usr/local/lib/python2.7/dist-packages/pystruct/models/unstructured_svm.
py", line 323, in batch_loss_augmented_inference
scores[other_classes] += np.repeat(self.class_weight[Y],
IndexError: index 20 is out of bounds for size 20
*** Error in `python': double free or corruption (!prev): 0x000000000228af90 ***
Aborted (core dumped)
This is my code:
# Imports
import os
import numpy as np
import pandas as pd
# Fetch paths
workingDir = os.getcwd()
print("Current working directory: ", workingDir)
# Get the raw csv data
rawData = pd.read_csv(workingDir + "/data/data2.csv")
# print('Dataset columns names: ', list(rawData.columns.values))
# print('Unique Days_of_week from dataset: ', np.unique(rawData['day_of_week']))
# print('Unique facility names from dataset: ', np.unique(rawData['facility']))
# Keep only the columns that are needed for prediction
data = pd.DataFrame({'facility' : rawData['facility'], 'day_of_week': rawData['day_of_week'], 'currentState': '', 'nextState': ''})
weekDays = {
1 : 'SUN',
2 : 'MON',
3 : 'TUE',
4 : 'WED',
5 : 'THU',
6 : 'FRI',
7 : 'SAT'
}
# Fill currentState column
for i, row in data.iterrows():
facility = row['facility']
weekDayStr = weekDays[row['day_of_week']]
nextStateStr = weekDayStr + row['facility']
data.ix[i, 'currentState'] = nextStateStr
#data.ix[i, 'day_of_week'] = weekDayStr
# Fill next state column
for i, row in data.iterrows():
if i - 1 and (i-1) > 0:
data.ix[i - 1, 'nextState'] = row['currentState']
# Remove unwanted columns
del data['currentState']
#Keep only the rows which have nextState values
data = data[data.nextState != '']
predictors = pd.DataFrame({'facility' : data['facility'], 'day_of_week': data['day_of_week']})
#labels = pd.DataFrame({'nextState' : data['nextState']})
#labels = np.array(pd.Series(data['nextState']))
tempLabels = pd.DataFrame({'nextState' : data['nextState']})
labels = pd.Series(data['nextState'])
#Convert labels to INT values
uniqueLabels = labels.unique()
strLabelToInt = {}
for i in range(len(uniqueLabels)):
strLabelToInt[uniqueLabels[i]] = i
print('Label lookup dictionary: ', strLabelToInt)
#Convert labels to INT values
# uniqueLabels = pd.unique(labels['nextState'])
# strLabelToInt = {}
# for i in range(len(uniqueLabels)):
# strLabelToInt[uniqueLabels[i]] = i
#print('Label lookup dictionary: ', strLabelToInt)
# for i, row in labels.iterrows():
# labels.ix[i, 'nextState'] = int(strLabelToInt[row['nextState']])
for i, row in tempLabels.iterrows():
tempLabels.ix[i, 'nextState'] = int(strLabelToInt[row['nextState']])
labels.update(tempLabels['nextState'])
labels = np.array(labels, dtype='int64')
# Convert labels to numpy array
# labels = labels.values
#Perform Imputation on data
predictors = predictors.dropna()
# Perform One-hot encoding on Categorical variables in the dataset
#predictors = pd.get_dummies(predictors)
#Convert predictors to INT values
uniqueFacilityNames = pd.unique(predictors['facility'])
strFacilityToInt = {}
for i in range(len(uniqueFacilityNames)):
strFacilityToInt[uniqueFacilityNames[i]] = i + 1
print('Facility lookup dictionary: ', strFacilityToInt)
tempFacilityLabels = pd.DataFrame({'facility' : predictors['facility']})
for i, row in tempFacilityLabels.iterrows():
tempFacilityLabels.ix[i, 'facility'] = int(strFacilityToInt[row['facility']])
predictors['facility'].update(tempFacilityLabels['facility'])
predictors = np.array(predictors, dtype='int64')
#Create train/test split
from sklearn.cross_validation import train_test_split
print('Predictors shape: ',predictors.shape)
print('Labels shape: ', labels.shape)
X_train, X_test, y_train, y_test = train_test_split(predictors, labels, test_size=0.15, random_state=0)
# convert train/test data to numpy arrays (Not sure if PyStruct supports pandas df's)
#X_train = X_train.values
#X_test = X_test.values
print('X_train type: ', type(X_train))
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)
print('y_train type: ', type(y_train))
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)
print('X_train example 1st row element type: ', type(X_train[0][0]))
print('y_train example 1st row element type: ', type(y_train[0]))
X_train_bias = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test_bias = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
from time import time
from pystruct.models import MultiClassClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM,SubgradientSSVM, FrankWolfeSSVM)
target_class_count = len(np.unique(y_train))
print('Unique labels: ', len(uniqueLabels))
model = MultiClassClf(n_features=X_train_bias.shape[1], n_classes=target_class_count)
n_slack_svm = NSlackSSVM(model, verbose=2, check_constraints=False, C=0.1,
batch_size=100, tol=1e-2)
one_slack_svm = OneSlackSSVM(model, verbose=2, C=.10, tol=.001)
subgradient_svm = SubgradientSSVM(model, C=0.1, learning_rate=0.000001,
max_iter=1000, verbose=0)
fw_bc_svm = FrankWolfeSSVM(model, C=.1, max_iter=50)
fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)
# n-slack cutting plane ssvm
#start = time()
#n_slack_svm.fit(X_train_bias, y_train)
#time_n_slack_svm = time() - start
#y_pred = np.hstack(n_slack_svm.predict(X_test_bias))
#print("Score with pystruct n-slack ssvm: %f (took %f seconds)"
# % (np.mean(y_pred == y_test), time_n_slack_svm))
## 1-slack cutting plane ssvm
start = time()
one_slack_svm.fit(X_train_bias, y_train)
time_one_slack_svm = time() - start
y_pred = np.hstack(one_slack_svm.predict(X_test_bias))
print("Score with pystruct 1-slack ssvm: %f (took %f seconds)"
% (np.mean(y_pred == y_test), time_one_slack_svm))
#online subgradient ssvm
start = time()
subgradient_svm.fit(X_train_bias, y_train)
time_subgradient_svm = time() - start
y_pred = np.hstack(subgradient_svm.predict(X_test_bias))
print("Score with pystruct subgradient ssvm: %f (took %f seconds)"
% (np.mean(y_pred == y_test), time_subgradient_svm))
# the standard one-vs-rest multi-class would probably be as good and faster
# but solving a different model
libsvm = LinearSVC(multi_class='crammer_singer', C=.1)
start = time()
libsvm.fit(X_train, y_train)
time_libsvm = time() - start
print("Score with sklearn and libsvm: %f (took %f seconds)"
% (libsvm.score(X_test, y_test), time_libsvm))
start = time()
fw_bc_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
time_fw_bc_svm = time() - start
print("Score with pystruct frankwolfe block coordinate ssvm: %f (took %f seconds)" %
(np.mean(y_pred == y_test), time_fw_bc_svm))
start = time()
fw_batch_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_batch_svm.predict(X_test_bias))
time_fw_batch_svm = time() - start
print("Score with pystruct frankwolfe batch ssvm: %f (took %f seconds)" %
(np.mean(y_pred == y_test), time_fw_batch_svm))
I have tried using 1-hot representation of my data (int numpy arrays), but in vain... Does this have something to do with joint_feature(x, y) described on the wiki? https://pystruct.github.io/user_guide.html
If yes, can someone please please shed light on what am i missing here? I am not sure that i understand if the X, Y have to have any specific shape, etc. for this to work well...
Please advise,
Thanks!