Index out of bounds: Fitting SSVM using Pystruct

Posted 2019-07-24 02:01

Question:

I am trying to fit an SSVM as shown on the examples page: https://github.com/pystruct/pystruct/blob/master/examples/multi_class_svm.py

The code runs fine with the given example, but not with my own data.

The input dataset is a csv file with the following columns and rows for example:

user_name   facility    start_date  day_of_week  monthweek  month
TestUserA   FacilityA   2/1/2015    1            1          2
...         ...         ...         ...          ...        ...

Out of the above columns, the only predictors I use are 'facility' and 'day_of_week',

and I generate the labels by concatenating the 3-letter day-of-week string with the facility name:

for example MONFacilityA
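
In code, a label is just the day string concatenated with the facility name (a minimal sketch using the weekDays lookup defined in my script below):

weekDays = {1: 'SUN', 2: 'MON', 3: 'TUE', 4: 'WED', 5: 'THU', 6: 'FRI', 7: 'SAT'}
label = weekDays[2] + 'FacilityA'
print(label)  # MONFacilityA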

The structure of my data is as follows:

('Predictors shape: ', (518, 2))
('Labels shape: ', (518,))
('X_train type: ', <type 'numpy.ndarray'>)
('X_train shape: ', (440, 2))
('X_test shape: ', (78, 2))
('y_train type: ', <type 'numpy.ndarray'>)
('y_train shape: ', (440,))
('y_test shape: ', (78,))
('X_train example 1st row element type: ', <type 'numpy.int64'>)
('y_train example 1st row element type: ', <type 'numpy.int64'>)
('Unique labels: len(np.unique(y_train)) ', 20)

.. and my labels start from 0 up to n, as advised in this post: IndexError when fitting SSVM model in PyStruct
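
As a sanity check (my own snippet, assuming labels is the int64 array built in the code below), I would expect the class ids to form the contiguous range 0..n-1:

import numpy as np

# class ids should be exactly 0, 1, ..., n-1 with no gaps
assert labels.min() == 0
assert labels.max() == len(np.unique(labels)) - 1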

But I get this error:

Traceback (most recent call last):
  File "userSchedulePredictor_MultiClassSVM.py", line 151, in <module>
    one_slack_svm.fit(X_train_bias, y_train)
  File "/usr/local/lib/python2.7/dist-packages/pystruct/learners/one_slack_ssvm.py", line 455, in fit
    X, Y, joint_feature_gt, constraints)
  File "/usr/local/lib/python2.7/dist-packages/pystruct/learners/one_slack_ssvm.py", line 355, in _find_new_constraint
    X, Y, self.w, relaxed=True)
  File "/usr/local/lib/python2.7/dist-packages/pystruct/models/unstructured_svm.py", line 323, in batch_loss_augmented_inference
    scores[other_classes] += np.repeat(self.class_weight[Y],
IndexError: index 20 is out of bounds for size 20
*** Error in `python': double free or corruption (!prev): 0x000000000228af90 ***

Aborted (core dumped)
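
The failing line indexes self.class_weight, which (as far as I can tell) has one entry per class, with my label values. So my guess is that some label in y_train is >= the n_classes the model was built with. A quick check along these lines (using the variable names from my code below) should show whether that is the case:

print('n_classes given to the model: ', len(np.unique(y_train)))
print('max label in y_train:         ', y_train.max())
print('max label overall:            ', labels.max())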

This is my code:

# Imports
import os
import numpy as np
import pandas as pd

# Fetch paths
workingDir = os.getcwd()
print("Current working directory: ", workingDir)


# Get the raw csv data
rawData = pd.read_csv(workingDir + "/data/data2.csv")
# print('Dataset columns names: ', list(rawData.columns.values))
# print('Unique Days_of_week from dataset: ', np.unique(rawData['day_of_week']))
# print('Unique facility names from dataset: ', np.unique(rawData['facility']))

# Keep only the columns that are needed for prediction
data = pd.DataFrame({'facility' : rawData['facility'], 'day_of_week': rawData['day_of_week'], 'currentState': '', 'nextState': ''})

weekDays = {
    1 : 'SUN',
    2 : 'MON',
    3 : 'TUE',
    4 : 'WED',
    5 : 'THU',
    6 : 'FRI',
    7 : 'SAT'
}

# Fill the currentState column (3-letter day string + facility name)
for i, row in data.iterrows():
    weekDayStr = weekDays[row['day_of_week']]
    currentStateStr = weekDayStr + row['facility']
    data.loc[i, 'currentState'] = currentStateStr
    #data.loc[i, 'day_of_week']  = weekDayStr

# Fill the nextState column: each row's nextState is the following row's currentState
for i, row in data.iterrows():
    if i > 0:
        data.loc[i - 1, 'nextState'] = row['currentState']

# Remove unwanted columns 
del data['currentState']

#Keep only the rows which have nextState values
data = data[data.nextState != '']

predictors  = pd.DataFrame({'facility' : data['facility'], 'day_of_week': data['day_of_week']})
#labels      = pd.DataFrame({'nextState' : data['nextState']})
#labels = np.array(pd.Series(data['nextState']))
tempLabels      = pd.DataFrame({'nextState' : data['nextState']})
labels = pd.Series(data['nextState'])

#Convert labels to INT values
uniqueLabels = labels.unique()
strLabelToInt = {}
for i in range(len(uniqueLabels)):
    strLabelToInt[uniqueLabels[i]] = i 
print('Label lookup dictionary: ', strLabelToInt)

for i, row in tempLabels.iterrows():
    tempLabels.loc[i, 'nextState']  = int(strLabelToInt[row['nextState']])


labels.update(tempLabels['nextState'])
labels = np.array(labels, dtype='int64')

# Convert labels to numpy array
# labels = labels.values

# Drop rows with missing values (dropna, no imputation)
predictors = predictors.dropna()

# Perform One-hot encoding on Categorical variables in the dataset
#predictors = pd.get_dummies(predictors)

#Convert predictors to INT values
uniqueFacilityNames = pd.unique(predictors['facility'])
strFacilityToInt = {}
for i in range(len(uniqueFacilityNames)):
    strFacilityToInt[uniqueFacilityNames[i]] = i + 1
print('Facility lookup dictionary: ', strFacilityToInt)
tempFacilityLabels = pd.DataFrame({'facility' : predictors['facility']})
for i, row in tempFacilityLabels.iterrows():
    tempFacilityLabels.loc[i, 'facility']  = int(strFacilityToInt[row['facility']])

predictors['facility'].update(tempFacilityLabels['facility'])
predictors = np.array(predictors, dtype='int64')

#Create train/test split (note: train_test_split moved to sklearn.model_selection in newer scikit-learn)
from sklearn.cross_validation import train_test_split
print('Predictors shape: ',predictors.shape)
print('Labels shape: ', labels.shape)

X_train, X_test, y_train, y_test = train_test_split(predictors, labels, test_size=0.15, random_state=0) 

# convert train/test data to numpy arrays (Not sure if PyStruct supports pandas df's)
#X_train = X_train.values
#X_test = X_test.values

print('X_train type: ', type(X_train))
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)
print('y_train type: ', type(y_train))
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)
print('X_train example 1st row element type: ', type(X_train[0][0]))
print('y_train example 1st row element type: ', type(y_train[0]))

X_train_bias = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test_bias = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

from time import time
from pystruct.models import MultiClassClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM, SubgradientSSVM, FrankWolfeSSVM)
from sklearn.svm import LinearSVC

target_class_count = len(np.unique(y_train))
print('Unique labels: len(np.unique(y_train)) ', target_class_count)

model = MultiClassClf(n_features=X_train_bias.shape[1], n_classes=target_class_count)
n_slack_svm = NSlackSSVM(model, verbose=2, check_constraints=False, C=0.1,
                         batch_size=100, tol=1e-2)
one_slack_svm = OneSlackSSVM(model, verbose=2, C=.10, tol=.001)
subgradient_svm = SubgradientSSVM(model, C=0.1, learning_rate=0.000001,
                                  max_iter=1000, verbose=0)

fw_bc_svm = FrankWolfeSSVM(model, C=.1, max_iter=50)
fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)

# n-slack cutting plane ssvm
#start = time()
#n_slack_svm.fit(X_train_bias, y_train)
#time_n_slack_svm = time() - start
#y_pred = np.hstack(n_slack_svm.predict(X_test_bias))
#print("Score with pystruct n-slack ssvm: %f (took %f seconds)"
#      % (np.mean(y_pred == y_test), time_n_slack_svm))

## 1-slack cutting plane ssvm
start = time()
one_slack_svm.fit(X_train_bias, y_train)
time_one_slack_svm = time() - start
y_pred = np.hstack(one_slack_svm.predict(X_test_bias))
print("Score with pystruct 1-slack ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_one_slack_svm))

#online subgradient ssvm
start = time()
subgradient_svm.fit(X_train_bias, y_train)
time_subgradient_svm = time() - start
y_pred = np.hstack(subgradient_svm.predict(X_test_bias))

print("Score with pystruct subgradient ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_subgradient_svm))

# the standard one-vs-rest multi-class would probably be as good and faster
# but solving a different model
libsvm = LinearSVC(multi_class='crammer_singer', C=.1)
start = time()
libsvm.fit(X_train, y_train)
time_libsvm = time() - start
print("Score with sklearn and libsvm: %f (took %f seconds)"
      % (libsvm.score(X_test, y_test), time_libsvm))


start = time()
fw_bc_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
time_fw_bc_svm = time() - start
print("Score with pystruct frankwolfe block coordinate ssvm: %f (took %f seconds)" %
      (np.mean(y_pred == y_test), time_fw_bc_svm))

start = time()
fw_batch_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_batch_svm.predict(X_test_bias))
time_fw_batch_svm = time() - start
print("Score with pystruct frankwolfe batch ssvm: %f (took %f seconds)" %
      (np.mean(y_pred == y_test), time_fw_batch_svm))

I have tried using a one-hot representation of my data (int numpy arrays), but in vain... Does this have something to do with joint_feature(x, y) described in the user guide? https://pystruct.github.io/user_guide.html

If yes, can someone please shed light on what I am missing here? I am not sure I understand whether X and y need to have a specific shape, etc., for this to work.
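
For reference, my understanding of joint_feature(x, y) for MultiClassClf from the user guide is that it copies the feature vector x into the block of the weight vector that belongs to class y (a rough sketch of the idea, not pystruct's actual implementation):

import numpy as np

def joint_feature(x, y, n_classes):
    # block-structured joint feature: x lands in block y, all other blocks stay zero
    result = np.zeros(n_classes * x.shape[0])
    result[y * x.shape[0]:(y + 1) * x.shape[0]] = x
    return result

# a 3-feature sample assigned to class 1 out of 4 classes:
print(joint_feature(np.array([1., 2., 3.]), 1, 4))
# [ 0.  0.  0.  1.  2.  3.  0.  0.  0.  0.  0.  0.]

If that is right, it would also explain why a label value of 20 cannot work with n_classes=20, since there is no block 20 to write into.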

Please advise,

Thanks!