I have a sparse csr_matrix, and I want to change the values of a single row to different values. I can't find an easy and efficient implementation however. This is what it has to do:
# Build a 3x3 CSR matrix; its last row (index 2) is the one to be replaced.
A = csr_matrix([[0, 1, 0],
[1, 0, 1],
[0, 1, 0]])
# Replacement values, one per column of A.
new_row = np.array([-1, -1, -1])
# Desired behaviour: set_row_csr hands back the matrix with row 2 overwritten,
# which is then densified for printing.
print(set_row_csr(A, 2, new_row).todense())
>>> [[ 0, 1, 0],
[ 1, 0, 1],
[-1, -1, -1]]
This is my current implementation of set_row_csr:
def set_row_csr(A, row_idx, new_row):
    """Overwrite row ``row_idx`` of ``A`` with ``new_row`` and return ``A``.

    The row is written through the matrix's own item assignment, so ``A`` is
    modified in place and the very same object is handed back to the caller.
    The surrounding text reports that doing this on a ``csr_matrix`` emits a
    ``SparseEfficiencyWarning``.
    """
    # (row_idx, slice(None)) is the explicit spelling of the index ``row_idx, :``.
    whole_row = (row_idx, slice(None))
    A[whole_row] = new_row
    return A
But this gives me a SparseEfficiencyWarning. Is there a way of getting this done without manual index juggling, or is that my only way out?
In the end, I managed to get this done with index juggling.
def set_row_csr(A, row_idx, new_row):
    '''
    Replace a row in a CSR sparse matrix A.

    The row is rewritten directly in the underlying ``data`` / ``indices`` /
    ``indptr`` arrays. Every element of ``new_row`` is stored explicitly
    (zeros included), so the row holds ``A.shape[1]`` entries afterwards.

    Parameters
    ----------
    A: csr_matrix
        Matrix to change
    row_idx: int
        index of the row to be changed; negative values count from the
        last row, as with regular Python indexing
    new_row: np.array
        list of new values for the row of A

    Returns
    -------
    None (the matrix A is changed in place)

    Raises
    ------
    AssertionError
        If A is not a csr_matrix, if row_idx is out of range, or if new_row
        has no length or a length different from the number of columns.

    Prerequisites
    -------------
    The row index shall be smaller than the number of rows in A
    The number of elements in new row must be equal to the number of columns in matrix A
    '''
    assert sparse.isspmatrix_csr(A), 'A shall be a csr_matrix'
    assert row_idx < A.shape[0], \
        'The row index ({0}) shall be smaller than the number of rows in A ({1})' \
        .format(row_idx, A.shape[0])
    assert row_idx >= -A.shape[0], \
        'The row index ({0}) is out of range for a matrix with {1} rows' \
        .format(row_idx, A.shape[0])
    if row_idx < 0:
        # Without normalisation, indptr[row_idx + 1] would wrap around and
        # the slices below would silently corrupt the matrix.
        row_idx += A.shape[0]

    try:
        N_elements_new_row = len(new_row)
    except TypeError:
        msg = 'Argument new_row shall be a list or numpy array, is now a {0}'\
            .format(type(new_row))
        raise AssertionError(msg)

    N_cols = A.shape[1]
    assert N_cols == N_elements_new_row, \
        'The number of elements in new row ({0}) must be equal to ' \
        'the number of columns in matrix A ({1})' \
        .format(N_elements_new_row, N_cols)

    idx_start_row = int(A.indptr[row_idx])
    idx_end_row = int(A.indptr[row_idx + 1])
    additional_nnz = N_cols - (idx_end_row - idx_start_row)

    # indices and indptr must share one integer dtype. np.arange defaults to
    # the platform integer (usually int64), which used to upcast ``indices``
    # while ``indptr`` stayed int32 and broke later conversions such as
    # lil_matrix(A).
    idx_dtype = np.result_type(A.indices.dtype, A.indptr.dtype)
    if int(A.indptr[-1]) + additional_nnz > np.iinfo(idx_dtype).max:
        # The enlarged matrix no longer fits the current index type.
        idx_dtype = np.dtype(np.int64)

    A.data = np.concatenate(
        (A.data[:idx_start_row], np.asarray(new_row), A.data[idx_end_row:]))
    A.indices = np.concatenate(
        (A.indices[:idx_start_row],
         np.arange(N_cols, dtype=idx_dtype),
         A.indices[idx_end_row:])).astype(idx_dtype, copy=False)

    # Rows after the replaced one start ``additional_nnz`` entries later.
    new_indptr = A.indptr.astype(idx_dtype)
    new_indptr[row_idx + 1:] += additional_nnz
    A.indptr = new_indptr
physicalattraction's answer is indeed significantly quicker: it is much faster than my solution, which was to add a separate matrix with only that single row set, although the addition solution was still faster than the slicing solution.
The takeaway for me is that the fastest way to set rows in a csr_matrix or columns in a csc_matrix is to modify the underlying data yourself.
def time_copy(A, num_tries = 10000):
    """Return the seconds spent calling ``A.copy()`` ``num_tries`` times.

    The result is used as a baseline so that the cost of copying the matrix
    can be subtracted from a benchmark that has to copy it on every run.

    Parameters
    ----------
    A: object with a ``copy()`` method
        Object to copy repeatedly
    num_tries: int
        Number of copies to make

    Returns
    -------
    float: elapsed time in seconds
    """
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustments) and skew short timings.
    start = time.perf_counter()
    for _ in range(num_tries):
        A.copy()
    return time.perf_counter() - start
def test_method(func, A, row_idx, new_row, num_tries = 10000):
    """Benchmark ``func(A_copy, row_idx, new_row)`` and print the duration.

    ``func`` is called ``num_tries`` times, each time on a fresh copy of
    ``A``. The time needed for the copies alone (measured with
    ``time_copy``) is subtracted before the result is printed.

    Parameters
    ----------
    func: callable
        Row-setting function taking ``(A, row_idx, new_row)``
    A: object with a ``copy()`` method
        Matrix handed (as a copy) to ``func``
    row_idx: int
        Row index forwarded to ``func``
    new_row: np.array
        New row values forwarded to ``func``
    num_tries: int
        Number of repetitions

    Returns
    -------
    None (the duration in seconds is printed)
    """
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that can jump and skew short timings.
    start = time.perf_counter()
    for _ in range(num_tries):
        func(A.copy(), row_idx, new_row)
    elapsed = time.perf_counter() - start
    copy_time = time_copy(A, num_tries)
    print("Duration {}".format(elapsed - copy_time))
def set_row_csr_slice(A, row_idx, new_row):
    """Overwrite row ``row_idx`` of ``A`` in place via item assignment.

    Nothing is returned; the change is visible through ``A`` itself.
    """
    # (row_idx, slice(None)) is the explicit spelling of the index ``row_idx, :``.
    whole_row = (row_idx, slice(None))
    A[whole_row] = new_row
def set_row_csr_addition(A, row_idx, new_row):
    """Set row ``row_idx`` of the CSR matrix ``A`` by adding a one-row matrix.

    A matrix of the same shape as ``A`` is built whose only populated row
    holds the difference between ``new_row`` and the current row; adding it
    to ``A`` therefore replaces the row rather than accumulating onto it.
    The arrays of the sum are then written back into ``A`` so the caller's
    matrix is really modified.

    Parameters
    ----------
    A: csr_matrix
        Matrix to change (any shape, not only square)
    row_idx: int
        index of the row to be changed; negative values count from the end
    new_row: np.array
        new values for the row, one per column of A

    Returns
    -------
    None (the matrix A is changed in place)
    """
    n_rows, n_cols = A.shape
    if row_idx < 0:
        row_idx += n_rows

    # Adding (new - old) turns the addition into an assignment.
    current_row = A[row_idx, :].toarray().ravel()
    delta = np.asarray(new_row) - current_row

    # indptr has one entry per row plus one (not per column) and must be
    # an integer array: only row ``row_idx`` owns the n_cols stored values.
    indptr = np.zeros(n_rows + 1, dtype=A.indptr.dtype)
    indptr[row_idx + 1:] = n_cols
    indices = np.arange(n_cols, dtype=A.indices.dtype)

    # ``A += ...`` would merely rebind the local name, so copy the arrays of
    # the result back into the caller's matrix instead.
    updated = A + csr_matrix((delta, indices, indptr), shape=A.shape)
    A.data, A.indices, A.indptr = updated.data, updated.indices, updated.indptr
>>> A = csr_matrix((np.ones(1000), (np.random.randint(0,1000,1000), np.random.randint(0, 1000, 1000))))
>>> test_method(set_row_csr_slice, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 4.938395977020264
>>> test_method(set_row_csr_addition, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 2.4161765575408936
>>> test_method(set_row_csr, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 0.8432261943817139
The slice solution also scales much worse with the size and sparsity of the matrix.
# Larger matrix, same average number of non-zeros per row
>>> A = csr_matrix((np.ones(10000), (np.random.randint(0,10000,10000), np.random.randint(0, 10000, 10000))))
>>> test_method(set_row_csr_slice, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 18.335174798965454
>>> test_method(set_row_csr, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 1.1089558601379395
# Super sparse matrix
>>> A = csr_matrix((np.ones(100), (np.random.randint(0,10000,100), np.random.randint(0, 10000, 100))))
>>> test_method(set_row_csr_slice, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 13.371600151062012
>>> test_method(set_row_csr, A, 200, np.ones(A.shape[1]), num_tries = 10000)
Duration 1.0454308986663818
Something is wrong with this set_row_csr. Yes, it is fast, and it seemed to work for some test cases. However, it seems to garble the internal CSR structure of the sparse matrix in my test cases. Try lil_matrix(A) afterwards and you will see error messages.
In physicalattraction's answer, len(new_row) must be equal to A.shape[1], which may not be desirable when adding sparse rows.
So, based on his answer, I've come up with a method to set rows in a CSR matrix while keeping the sparsity property. Additionally, I've added a method to convert dense arrays to sparse arrays (in data, indices format).
def to_sparse(dense_arr):
    """Split a dense 1-D sequence into its non-zero values and their positions.

    Parameters
    ----------
    dense_arr: iterable
        Dense row values

    Returns
    -------
    (data, indices): tuple of two lists
        ``data`` holds the non-zero values in order of appearance and
        ``indices`` their positions in ``dense_arr``. Both lists are empty
        when the input is empty or contains only zeros.
    """
    nonzero = [(value, index) for index, value in enumerate(dense_arr) if value != 0]
    # Two comprehensions instead of zip(*...): unpacking an empty list yields
    # nothing to index, which used to raise IndexError for all-zero rows.
    data = [value for value, _ in nonzero]
    indices = [index for _, index in nonzero]
    return data, indices
def set_row_csr_unbounded(A, row_idx, new_row_data, new_row_indices):
    '''
    Replace a row in a CSR sparse matrix A.

    Only the given (value, column) pairs are stored, so the row stays
    sparse. The entries are sorted by column index before being written.
    Duplicate column indices are not merged.

    Parameters
    ----------
    A: csr_matrix
        Matrix to change
    row_idx: int
        index of the row to be changed; negative values count from the
        last row, as with regular Python indexing
    new_row_data: np.array
        list of new values for the row of A
    new_row_indices: np.array
        list of column indices for the new values

    Returns
    -------
    None (the matrix A is changed in place)

    Raises
    ------
    AssertionError
        If A is not a csr_matrix, if row_idx or a column index is out of
        range, or if the two row arguments have no length or differ in size.

    Prerequisites
    -------------
    The row index shall be smaller than the number of rows in A
    Row data and row indices must have the same size
    '''
    assert isspmatrix_csr(A), 'A shall be a csr_matrix'
    assert row_idx < A.shape[0], \
        'The row index ({0}) shall be smaller than the number of rows in A ({1})' \
        .format(row_idx, A.shape[0])
    assert row_idx >= -A.shape[0], \
        'The row index ({0}) is out of range for a matrix with {1} rows' \
        .format(row_idx, A.shape[0])
    if row_idx < 0:
        # Without normalisation, indptr[row_idx + 1] would wrap around and
        # the slices below would silently corrupt the matrix.
        row_idx += A.shape[0]

    try:
        N_elements_new_row = len(new_row_data)
    except TypeError:
        msg = 'Argument new_row_data shall be a list or numpy array, is now a {0}'\
            .format(type(new_row_data))
        raise AssertionError(msg)

    try:
        N_new_indices = len(new_row_indices)
    except TypeError:
        msg = 'Argument new_row_indices shall be a list or numpy array, is now a {0}'\
            .format(type(new_row_indices))
        raise AssertionError(msg)
    assert N_elements_new_row == N_new_indices, \
        'new_row_data and new_row_indices must have the same size'

    raw_indices = np.asarray(new_row_indices)
    if raw_indices.size:
        assert raw_indices.min() >= 0 and raw_indices.max() < A.shape[1], \
            'new_row_indices must lie in [0, {0})'.format(A.shape[1])

    idx_start_row = int(A.indptr[row_idx])
    idx_end_row = int(A.indptr[row_idx + 1])
    # Later rows shift by the *change* in stored entries, not by the size of
    # the new row: the entries of the old row are removed at the same time.
    nnz_change = N_elements_new_row - (idx_end_row - idx_start_row)

    # indices and indptr must share one integer dtype, otherwise later
    # conversions of A can fail.
    idx_dtype = np.result_type(A.indices.dtype, A.indptr.dtype)
    if int(A.indptr[-1]) + nnz_change > np.iinfo(idx_dtype).max:
        # The enlarged matrix no longer fits the current index type.
        idx_dtype = np.dtype(np.int64)

    new_indices = raw_indices.astype(idx_dtype)
    new_data = np.asarray(new_row_data)
    if new_data.size == 0:
        # An empty list becomes a float64 array; keep A's dtype instead of
        # upcasting the whole matrix when a row is merely cleared.
        new_data = new_data.astype(A.data.dtype)

    # Keep the column indices of the row in ascending order.
    order = np.argsort(new_indices, kind='stable')
    new_indices = new_indices[order]
    new_data = new_data[order]

    A.data = np.concatenate(
        (A.data[:idx_start_row], new_data, A.data[idx_end_row:]))
    A.indices = np.concatenate(
        (A.indices[:idx_start_row], new_indices, A.indices[idx_end_row:])
    ).astype(idx_dtype, copy=False)

    new_indptr = A.indptr.astype(idx_dtype)
    new_indptr[row_idx + 1:] += nnz_change
    A.indptr = new_indptr