I have code that runs sample_size sequences of matrix multiplications, and each sequence involves seq_length operations, each of which is a sum of matrix products. The shortcoming of my code is that as soon as seq_length gets much higher than around 300, the algorithm slows down noticeably, and needless to say, as seq_length gets bigger the whole thing gets slower and slower. So I was wondering whether there is any optimization/vectorization that could be applied to the way I've written my code, or to my code in general.
Basically, here I'm just defining a bunch of 2x2 matrices with complex entries that will be used within my algorithm later. The cliff_operators() function then picks one of these matrices at random.
import random
import numpy as np
import matplotlib.pyplot as plt
import time
import sys
init_state = np.array([[1, 0], [0, 0]], dtype=complex)
II = np.identity(2, dtype=complex)
X = np.array([[0, 1], [1, 0]], dtype=complex)
Y = np.array([[0, -1j], [1j, 0]], dtype=complex)
Z = np.array([[1, 0], [0, -1]], dtype=complex)
PPP = (-II + 1j*X + 1j*Y + 1j*Z)/2
PPM = (-II + 1j*X + 1j*Y - 1j*Z)/2
PMM = (-II + 1j*X - 1j*Y - 1j*Z)/2
MMM = (-II - 1j*X - 1j*Y - 1j*Z)/2
MMP = (-II - 1j*X - 1j*Y + 1j*Z)/2
MPP = (-II - 1j*X + 1j*Y + 1j*Z)/2
PMP = (-II + 1j*X - 1j*Y + 1j*Z)/2
MPM = (-II - 1j*X + 1j*Y - 1j*Z)/2
def cliff_operators():
    # Pick one of the predefined 2x2 operators uniformly at random
    return random.choice([II, X, Y, Z, PPP, PPM, PMM, MMM, MMP, MPP, PMP, MPM])
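For reference, a single call just hands back one of the 2x2 arrays defined above (U below is only an illustrative name for this quick check, not part of the algorithm):

U = cliff_operators()
print(U.shape, U.dtype)                             # (2, 2) complex128
print(np.allclose(U @ U.conj().T, np.identity(2)))  # True: every choice is unitary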
Here the compute_channel_operation function applies each matrix in the input block of operators to rho (one matrix product per slice of the tensor) and then sums the resulting matrices over the first axis.
def compute_channel_operation(rho, operators):
    # Apply each operator K_i to rho and sum: sum_i K_i @ rho @ K_i^dagger
    return np.sum(operators @ rho @ operators.transpose(0, 2, 1).conj(), axis=0)

def depolarizing_error(param):
    # Kraus operators of a depolarizing channel: sqrt(1-p)*I plus sqrt(p/3)*X, Y, Z
    XYZ = np.sqrt(param/3)*np.array([X, Y, Z])
    return np.array([np.sqrt(1-param)*II, XYZ[0], XYZ[1], XYZ[2]])

def random_angles(sd):
    # Three normally distributed angles with standard deviation sd
    return np.random.normal(0, sd, 3)

def unitary_error(params):
    # Small unitary rotation built from the angles in params,
    # returned as a (1, 2, 2) stack so it can be fed to compute_channel_operation
    e_1 = np.exp(-1j*(params[0]+params[2])/2)*np.cos(params[1]/2)
    e_2 = np.exp(-1j*(params[0]-params[2])/2)*np.sin(params[1]/2)
    return np.array([[[e_1, e_2], [-e_2.conj(), e_1.conj()]]])
def rb(input_state, seq_length, sample_size, noise_mean, noise_sd, noise2_sd):
    fidelity = []
    for i in range(1, sample_size+1):
        rho = input_state
        sequence = []
        for j in range(1, seq_length+1):
            noise = depolarizing_error(np.random.normal(noise_mean, noise_sd))
            noise_2 = unitary_error(random_angles(noise2_sd))
            unitary = cliff_operators()
            sequence.append(unitary)
            # Apply the ideal Clifford, then the depolarizing and unitary errors
            i_ideal_operator = compute_channel_operation(rho, np.array([unitary]))
            i_noisy_operator = compute_channel_operation(i_ideal_operator, noise)
            i_noisy_operator_2 = compute_channel_operation(i_noisy_operator, noise_2)
            sys.stdout.write("\r" + "gate applied: " + str(j))
            rho = i_noisy_operator_2
        # Final random noise
        noise = depolarizing_error(np.random.normal(noise_mean, noise_sd))
        noise_2 = unitary_error(random_angles(noise2_sd))
        # Compute the Hermitian conjugate of the forward operator sequence
        unitary_plus_1 = np.linalg.multi_dot(sequence[::-1]).conj().T
        # Final ideal & noisy density operator
        f_ideal_operator = compute_channel_operation(rho, np.array([unitary_plus_1]))
        f_noisy_operator = compute_channel_operation(f_ideal_operator, noise)
        f_noisy_operator_2 = compute_channel_operation(f_noisy_operator, noise_2)
        fidelity.append(np.trace(input_state@f_noisy_operator_2))
    avg_fidelity = (1/sample_size)*np.sum(fidelity)
    return avg_fidelity
def get_data(rho, seq_length, sample_size, noise_mean, noise_sd, noise2_sd):
    length = []
    fidelity_s = []
    for s in range(2, seq_length):
        avg_fidelity = rb(rho, s, sample_size, noise_mean, noise_sd, noise2_sd)
        length.append(s)
        fidelity_s.append(avg_fidelity)
    plt.plot(length, fidelity_s)
    plt.title("Fidelity vs Clifford length")
    plt.ylim(0.5, 1)
    plt.xlabel("Clifford length")
    plt.ylabel("Fidelity")
    plt.xlim(0, 100)
    plt.show()
starttime = time.time()
get_data(init_state, 402, 1, 0.005, 0.001, 0.01)
timeElapsed = time.time() - starttime
print(timeElapsed)
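To make the description of compute_channel_operation above concrete, here is a small sanity check I ran (assuming the definitions above; ops and explicit are just illustrative names), confirming that the broadcasted expression matches the explicit sum over operators:

# Stack two of the operators defined earlier into a (2, 2, 2) tensor
ops = np.array([X, Z])
# Explicit version of the same channel: sum_k ops[k] @ rho @ ops[k]^dagger
explicit = sum(op @ init_state @ op.conj().T for op in ops)
print(np.allclose(compute_channel_operation(init_state, ops), explicit))  # True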
So could vectorization potentially be implemented to remove the i and j loops and keep the code fast as seq_length gets bigger? Can the loop over sample_size be vectorised so that all n sequences run at the same time in one big array?
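For clarity, by "one big array" I'm imagining something along the lines of the sketch below (hypothetical code, not what I currently have; n_samples, rho_batch and cliffords are made-up names), where NumPy's batched matmul applies a different operator to each sample's state in a single call:

# Hypothetical sketch: one 2x2 state per sample, stacked into an (n_samples, 2, 2) array
n_samples = 5
rho_batch = np.repeat(init_state[np.newaxis, :, :], n_samples, axis=0)
# One random Clifford per sample, also stacked into (n_samples, 2, 2)
cliffords = np.array([cliff_operators() for _ in range(n_samples)])
# Batched matmul: slice k computes cliffords[k] @ rho_batch[k] @ cliffords[k]^dagger
rho_batch = cliffords @ rho_batch @ cliffords.transpose(0, 2, 1).conj()
print(rho_batch.shape)  # (5, 2, 2)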