The difference between the measured running time and the time at which the results are obtained

I am trying to implement my algorithm on a GPU using CUDA. The program works, but there is a problem: when I try to print out the results, they are shown much too late. Here is some of my code. Assume that the correctness of the results does not matter.

// 128-byte state buffer in device global memory. The host fills it with
// cudaMemcpyToSymbolAsync, the AESROUND kernel reads and overwrites it in
// place, and the host reads it back with cudaMemcpyFromSymbolAsync.
__device__ unsigned char dev_state[128];

// Multiplies two bytes as elements of GF(2^8), reducing by the polynomial
// x^8 + x^4 + x^3 + x + 1 (low byte 0x1b).
//
// a, b : the two field elements (taken by value; the local copies are consumed).
// Returns the product a * b in GF(2^8).
__device__ unsigned char GMul(unsigned char a, unsigned char b) {
    unsigned char product = 0;
    int bitsLeft = 8;

    // Shift-and-add: one pass per bit of b, least significant bit first.
    while (bitsLeft-- > 0) {
        // Add (XOR) the current multiple of a when the low bit of b is set.
        product ^= (b & 1) ? a : 0;

        // Double a; if the bit shifted out was set, fold it back in with the
        // reduction polynomial.
        const bool carryOut = (a & 0x80) != 0;
        a <<= 1;
        if (carryOut) {
            a ^= 0x1b;
        }

        b >>= 1;
    }

    return product;
}


// Kernel that rewrites one byte of the global dev_state buffer 512 times.
//
// The byte index is blockIdx.x alone, so every thread of a block targets the
// same element; threadIdx is never consulted. The kernel takes no arguments
// and communicates only through dev_state.
//
// NOTE(review): dev_rkey and dev_sh_state are block-shared scalars that every
// thread in the block writes and then reads with no __syncthreads() in
// between, and dev_state[state_idx] is read-modified-written by all of those
// threads as well. With more than one thread per block this looks like a data
// race -- confirm whether per-thread locals / a single worker thread were
// intended.
// NOTE(review): (state_idx + 5) % 16 and (offset + 5) % 16 always fall in
// 0..15, i.e. on bytes that blocks 0..15 are concurrently overwriting, so the
// values read presumably depend on block scheduling -- verify if results ever
// need to be reproducible.
__global__ void AESROUND()
{
    // Block-wide scratch: the byte's previous value and the freshly mixed value.
    __shared__ unsigned char dev_rkey;
    __shared__ unsigned char dev_sh_state;
    // One state byte per block.
    int state_idx = blockIdx.x;
    // state_idx rounded down to a multiple of 4 (start of its 4-byte group).
    int offset = ((state_idx / 4)) *4;

    for (int i = 0; i < 512; i++)
    {
        // Remember the current value of this block's byte.
        dev_rkey = dev_state[state_idx];

        // XOR of: 2 * (own byte), 3 * (byte at (state_idx + 5) % 16) in GF(2^8),
        // and the byte at (offset + 5) % 16 taken twice. The last two operands
        // are the same expression, so if the byte does not change between the
        // two reads they cancel each other out.
        dev_sh_state= GMul(dev_state[state_idx], 0x02) ^ GMul(dev_state[(state_idx + 5) % 16], 0x03) ^ dev_state[(offset + 5) % 16] ^ dev_state[(offset + 5) % 16];


        // Store the mixed value XORed with the value saved at the top of the iteration.
        dev_state[state_idx] = dev_sh_state ^ dev_rkey;
    }


}

Calling AESROUND:

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: uploads a 128-byte state (the 16-byte pattern `p` repeated),
// enqueues 1024 launches of AESROUND, prints the host-side time spent in the
// launch loop, then copies the state back and prints its first 16 bytes.
// Returns 0 on success and 1 if any CUDA call reports an error.
int main()
{
    const unsigned char p[] = { 0x19, 0x3d, 0xe3, 0xbe, 0xa0, 0xf4, 0xe2, 0x2b,
                                0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08 };

    unsigned char h_state[128];
    for (int i = 0; i < 128; i++)
        h_state[i] = p[i % 16];

    // h_state is ordinary pageable stack memory, so use the blocking copy
    // rather than the *Async variant.
    if (!cudaSucceeded(cudaMemcpyToSymbol(dev_state, h_state, sizeof(h_state)),
                       "cudaMemcpyToSymbol"))
        return 1;

    // NOTE: kernel launches are asynchronous. This interval therefore covers
    // only the host-side cost of enqueueing the launches, not the time the
    // GPU spends executing them; call cudaDeviceSynchronize() before the
    // second clock() to include device execution time.
    const clock_t start = clock();
    for (int launch = 0; launch < 1024; launch++)
        AESROUND<<<128, 128>>>();
    const clock_t finish = clock();

    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cudaSucceeded(cudaGetLastError(), "AESROUND launch"))
        return 1;

    // clock() counts ticks; convert to milliseconds instead of assuming that
    // CLOCKS_PER_SEC is 1000.
    const double elapsedMs = 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC;
    printf("\n\nprocessing time: %2.15f (ms)\n", elapsedMs);

    // Blocking copy on the default stream: it waits for the queued kernels to
    // finish, so h_state is valid when it returns, and any error raised while
    // the kernels ran is reported here.
    if (!cudaSucceeded(cudaMemcpyFromSymbol(h_state, dev_state, sizeof(h_state)),
                       "cudaMemcpyFromSymbol"))
        return 1;

    printf("\nstate After Encryption:\n ");
    for (int i = 0; i < 16; i++)
        printf("%x ", h_state[i]);

    getchar();  // keep the console window open until a key is pressed
    return 0;
}

Here are the results:

processing time: 1.0000000000000 (ms)

-after a long time (~5 seconds), the next line is shown-

state after encryption:
88 91 23 09 78 65 11 87 65 43 56 71 20 93 18 70

As you can see, the reported processing time is very short, but the 128 bytes are shown much later. Why does this happen? Is it related to the GPU? How can I fix it?

The confusion here seems to have arisen out of using a host-based timing method to time what is (mostly) device activity.

Kernel launches are asynchronous. The host code launches the kernel, and then proceeds without waiting for the kernel to complete. Therefore this kind of timing:

start = clock();
// Each launch returns to the host as soon as it has been queued.
for (int launch = 0; launch < 1024; ++launch) {
    AESROUND<<<128, 128>>>();
}
finish = clock();

is only measuring the kernel launch time. (This is true even in this case where the kernel is launched repeatedly in a loop. If some device queues are not exceeded, each kernel launch will be asynchronous, allowing the host thread, i.e. the for-loop, to continue.)

In order to measure the complete device execution time, you could do something like this:

start = clock();
// Queue all 1024 launches; the host does not wait for any of them here.
for (int launch = 0; launch < 1024; ++launch) {
    AESROUND<<<128, 128>>>();
}
// Block the host until every queued kernel has finished, so the interval
// includes device execution time.
cudaDeviceSynchronize();
finish = clock();