Cuda program not working for more than 1024 thread

My program is of Odd-even merge sort and it's not working for more than 1024 threads.

I have already tried increasing the block size to 100 but it still not working for more than 1024 threads.

I'm using Visual Studio 2012 and I have Nvidia Geforce 610M. This is my program

#include<stdio.h>
#include<iostream>
#include<conio.h>
#include <random>
#include <stdint.h>
#include <driver_types.h >


__global__ void odd(int *arr,int n){
    int i=threadIdx.x;
    int temp;
    if(i%2==1&&i<n-1){
        if(arr[i]>arr[i+1])
        {
            temp=arr[i];
            arr[i]=arr[i+1];
            arr[i+1]=temp;
        }
    }
}

__global__ void even(int *arr,int n){
    int i=threadIdx.x;
    int temp;
    if(i%2==0&&i<n-1){
        if(arr[i]>arr[i+1])
        {
            temp=arr[i];
            arr[i]=arr[i+1];
            arr[i+1]=temp;
        }
    }
}

int main(){
    int SIZE,k,*A,p,j;
    int *d_A;
    float time;

    printf("Enter the size of the array\n");
    scanf("%d",&SIZE);
    A=(int *)malloc(SIZE*sizeof(int));
    cudaMalloc(&d_A,SIZE*sizeof(int));
    for(k=0;k<SIZE;k++)
    A[k]=rand()%1000;


    cudaMemcpy(d_A,A,SIZE*sizeof(int),cudaMemcpyHostToDevice);
    if(SIZE%2==0)
        p=SIZE/2;
    else
        p=SIZE/2+1;


    for(j=0;j<p;j++){
        even<<<3,SIZE>>>(d_A,SIZE);
        if(j!=p-1)
            odd<<<3,SIZE>>>(d_A,SIZE);
        if(j==p-1&&SIZE%2==0)
            odd<<<1,SIZE>>>(d_A,SIZE);
    }


    cudaMemcpy(A,d_A,SIZE*sizeof(int),cudaMemcpyDeviceToHost);
    for(k=0;k<SIZE;k++)
        printf("%d ",A[k]);


    free(A);
    cudaFree(d_A);

    getch();

}

CUDA threadblocks are limited to 1024 threads (or 512 threads, for cc 1.x gpus). The size of the threadblock is indicated in the second kernel configuration parameter in the kernel launch:

    even<<<3,SIZE>>>(d_A,SIZE);
             ^^^^

So when you enter a SIZE value greater than 1024, this kernel will not launch.

You're getting no indication of this because you're not doing proper cuda error checking which is always a good idea any time you're having trouble with a CUDA code. You can also, as a quick test, run your code with cuda-memcheck to look for API errors.