I need to optimize my kernel function. Please help!

I am trying to build a kernel function and measure its performance.
The code works, but I want to optimize it to run faster.

Is there a better way to implement the kernel function?

// Signal extent used for the kernel's bounds check and for sizing the grid.
// Parenthesized so the macro expands as one value inside larger expressions
// (without the parentheses, `x % SIGNAL_SIZE` would parse as `(x % 1024) * 1024`).
#define SIGNAL_SIZE (1024*1024)
// Block extent used when building the launch configuration and index offsets.
#define TILE_WIDTH 512


/*
 * In-place element-wise scaling of a complex signal by an integer signal:
 *     a[i].x *= b[i];  a[i].y *= b[i];   for every i in [0, SIGNAL_SIZE)
 *
 * a : device pointer to the complex samples; overwritten with the result.
 * b : device pointer to the integer scale factors; only read.
 *
 * NOTE(review): assumes both buffers hold exactly SIGNAL_SIZE elements
 * (a SIGNAL_SIZE x SIGNAL_SIZE buffer would need 2^40 entries) -- confirm
 * against the allocation code.
 *
 * Launch layout: any 1D or 2D grid of 1D or 2D blocks (z extents must be 1).
 * Threads are flattened to a single linear id and walk the data with a
 * grid-sized stride, so every element is processed exactly once regardless
 * of how many threads were launched.
 */
__global__ void point_wise_product(cufftComplex *a, int *b){

	const size_t count = SIGNAL_SIZE;

	// Flatten (block, thread) into one linear id. threadIdx.x varies fastest,
	// so neighbouring lanes of a warp touch neighbouring elements.
	const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
	const size_t blockId = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
	const size_t first = blockId * threadsPerBlock
	                   + (size_t)threadIdx.y * blockDim.x + threadIdx.x;
	const size_t stride = threadsPerBlock * gridDim.x * gridDim.y;

	for(size_t i = first; i < count; i += stride){
		// Read the scale once and move the complex value as a whole
		// (one 8-byte load and one 8-byte store per element).
		const float scale = (float)b[i];
		cufftComplex v = a[i];
		v.x *= scale;
		v.y *= scale;
		a[i] = v;
	}
}


int main()
{
        ...........
	// One thread per element in a 1D layout. A block may hold at most 1024
	// threads in total, so a TILE_WIDTH x TILE_WIDTH (512 x 512) block is
	// rejected at launch time; use TILE_WIDTH threads along x only.
	dim3 dimBlocks(TILE_WIDTH, 1, 1);
	dim3 dimGrids((SIGNAL_SIZE - 1)/TILE_WIDTH + 1, 1, 1);	// ceil(SIGNAL_SIZE / TILE_WIDTH)

	// Scale every complex coefficient of x_w by the matching integer in h_w
	printf("Launching ComplexPointwiseAndScale<<< >>>\n");
	point_wise_product<<<dimGrids, dimBlocks>>>(x_w, h_w);

	// A kernel launch returns no status itself: configuration errors show up
	// through cudaGetLastError(), faults during execution at the next sync.
	cudaError_t status = cudaGetLastError();
	if(status == cudaSuccess)
		status = cudaDeviceSynchronize();
	if(status != cudaSuccess){
		fprintf(stderr, "point_wise_product failed: %s\n", cudaGetErrorString(status));
		return 1;
	}
	return 0;
}