How to efficiently sort 5 arrays of integers?

Alright, I have some thrust code that’s proving to be the bottleneck of my application, or at least that’s what NVVP tells me should be my number one priority when considering performance optimizations.

Here’s the call:

template<typename T>
struct tuple_comp
{
    __host__ __device__
    bool operator()(const thrust::tuple<T, T, T, T, T> t,
                    const thrust::tuple<T, T, T, T, T> v)
    {
        return ((unsigned& ) thrust::get<0>(t)) < ((unsigned& ) thrust::get<0>(v));
    }
};

/* ... */

    // Sort everything so that positives are up front
    thrust::sort(thrust::make_zip_iterator(
                     thrust::make_tuple(thrust::device_ptr<int>(pa),
                                        thrust::device_ptr<int>(ta),
                                        thrust::device_ptr<int>(la),
                                        thrust::device_ptr<int>(fs),
                                        thrust::device_ptr<int>(nm))),
                 thrust::make_zip_iterator(
                     thrust::make_tuple(thrust::device_ptr<int>(pa + array_capacity),
                                        thrust::device_ptr<int>(ta + array_capacity),
                                        thrust::device_ptr<int>(la + array_capacity),
                                        thrust::device_ptr<int>(fs + array_capacity),
                                        thrust::device_ptr<int>(nm + array_capacity))),
                 tuple_comp<int>());

Yeah, having to wrap raw pointers is pretty ugly, but nevertheless, how do I turn this code into something faster? Or do I just accept this as the way things are and rework my algorithm elsewhere so that this code is no longer needed?

My experience is that as the data to be sorted “by key” grows large, it eventually becomes more efficient to just sort-by-key the original “key” array with an index “value”, then use the scrambled index array to “sort” the remaining arrays. You end up moving each element in the remaining arrays once, rather than several times. This has the additional benefit (in your case, assuming an int base type) that thrust can use the much quicker radix sort method, since you will be passing a POD key type instead of a structure, for which it would normally fall back to a slower merge sort.
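A minimal sketch of that first step, using thrust::device_vector for brevity (the names here are illustrative, not taken from your code):

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Sort only the key array, dragging an index array along with it.
// pa_keys plays the role of the "key" array; indices must have the same length.
void sort_keys_with_index(thrust::device_vector<int> &pa_keys,
                          thrust::device_vector<int> &indices)
{
  thrust::sequence(indices.begin(), indices.end());            // 0, 1, 2, ..., N-1
  // POD int keys let thrust take its radix-sort path; afterwards indices[i]
  // holds the original position of the i-th element in sorted order, which is
  // exactly the permutation needed to rearrange the remaining arrays once.
  thrust::sort_by_key(pa_keys.begin(), pa_keys.end(), indices.begin());
}

After this call, indices describes the permutation to apply to the other 4 arrays.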

And, based on the comment in the code, if what you actually need is to just move the positive values to the front of the array, you could probably use thrust::partition or thrust::stable_partition instead of thrust::sort. It should be less “expensive” than a sort.

And if you don’t care about the negative values, and just want the positive values, then thrust::remove or some variant may be even less expensive.
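As a rough illustration of the partition idea, here is a sketch that reuses the zip-iterator setup and pointer names from your code (treating “positive” as >= 0 is my assumption; a thrust::remove_if variant would use a similar predicate):

#include <thrust/partition.h>

// Predicate on the zipped tuple: keep rows whose pa value is non-negative up front
struct first_is_nonnegative
{
    __host__ __device__
    bool operator()(const thrust::tuple<int, int, int, int, int> &t) const
    {
        return thrust::get<0>(t) >= 0;
    }
};

/* ... */

    // Reorders all 5 arrays so rows with pa >= 0 precede the rest (the stable
    // variant preserves the relative order within each group)
    thrust::stable_partition(
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::device_ptr<int>(pa),
                               thrust::device_ptr<int>(ta),
                               thrust::device_ptr<int>(la),
                               thrust::device_ptr<int>(fs),
                               thrust::device_ptr<int>(nm))),
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::device_ptr<int>(pa + array_capacity),
                               thrust::device_ptr<int>(ta + array_capacity),
                               thrust::device_ptr<int>(la + array_capacity),
                               thrust::device_ptr<int>(fs + array_capacity),
                               thrust::device_ptr<int>(nm + array_capacity))),
        first_is_nonnegative());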

I’m sorry, I should’ve elaborated on that. I do want the smaller pa values to be up front.

But I’m curious what you mean in your first paragraph. Are you talking about using thrust::gather?

You could use thrust::gather. But you can also just use thrust::copy with a permutation iterator.
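For reference, the gather form is essentially a one-liner; here is a minimal sketch with illustrative device_vector names (not the buffers from the worked example below):

#include <thrust/device_vector.h>
#include <thrust/gather.h>

// out[i] = in[indices[i]] -- the same permutation that the copy_n /
// permutation_iterator combination performs in the example below
void permute_one_array(const thrust::device_vector<int> &indices,  // from sort_by_key
                       const thrust::device_vector<int> &in,
                       thrust::device_vector<int>       &out)
{
  thrust::gather(indices.begin(), indices.end(), in.begin(), out.begin());
}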

Here’s a worked example. For my test case (5 arrays each of 1M elements) my method is about 3x faster than your method (K40c, CUDA 7, CentOS 6.2):

$ cat t746.cu
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>

#define DSIZE 1048576

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

template<typename T>
struct tuple_comp
{
    __host__ __device__
    bool operator()(const thrust::tuple<T, T, T, T, T> t,
                    const thrust::tuple<T, T, T, T, T> v)
    {
        return ((unsigned& ) thrust::get<0>(t)) < ((unsigned& ) thrust::get<0>(v));
    }
};

void my_sort(int *pa, int *ta, int *la, int *fs, int *nm, size_t array_capacity){
    thrust::sort(thrust::make_zip_iterator(
                      thrust::make_tuple(thrust::device_ptr<int>(pa),
                                         thrust::device_ptr<int>(ta),
                                         thrust::device_ptr<int>(la),
                                         thrust::device_ptr<int>(fs),
                                         thrust::device_ptr<int>(nm))),
                   thrust::make_zip_iterator(
                      thrust::make_tuple(thrust::device_ptr<int>(pa + array_capacity),
                                         thrust::device_ptr<int>(ta + array_capacity),
                                         thrust::device_ptr<int>(la + array_capacity),
                                         thrust::device_ptr<int>(fs + array_capacity),
                                         thrust::device_ptr<int>(nm + array_capacity))),
                   tuple_comp<int>());
}

int main(){

  int *h1, *h2, *h3, *h4, *h5, *d0, *d1, *d2, *d3, *d4, *d5, *d6, *d7, *d8, *d9;
  int *h1_1, *h1_2, *h2_1, *h2_2, *h3_1, *h3_2, *h4_1, *h4_2, *h5_1, *h5_2;
  int dsize = DSIZE*sizeof(int);
  h1   = (int *)malloc(dsize);
  h2   = (int *)malloc(dsize);
  h3   = (int *)malloc(dsize);
  h4   = (int *)malloc(dsize);
  h5   = (int *)malloc(dsize);
  h1_1 = (int *)malloc(dsize);
  h2_1 = (int *)malloc(dsize);
  h3_1 = (int *)malloc(dsize);
  h4_1 = (int *)malloc(dsize);
  h5_1 = (int *)malloc(dsize);
  h1_2 = (int *)malloc(dsize);
  h2_2 = (int *)malloc(dsize);
  h3_2 = (int *)malloc(dsize);
  h4_2 = (int *)malloc(dsize);
  h5_2 = (int *)malloc(dsize);
  cudaMalloc(&d1, dsize);
  cudaMalloc(&d2, dsize);
  cudaMalloc(&d3, dsize);
  cudaMalloc(&d4, dsize);
  cudaMalloc(&d5, dsize);
  for (int i = 0; i < DSIZE; i++) {
    h1[i] = (rand() - RAND_MAX/2);
    h2[i] = (rand() - RAND_MAX/2);
    h3[i] = (rand() - RAND_MAX/2);
    h4[i] = (rand() - RAND_MAX/2);
    h5[i] = (rand() - RAND_MAX/2);}
// warm-up
  my_sort(d1, d2, d3, d4, d5, DSIZE);
  cudaMemcpy(d1, h1, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d2, h2, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d3, h3, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d4, h4, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d5, h5, dsize, cudaMemcpyHostToDevice);
  unsigned long long mytime = dtime_usec(0);
  my_sort(d1, d2, d3, d4, d5, DSIZE);
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  cudaMemcpy(h1_1, d1, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h2_1, d2, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h3_1, d3, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h4_1, d4, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h5_1, d5, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(d1, h1, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d2, h2, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d3, h3, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d4, h4, dsize, cudaMemcpyHostToDevice);
  cudaMemcpy(d5, h5, dsize, cudaMemcpyHostToDevice);
  printf("t1: %fs\n", mytime/(float)USECPSEC);
  mytime = dtime_usec(0);
  cudaMalloc(&d0, dsize);
  cudaMalloc(&d6, dsize);
  cudaMalloc(&d7, dsize);
  cudaMalloc(&d8, dsize);
  cudaMalloc(&d9, dsize);
  thrust::sequence(thrust::device_ptr<int>(d0), thrust::device_ptr<int>(d0+DSIZE));
  thrust::sort_by_key(thrust::device_ptr<unsigned>((unsigned *)d1),
                      thrust::device_ptr<unsigned>((unsigned *)d1+DSIZE),
                      thrust::device_ptr<int>(d0));
  thrust::copy_n(thrust::make_permutation_iterator(
                   thrust::make_zip_iterator(thrust::make_tuple(
                     thrust::device_ptr<int>(d2), thrust::device_ptr<int>(d3),
                     thrust::device_ptr<int>(d4), thrust::device_ptr<int>(d5))),
                   thrust::device_ptr<int>(d0)),
                 DSIZE,
                 thrust::make_zip_iterator(thrust::make_tuple(
                   thrust::device_ptr<int>(d6), thrust::device_ptr<int>(d7),
                   thrust::device_ptr<int>(d8), thrust::device_ptr<int>(d9))));
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  printf("t2: %fs\n", mytime/(float)USECPSEC);
  cudaMemcpy(h1_2, d1, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h2_2, d6, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h3_2, d7, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h4_2, d8, dsize, cudaMemcpyDeviceToHost);
  cudaMemcpy(h5_2, d9, dsize, cudaMemcpyDeviceToHost);
// validate results
  for (int i = 0; i < DSIZE; i++){
    if (h1_1[i] != h1_2[i]) {printf("h1 mismatch at %d, was: %d, should be: %d\n", i, h1_2[i], h1_1[i]); return 1;}
    if (h2_1[i] != h2_2[i]) {printf("h2 mismatch at %d, was: %d, should be: %d\n", i, h2_2[i], h2_1[i]); return 1;}
    if (h3_1[i] != h3_2[i]) {printf("h3 mismatch at %d, was: %d, should be: %d\n", i, h3_2[i], h3_1[i]); return 1;}
    if (h4_1[i] != h4_2[i]) {printf("h4 mismatch at %d, was: %d, should be: %d\n", i, h4_2[i], h4_1[i]); return 1;}
    if (h5_1[i] != h5_2[i]) {printf("h5 mismatch at %d, was: %d, should be: %d\n", i, h5_2[i], h5_1[i]); return 1;}
    }
  return 0;
}
$ nvcc -o t746 t746.cu
$ ./t746
t1: 0.016679s
t2: 0.005404s
$

I’ll have to look into this later but thanks!

There’s one thing: your replacement solution doesn’t look like it’s in-place. That may or may not be a problem for me… I’m not sure whether memory is going to become a constraint, because this is just one small part of an overall picture, despite the sometimes very large size of the arrays.

“In-place” is something of a myth where thrust::sort memory usage is concerned. It is correct in the sense that the answer is returned in place of the original data, but thrust::sort allocates O(N) temporary storage, where N is the total size of the data you are sorting (all 5 vectors, in your example).

So your method uses the storage for 5 vectors, plus O(N) temporary space, i.e. the equivalent of storage for 10 vectors.

You can modify my exact sequence of allocations (i.e. move the cudaMalloc calls for the four extra output vectors, d6 through d9, so they come after the thrust::sort_by_key) if you’re very concerned about memory space:

My method:
start: 5 vectors
sort step: add an index vector (+1), plus O(N) sort_by_key temporary storage (+2) = the equivalent of 8 vectors
after the sort: the two temporary “vectors” are released by thrust, so back down to 6
vector swap: allocate 4 new vectors, so up to 10, the same as the peak demand of your thrust::sort method.

If you’d like to verify the O(N) temporary storage of thrust::sort for yourself, try sorting a data set that is larger than about 40% of your GPU memory. It will fail with a memory allocation error inside thrust. Or you can google for more info.
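If you want a quick experiment along those lines, something like this sketch should demonstrate it (my own illustration, not code from this thread): allocate a vector that occupies about half of the reported free memory, so the sort’s scratch allocation has nowhere to go.

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <exception>
#include <cstdio>

int main()
{
  size_t free_b = 0, total_b = 0;
  cudaMemGetInfo(&free_b, &total_b);
  size_t n = (free_b / sizeof(int)) / 2;       // vector takes ~50% of free memory
  try {
    thrust::device_vector<int> v(n);           // this allocation should succeed
    thrust::sort(v.begin(), v.end());          // the O(N) scratch allocation should not
    printf("sort succeeded\n");
  } catch (std::exception &e) {
    printf("sort failed: %s\n", e.what());     // expect an allocation failure here
  }
  return 0;
}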

Oh, okay. I didn’t realize the normal “in-place” sort allocated temporary data. Hey, that means I don’t need to worry about the extra memory consumption of the new approach, then. Thanks, txbob. You’re so awesome :)

Uh, I hate to necro this thread, but I’m curious: what do I do after the copy_n call if I want to keep all of this in a loop and re-use the same storage repeatedly?

This is my code now:

    // Permutation iterator sort method
    thrust::sequence(thrust::device_ptr<int>(indices),
                     thrust::device_ptr<int>(indices + array_capacity));

    thrust::sort_by_key(thrust::device_ptr<unsigned>((unsigned *) pa),
                        thrust::device_ptr<unsigned>((unsigned *) pa + array_capacity),
                        thrust::device_ptr<int>(indices));

    thrust::copy_n(
        thrust::make_permutation_iterator(
            thrust::make_zip_iterator(
                thrust::make_tuple(thrust::device_ptr<int>(ta),
                                   thrust::device_ptr<int>(la),
                                   thrust::device_ptr<int>(fs),
                                   thrust::device_ptr<int>(nm))),
            thrust::device_ptr<int>(indices)),
        array_capacity,
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::device_ptr<int>(c_ta),
                               thrust::device_ptr<int>(c_la),
                               thrust::device_ptr<int>(c_fs),
                               thrust::device_ptr<int>(c_nm))));

    cudaDeviceSynchronize();

    int *old[4] = { c_ta, c_la, c_fs, c_nm };

    c_ta = ta;
    c_la = la;
    c_fs = fs;
    c_nm = nm;

    ta = old[0];
    la = old[1];
    fs = old[2];
    nm = old[3];

Basically, is this part of the code necessary?

    c_ta = ta;
    c_la = la;
    c_fs = fs;
    c_nm = nm;

    ta = old[0];
    la = old[1];
    fs = old[2];
    nm = old[3];

I’ve tried running it with and without, and so far both versions seem to work, so I’m confused lol.

I’m starting to think the re-assignment might be pointless, actually. At the end of the copy_n call I want ta, la, fs, and nm to point to the new data (which makes sense), and I’m fairly sure I need to re-assign the addresses for that, but I’m not certain. I think I would have to, otherwise all my arrays would end up pointing to the same place…
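For my own notes, here is a minimal sketch of the ping-pong pattern I’m describing (a hypothetical loop skeleton; the thrust calls from the snippet above go where the comment indicates):

#include <utility>  // std::swap

void sort_loop(int *&ta, int *&la, int *&fs, int *&nm,
               int *&c_ta, int *&c_la, int *&c_fs, int *&c_nm,
               int iterations)
{
    for (int i = 0; i < iterations; ++i) {
        // ... thrust::sequence / thrust::sort_by_key / thrust::copy_n into the c_* buffers ...

        // copy_n leaves the permuted data in c_ta/c_la/c_fs/c_nm; swapping the
        // pointers lets the next iteration read that data while the previous
        // buffers become the new destination. Without the swap, any later code
        // reading ta/la/fs/nm would still see the stale, unpermuted contents.
        std::swap(ta, c_ta);
        std::swap(la, c_la);
        std::swap(fs, c_fs);
        std::swap(nm, c_nm);
    }
}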

Edit: Btw, this code is amazingly faster O_o

Like, I got a pretty good performance boost from this (about 47%).