Registering Mapped Linux Character Device Memory with cudaHostRegister Results in Invalid Argument

I’m trying to boost DMA<->CPU<->GPU data transfer by:

  1. Mapping my (proprietary) device Linux Kernel allocated memory to user space
  2. Registering the latter (the mapped memory) with CUDA via the cudaHostRegister API function.

Mapping user-space-allocated memory to my device's DMA and then registering it with CUDA via cudaHostRegister works just fine; however, trying to register “kmalloced” memory results in an “Invalid Argument” error returned by cudaHostRegister.

At first I thought the problem was with alignment or with my device driver's complicated memory pool management, so I wrote the simplest possible character device, which implements .mmap() such that a kzalloced 10 KB buffer is remapped with remap_pfn_range, and the problem still persists.

Unfortunately, I did not find any similar questions on the Net, so I sincerely hope I’ll find an answer here.

Some system info and Kernel driver <-> user space app code + runtime log info:

CUDA 8.0
Ubuntu 14.04
Kernel 3.16.0-31-generic
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.26                 Driver Version: 375.26                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|                                                                               
|   0  GeForce GTX 770     Off  | 0000:83:00.0     N/A |                  N/A |
| 26%   32C    P8    N/A /  N/A |     79MiB /  1997MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+

Character device mmap() code:

/* Size of one mappable chunk, in bytes (_K is defined outside this excerpt). */
#define MEM_CHUNK_SIZE  (4 * _K)
/* Size requested from kzalloc() for the buffer that backs the mapping. */
#define MEM_POOL_SIZE   (10 * _K)
/**/
static int  chdv_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned int pages_per_buf = ( MEM_CHUNK_SIZE >> PAGE_SHIFT ) ;
    unsigned long pfn, vsize;

    /*make sure the buffer is allocated*/
    if((NULL == g_membuff) && 
       (NULL == (g_membuff = kzalloc(MEM_POOL_SIZE , GFP_KERNEL))))
    {
        kdbgprintln("Error: Not enough memory");
        return -ENOMEM;
    }

    vsize = vma->vm_end - vma->vm_start ;

    kdbgprintln("MEM_CHUNK_SIZE %u, pages_per_buf %u, vsize %lu  vma->vm_pgoff %lu",
            MEM_CHUNK_SIZE,
            pages_per_buf,
            vsize,
            vma->vm_pgoff);
    if(vsize > MEM_POOL_SIZE)
    {
        kdbgprintln("Error: vsize %lu > MEM_POOL_SIZE %u", vsize, MEM_POOL_SIZE);
        return -EINVAL;
    }

    /* We allow only mapping of one whole buffer so offset must be multiple
     * of pages_per_buf and size must be equal to dma_buf_size.
     */
    if( vma->vm_pgoff % pages_per_buf ) 
    {
        kdbgprintln("Error:Mapping DMA buffers is allowed only from beginning");
        return -EINVAL ;
    }

    vma->vm_flags = vma->vm_flags | (VM_DONTEXPAND | VM_LOCKED | VM_IO);

    /*Get the PFN for remap*/
    pfn = page_to_pfn(virt_to_page((unsigned char *)g_membuff));

    kdbgprintln("PFN : %lu", pfn);

    if(remap_pfn_range(vma, vma->vm_start, pfn, vsize, vma->vm_page_prot))
    {
        kdbgprintln("Error:Failed to remap memory");
        return -EINVAL;
    }

    /*Sealing data header & footer*/
    *((unsigned long *)g_membuff)       = 0xCDFFFFFFFFFFFFAB;
    *((unsigned long *)g_membuff + 1)   = 0xAB000000000000EF;
    *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)) = 0xEF0000000C0000AA;

    kdbgprintln("Mapped 'kalloc' buffer" \
            "\n\t\tFirst  8 bytes: %lX" \
            "\n\t\tSecond 8 bytes: %lX" \
            "\n\t\tLast   8 bytes: %lX",
            *((unsigned long *)g_membuff),
            *((unsigned long *)g_membuff + 1),
            *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)));

    return 0;
}

Test Application code:

static unsigned long map_mem_size;

int main(int argc, char** argv)
{
    int fd;
    const char dev_name[] = "/dev/chardev";
    void * address = NULL;
    long page_off = 0;
    cudaError_t cudarc;

    switch(argc)
    {
    case 2:
        page_off = atoi(argv[1]) * getpagesize();
        break;
    default:
        page_off = 0;
        break;
    }

    map_mem_size = 2 * getpagesize();

    printf("Opening %s file\n", dev_name);
    errno = 0;
    if(0 > (fd = open(dev_name, O_RDWR) ))
    {
        printf("Error %d - %s\n", errno, strerror(errno));
    }
    else
    {
        printf("About to map %lu bytes of %s device memory\n", map_mem_size, dev_name);

        errno = 0;
        if(MAP_FAILED == (address = mmap(NULL, map_mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, page_off)))
        {
            printf("Error %d - %s\n", errno, strerror(errno));
        }
        else
        {
            printf("mapped %s driver 'kmalloc' memory" \
                    "\n\t\tFirst  8 bytes: %lX" \
                    "\n\t\tSecond 8 bytes: %lX" \
                    "\n\t\tLast   8 bytes: %lX\n",
                    dev_name,
                    *((unsigned long *)address),
                    *((unsigned long *)address + 1),
                    *(unsigned long *)((unsigned char *)address + map_mem_size - sizeof(unsigned long)));

            if (cudaSuccess != (cudarc = cudaHostRegister(address, map_mem_size, cudaHostRegisterDefault)))
            {
                printf("Error: Failed cudaHostRegister: %s, address %p\n", cudaGetErrorString(cudarc), address);
            }
        }
    }

    /*Release resources block*/

    return EXIT_SUCCESS;
}

Run time debug information:
Kernel space (tail -f /var/log/syslog):

[ 4814.119537] [chardev] chardev.c, chdv_mmap, line 292:MEM_CHUNK_SIZE 4096, 
                          pages_per_buf 1, vsize 8192  vma->vm_pgoff 0
 [ 4814.119538] [chardev] chardev.c, chdv_mmap, line 311:PFN : 16306184
 [ 4814.119543] [chardev] chardev.c, chdv_mmap, line 330:Mapped 'kzalloced' buffer
 [ 4814.119543]           First  8 bytes: CDFFFFFFFFFFFFAB
 [ 4814.119543]           Second 8 bytes: AB000000000000EF
 [ 4814.119543]           Last   8 bytes: EF0000000C0000AA

User space:

./chrdev_test 
Opening /dev/chardev file
About to map 8192 bytes of /dev/chardev device memory
mapped /dev/chardev driver 'kmalloc' memory
                First  8 bytes: CDFFFFFFFFFFFFAB
                Second 8 bytes: AB000000000000EF
                Last   8 bytes: EF0000000C0000AA
Error: Failed cudaHostRegister: invalid argument
Unmapping /dev/chardev file
Closing /dev/chardev file

Thanks ahead.

Note :

Mapping any (ordinary) user space file then registering with cudaHostRegister works just fine and 100% operational, however, it does not provide the data transfer boost I’m eager for.

Update:
Made it work :

  • Change the cudaHostRegister flag in chrdev_test from cudaHostRegisterDefault to cudaHostRegisterIoMemory.
  • After that you will still get "cudaHostRegister: operation not permitted" if you don't run as superuser, whereas running the same code for user-space-allocated memory with the cudaHostRegisterDefault flag does not require any superuser permissions. But I can live with that for now.

If I map and register chunks of 1 - 2 pages (4 - 8 KB) with NVIDIA, I see the same buffer first changed in the Linux kernel (by the driver) and then modified by the CUDA kernel:

chardev.ko

|DBG|chardev.c|chdv_mmap|0346|MEM_CHUNK_SIZE 4096, pages_per_buf 1, vsize 4096  vma->vm_pgoff 0
|DBG|chardev.c|chdv_mmap|0375|PFN : 15976256
|DBG|chardev.c|chdv_mmap|0403|Mapped & sealed'kzalloced' buffer
          1st  8 bytes: CDFFFFFF00000007
          2nd  8 bytes: AB00000000000007
          8th  8 bytes: AB000EAC00000007
          Last 8 bytes: EF00000000000007
|DBG|chardev.c|chdv_mmap|0346|MEM_CHUNK_SIZE 4096, pages_per_buf 1, vsize 4096  vma->vm_pgoff 1
|DBG|chardev.c|chdv_mmap|0375|PFN : 15976257
|DBG|chardev.c|chdv_mmap|0403|Mapped & sealed'kzalloced' buffer
          1st  8 bytes: CDFFFFFF00000008
          2nd  8 bytes: AB00000000000008
          8th  8 bytes: AB000EAC00000008
          Last 8 bytes: EF00000000000008
./run_chrdevtest.sh -h
Usage:   ./chrdev_test <device name> <number of pages> <number of buffers>
Example: ./chrdev_test /dev/chardev 2 40
i.e.: Need 40 buffers, 2 pages (2 x 4096 bytes) each

~/ws/yoel/drv_tst$ sudo ./run_chrdevtest.sh /dev/chardev 1 2
Command input:
Argc:4
<n0>:./chrdev_test
<n1>:/dev/chardev
<n2>:1
<n3>:2
App Params:
Device :/dev/chardev
Map memory size  :4096
Page offset      :0
Number of buffers:2
Opening /dev/chardev file
--------------------------
About to map      buff[0000] 4096 bytes offset 0 of /dev/chardev device memory
About to register buff[0000] 4096 bytes offset 0 of /dev/chardev device memory to NVIDIA
-------------->From Linux Kernel buFFer [0]---------------------
Start address 0x7fdbc039a000
CDFFFFFF00000007 AB00000000000007 0000 0000 0000 0000 0000 AB000EAC00000007 
-----------------------------------------------
Padding Cuda random value: <b>5C4625FB</b>
-------------<-From CUDA kernel buFFer [0]---------------------
Start address 0x7fdbc039a000
5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 
5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 
-----------------------------------------------
About to map      buff[0001] 4096 bytes offset 4096 of /dev/chardev device memory
About to register buff[0001] 4096 bytes offset 4096 of /dev/chardev device memory to NVIDIA
-------------->From Linux Kernel buFFer [1]---------------------
Start address 0x7fdbc0366000
CDFFFFFF00000008 AB00000000000008 0000 0000 0000 0000 0000 AB000EAC00000008 
-----------------------------------------------
Padding Cuda random value: <b>152B173C</b>
-------------<-From CUDA kernel buFFer [1]---------------------
Start address 0x7fdbc0366000
152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 
152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 152B173C 
--------------<-From CUDA kernel buFFer [0]---------------------
Start address 0x7fdbc039a000
5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 
5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 5C4625FB 
-----------------------------------------------
cudaDeviceSynchronize Stats:
                Max Time:334752 ns
                Min Time:331806 ns
                Avg Time:333279.000000 ns
  • However, mapped chunks larger than 2 pages (> 8192 bytes) are immune to CUDA kernel modifications, while the latter does not report any errors to the application and no NVIDIA driver errors are observed in dmesg:
sudo ./run_chrdevtest.sh /dev/chardev 4 1 
Command input:
Argc:4
<n0>:./chrdev_test
<n1>:/dev/chardev
<n2>:4
<n3>:1
App Params:
Device :/dev/chardev
Map memory size  :16384
Page offset      :0
Number of buffers:1
Opening /dev/chardev file
--------------------------
About to map      buff[0000] 16384 bytes offset 0 of /dev/chardev device memory
About to register buff[0000] 16384 bytes offset 0 of /dev/chardev device memory to NVIDIA
-------------->From Linux Kernel buFFer [0]---------------------
Start address 0x7f8ef343c000
CDFFFFFF00000009 AB00000000000009 0000 0000 0000 0000 0000 AB000EAC00000009 
-----------------------------------------------
Padding Cuda random value: <b>16BAC09F</b>
-------------<-From CUDA kernel buFFer [0]---------------------
Start address 0x7f8ef343c000
CDFFFFFF00000009 AB00000000000009 0000 0000 0000 0000 0000 AB000EAC00000009 
0000 0000 0000 0000 0000 0000 0000 0000 
-----------------------------------------------
cudaDeviceSynchronize Stats:
                Max Time:9692 ns
                Min Time:9692 ns
                Avg Time:9692.000000 ns

The question is whether NVIDIA is unable to handle Linux kernel memory chunks greater than 8K, or whether I (as usual) missed something.

Attached the code of test application and the updated test driver.

Thanks,
Yoel.

cdev.zip (23.1 KB)