Dear all,
I have read a very interesting post, Accelerated Ray Tracing in One Weekend in CUDA by Roger Allen (https://devblogs.nvidia.com/accelerated-ray-tracing-cuda/). In this post, virtual functions were employed to create ray-traced pictures of spheres. I modified the code provided in Chapter 5
(GitHub - rogerallen/raytracinginoneweekendincuda at ch05_normals_cuda)
a little bit in order to call create_world on the host instead of the device. In other words, I replaced __device__ functions with __host__ __device__ functions in the headers ray.h, hitable.h, hitable_list.h, and sphere.h; and finally I added a host function in the main.cu file
// Host-side counterpart of the create_world kernel.
// NOTE(review): this is the source of the CUDA error 700. Host-side `new`
// allocates the sphere/hitable_list objects in ordinary host heap memory
// (only the d_list/d_world pointer arrays are managed), so the pointers
// stored here are not dereferenceable from device code. In addition, the
// vtable pointers inside these polymorphic objects are host code addresses,
// so even a device-reachable copy would fault on the virtual hit() call.
// Objects whose virtual functions are invoked on the device must be
// constructed in device code.
void create_world_host(hitable **d_list, hitable **d_world) {
*(d_list) = new sphere(vec3(0,0,-1), 0.5);
*(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
*d_world = new hitable_list(d_list,2);
}
and used unified memory
// make our world of hitables
//but using unified memory
hitable **d_list;
checkCudaErrors(cudaMallocManaged((void **)&d_list, 2*sizeof(hitable *)));
hitable **d_world;
checkCudaErrors(cudaMallocManaged((void **)&d_world, sizeof(hitable *)));
I found it is impossible to fill in the unified memory using my host function create_world_host instead of the similar __global__ void create_world. I have a Quadro P4000 card with Pascal architecture.
Why does usage of the host function create_world_host instead of __global__ void create_world lead to
the error: Rendering a 1200x600 image in 8x8 blocks.
CUDA error = 700 at main.cu:114 'cudaDeviceSynchronize()' ???
My version of ray.h is
#ifndef RAYH
#define RAYH
#include "vec3.h"
// A ray r(t) = A + t*B, callable from both host and device code.
// (The post's rendering dropped the underscores from the CUDA qualifiers;
// they must be __host__ __device__ for this to compile.)
class ray
{
    public:
        __host__ __device__ ray() {}
        __host__ __device__ ray(const vec3& a, const vec3& b) { A = a; B = b; }
        __host__ __device__ vec3 origin() const { return A; }
        __host__ __device__ vec3 direction() const { return B; }
        // Point along the ray at parameter t.
        __host__ __device__ vec3 point_at_parameter(float t) const { return A + t*B; }

        vec3 A;  // origin
        vec3 B;  // direction (not necessarily unit length)
};
My version of hitable.h is
#ifndef HITABLEH
#define HITABLEH
#include "ray.h"
// Result of a successful ray-object intersection.
struct hit_record
{
float t;     // ray parameter at the hit point
vec3 p;      // hit point in world space
vec3 normal; // surface normal at p
};
// Abstract base class for anything a ray can intersect.
class hitable {
    public:
        // Return true and fill rec when r hits the object with t in (t_min, t_max).
        __host__ __device__ virtual bool hit(const ray& r, float t_min, float t_max, hit_record& rec) const = 0;
};
My version of hitable_list.h is:
#ifndef HITABLELISTH
#define HITABLELISTH
#include "hitable.h"
// A collection of hitables, itself hitable; reports the closest hit.
class hitable_list: public hitable {
    public:
        __host__ __device__ hitable_list() {}
        __host__ __device__ hitable_list(hitable **l, int n) { list = l; list_size = n; }
        __host__ __device__ virtual bool hit(const ray& r, float tmin, float tmax, hit_record& rec) const;

        hitable **list;  // non-owning array of object pointers
        int list_size;
};

// Test every object; shrink the search window so rec ends up holding the
// closest hit in (t_min, t_max).
__host__ __device__ bool hitable_list::hit(const ray& r, float t_min, float t_max, hit_record& rec) const {
    hit_record temp_rec;
    bool hit_anything = false;
    float closest_so_far = t_max;
    for (int i = 0; i < list_size; i++) {
        if (list[i]->hit(r, t_min, closest_so_far, temp_rec)) {
            hit_anything = true;
            closest_so_far = temp_rec.t;
            rec = temp_rec;
        }
    }
    return hit_anything;
}
My version of sphere.h is:
#ifndef SPHEREH
#define SPHEREH
#include "hitable.h"
// A sphere centered at `center` with radius `radius`.
class sphere: public hitable {
    public:
        __host__ __device__ sphere() {}
        __host__ __device__ sphere(vec3 cen, float r) : center(cen), radius(r) {};
        __host__ __device__ virtual bool hit(const ray& r, float tmin, float tmax, hit_record& rec) const;

        vec3 center;
        float radius;
};

// Ray/sphere intersection via the quadratic formula. The factor of 2 is
// folded out of b, so the discriminant simplifies to b*b - a*c.
// (The post's rendering dropped the '*' operators: "radiusradius" and "bb"
// were radius*radius and b*b.)
__host__ __device__ bool sphere::hit(const ray& r, float t_min, float t_max, hit_record& rec) const {
    vec3 oc = r.origin() - center;
    float a = dot(r.direction(), r.direction());
    float b = dot(oc, r.direction());
    float c = dot(oc, oc) - radius*radius;
    float discriminant = b*b - a*c;
    if (discriminant > 0) {
        // Try the nearer root first.
        float temp = (-b - sqrt(discriminant))/a;
        if (temp < t_max && temp > t_min) {
            rec.t = temp;
            rec.p = r.point_at_parameter(rec.t);
            rec.normal = (rec.p - center) / radius;  // unit normal: |p-center| == radius
            return true;
        }
        temp = (-b + sqrt(discriminant)) / a;
        if (temp < t_max && temp > t_min) {
            rec.t = temp;
            rec.p = r.point_at_parameter(rec.t);
            rec.normal = (rec.p - center) / radius;
            return true;
        }
    }
    return false;
}
and finally my version of main.cu:
#include <iostream>
#include <time.h>
#include <float.h>
#include "vec3.h"
#include "ray.h"
#include "sphere.h"
#include "hitable_list.h"
// limited version of checkCudaErrors from helper_cuda.h in CUDA examples
// (the rendering ate the angle brackets and double underscores:
// __FILE__/__LINE__ and static_cast<unsigned int>)
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )

// Print the failing call with its source location, reset the device, and exit.
void check_cuda(cudaError_t result, char const *const func, const char *const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " <<
            file << ":" << line << " '" << func << "' \n";
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}
// Shade a ray: surface-normal coloring on a hit, white-to-blue sky gradient
// otherwise. Takes hitable** to match the d_world handle passed by render.
__device__ vec3 color(const ray& r, hitable **world) {
    hit_record rec;
    if ((*world)->hit(r, 0.0, FLT_MAX, rec)) {
        // Map normal components from [-1,1] to [0,1] for display.
        return 0.5f*vec3(rec.normal.x()+1.0f, rec.normal.y()+1.0f, rec.normal.z()+1.0f);
    }
    else {
        vec3 unit_direction = unit_vector(r.direction());
        float t = 0.5f*(unit_direction.y() + 1.0f);
        return (1.0f-t)*vec3(1.0, 1.0, 1.0) + t*vec3(0.5, 0.7, 1.0);
    }
}
// One thread per pixel. fb is a framebuffer of max_x*max_y vec3 colors;
// expects a 2D launch whose grid covers at least max_x by max_y threads.
__global__ void render(vec3 *fb, int max_x, int max_y,
                       vec3 lower_left_corner, vec3 horizontal, vec3 vertical, vec3 origin,
                       hitable **world) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if((i >= max_x) || (j >= max_y)) return;  // grid may overhang the image
    int pixel_index = j*max_x + i;
    float u = float(i) / float(max_x);
    float v = float(j) / float(max_y);
    ray r(origin, lower_left_corner + u*horizontal + v*vertical);
    fb[pixel_index] = color(r, world);
}
// Build the scene on the device. Constructing the polymorphic objects in
// device code is required so their vtable pointers are device addresses;
// this is why the kernel version works where the host version faults.
__global__ void create_world(hitable **d_list, hitable **d_world) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {  // single thread builds the scene
        *(d_list)   = new sphere(vec3(0,0,-1), 0.5);
        *(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
        *d_world    = new hitable_list(d_list,2);
    }
}
// Host-side attempt at building the scene — this is what triggers the
// CUDA error 700. `new` on the host allocates the sphere/hitable_list
// objects in ordinary host heap memory (only the d_list/d_world pointer
// arrays live in managed memory), so the stored pointers cannot be
// dereferenced on the device; and the objects' vtable pointers are host
// code addresses, so a device-side virtual hit() call would fault anyway.
// Objects whose virtuals run on the device must be constructed on the device.
void create_world_host(hitable **d_list, hitable **d_world) {
*(d_list) = new sphere(vec3(0,0,-1), 0.5);
*(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
*d_world = new hitable_list(d_list,2);
}
// Destroy the scene objects. Must run on the device because they were
// created with device-side `new` in create_world (launched <<<1,1>>>).
__global__ void free_world(hitable **d_list, hitable **d_world) {
    delete *(d_list);
    delete *(d_list+1);
    delete *d_world;
}
int main() {
int nx = 1200;  // image width
int ny = 600;   // image height
int tx = 8;     // thread-block width
int ty = 8;     // thread-block height
std::cerr << "Rendering a " << nx << "x" << ny << " image ";
std::cerr << "in " << tx << "x" << ty << " blocks.\n";
int num_pixels = nx*ny;
size_t fb_size = num_pixels*sizeof(vec3);
// allocate FB (managed, so the host can read pixels back after the render)
vec3 *fb;
checkCudaErrors(cudaMallocManaged((void **)&fb, fb_size));
// make our world of hitables
//but using unified memory
hitable **d_list;
checkCudaErrors(cudaMallocManaged((void **)&d_list, 2*sizeof(hitable *)));
hitable **d_world;
checkCudaErrors(cudaMallocManaged((void **)&d_world, sizeof(hitable *)));
//***Creation of world on device using unified memory works***//
create_world<<<1,1>>>(d_list,d_world);
//Creation of world on host using unified memory fails//
// NOTE(review): the host path fails because host-side `new` allocates the
// polymorphic objects in host heap memory (only the pointer arrays here are
// managed) and gives them host vtable pointers, so the device-side virtual
// hit() call faults — surfacing as error 700 at the next cudaDeviceSynchronize().
// create_world_host(d_list,d_world);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
clock_t start, stop;
start = clock();
// Render our buffer
dim3 blocks(nx/tx+1,ny/ty+1);  // +1 so the grid covers the whole image
dim3 threads(tx,ty);
render<<<blocks, threads>>>(fb, nx, ny,
vec3(-2.0, -1.0, -1.0),
vec3(4.0, 0.0, 0.0),
vec3(0.0, 2.0, 0.0),
vec3(0.0, 0.0, 0.0),
d_world);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
stop = clock();
double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
std::cerr << "took " << timer_seconds << " seconds.\n";
// Output FB as Image (PPM to stdout, rows written top-to-bottom)
std::cout << "P3\n" << nx << " " << ny << "\n255\n";
for (int j = ny-1; j >= 0; j--) {
for (int i = 0; i < nx; i++) {
size_t pixel_index = j*nx + i;
int ir = int(255.99*fb[pixel_index].r());
int ig = int(255.99*fb[pixel_index].g());
int ib = int(255.99*fb[pixel_index].b());
std::cout << ir << " " << ig << " " << ib << "\n";
}
}
// clean up (device-side delete must match the device-side new)
checkCudaErrors(cudaDeviceSynchronize());
free_world<<<1,1>>>(d_list,d_world);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaFree(d_list));
checkCudaErrors(cudaFree(d_world));
checkCudaErrors(cudaFree(fb));
// useful for cuda-memcheck --leak-check full
cudaDeviceReset();
}