Problem with 'malloc'

Hello there, I got a ‘malloc’ problem when I try to run this .cu.
it displays like this:
a.out(89356,0x7fff7e41e300) malloc: *** error for object 0x7faf5c17a208: incorrect checksum for freed object - object was probably modified after being freed.
*** set a breakpoint in malloc_error_break to debug
Abort trap: 6

i’m thinking that problem may come from lines 98-103 and lines 151-156.
Thanks a lot for the help from anyone

#include <cuda.h>
#include <complex>
#include <iostream>
#include <fstream>
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <iomanip>
#include <vector>
#include <algorithm>
#define THREADS_PER_BLOCK 512
using namespace std;
const double pi = 3.14159265358979323846E+000;

__global__ void spline(double *x, double *y, int *n, double *yp1, double *ypn, double *y2){

    int i= blockIdx.x * blockDim.x + threadIdx.x;
    int k;
    double p, qn, sig, un;//, *u

    double *u = new double[*n-1];
    //vector<double> *u(*n-1);
    if (*yp1 > 0.99e30)
        y2[0] = u[0] = 0.0;
    else{
        y2[0] = -0.5;
        u[0] = (3.0/(x[1]-x[0]))*((y[1]-y[0])/(x[1]-x[0])-*yp1);
    }
    for (i=1; i<*n-1; i++) {
        sig = (x[i]-x[i-1])/(x[i+1]-x[i-1]);
        p = sig*y2[i-1]+2.0;
        y2[i] = (sig-1.0)/p;
        u[i] = (y[i+1]-y[i])/(x[i+1]-x[i])-(y[i]-y[i-1])/(x[i]-x[i-1]);
        u[i] = (6.0*u[i]/(x[i+1]-x[i-1])-sig*u[i-1])/p;
    }
    if (*ypn > 0.99e30)
        qn = un =0.0;
    else{
        qn = 0.5;
        un = (3.0/(x[*n-1]-x[*n-2]))*(*ypn-(y[*n-1]-y[*n-2])/(x[*n-1]-x[*n-2]));
    }
    y2[*n-1] = (un-qn*u[*n-2])/(qn*y2[*n-2]+1.0);
    for (k=*n-2; k>=0; k--)
        y2[k] = y2[k]*y2[k+1]+u[k];

}

void splint(double xa[], double ya[], double y2a[], int n, double x, double yy[2])
{
    
    void nrerror(char error_text[]);
    int klo, khi;
    double h, i, b, a;
    
    klo = 0;
    khi = n-1;
    while (khi-klo>1) {
        if (xa[khi]>x && xa[khi-1]>x)
            khi -=1;
        else if (xa[khi]>x && xa[khi-1]<x) klo =khi-1;
        //cout<<khi<<" "<<klo<<" "<<khi-klo<<endl;           //show this if we wantto know the results of khis  and klos (test)
    }
    h = xa[khi]-xa[klo];
    i = ya[khi]-ya[klo];
    if (h == 0.0) cout<<"Bad Xa input to routine splint"<<endl;
    a = (xa[khi]-x)/h;
    b = (x-xa[klo])/h;
    yy[0] = a*ya[klo]+b*ya[khi]+((a*a*a-a)*y2a[klo]+(b*b*b-b)*y2a[khi])*(h*h)/6.0;     // y at x
    yy[1] =  i/h-(3.0*a*a-1.0)*h*y2a[klo]/6.0+(3.0*b*b-1.0)*h*y2a[khi]/6.0;            // y' at x
}


void chemi(double _gamma, double& _mu_local, double& _d_mu_n, double& _d_mu_gamma){
const double CC = 0.050;
const int M = 100020; double _mu_local_gamma;

//////////////
double *x, *y, *yp1, *ypn, *y2;  int *n;
double *d_x, *d_y, *d_yp1, *d_ypn, *d_y2; int *d_n;
cudaMalloc( (void **) &d_x, M);
cudaMalloc( (void **) &d_y, M);
cudaMalloc( (void **) &d_n, 1);
cudaMalloc( (void **) &d_yp1, 1);
cudaMalloc( (void **) &d_ypn, 1);
cudaMalloc( (void **) &d_y2, M);
x = (double *)malloc(M);
y = (double *)malloc(M);
n = (int *)malloc(1);
yp1 = (double *)malloc(1);
ypn = (double *)malloc(1);
y2 = (double *)malloc(M);
//////////////
static double data_1[M], data_2[M];
//read file1 to data_1 --- gamma
std::ifstream input("gamma.dat");         
for (int i = 0; i < M; i++) {
input >> data_1[i];
x[i] = data_1[i];
}
std::ifstream inputt("mu.dat");
for (int i = 0; i < M; i++) {
inputt >> data_2[i];
y[i] = data_2[i];
}


if (_gamma == 0.0) {
_mu_local = 0.0;
_d_mu_n = 0.0;
_d_mu_gamma = 0.0;

}
else if (_gamma <= 0.008) {
_mu_local = (CC*CC) / (2.0*_gamma*_gamma) * (2.0*_gamma - 2.0*pow(_gamma, 3.0 / 2.0) / pi);
_d_mu_n = CC / (2.0*_gamma) * (2.0*_gamma - pow(_gamma, 3.0 / 2.0) / pi);

}
else if (_gamma > 0.008 && _gamma <= 1000.0){
double y2der[M], res[2]; // 2nd derivative
double yp1 = 1.91564450000017e+000, ypn = 250.032500701103e-003; //1st derivative at n=1,n
res[0] = 0.0;
//////////////
cudaMemcpy( d_x, &x, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_y, &y, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_n, &n, 1, cudaMemcpyHostToDevice );
cudaMemcpy( d_yp1, &yp1, 1, cudaMemcpyHostToDevice );
cudaMemcpy( d_ypn, &ypn, 1, cudaMemcpyHostToDevice );
//////////////
spline<<<( M+THREADS_PER_BLOCK -1 )/THREADS_PER_BLOCK,THREADS_PER_BLOCK  >>> (d_x, d_y, d_n, d_yp1, d_ypn, d_y2);
//////////////
cudaMemcpy( y2der, d_y2, M, cudaMemcpyDeviceToHost);
//////////////
splint(data_1, data_2, y2der, M, _gamma, res);
_mu_local_gamma = res[0];
_mu_local = CC*CC / (2.0*_gamma*_gamma) * _mu_local_gamma;
_d_mu_gamma = res[1];
_d_mu_n = CC / (2 * _gamma) * (2.0*_mu_local_gamma - _gamma*_d_mu_gamma);
//////////////
free(x);
free(y);

cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_y2);
//////////////
}
else if (_gamma > 1000.0){
_mu_local = (pi*pi) * (CC*CC) / (2.0*_gamma*_gamma) * (1.0 - 16.0 / (3.0*_gamma) + 20.0 / (_gamma*_gamma));
_d_mu_n = (pi*pi) * CC / _gamma * (1.0 - 8.0 / _gamma + 40.0 / (_gamma*_gamma));

}
else  cout << "gamma is " << _gamma << " , not real!" << endl;

}


int main(){

	////////////////////////////////////////////////////////////////////////////////////////
	//==> Case 4
    struct tm when;
    time_t now;
    time(&now);
    when = *localtime(&now);
    cout << "current time is " << asctime(&when) << flush;
    const int NCUT = 1596;
	double gamma = 0.0, mu_local = 0.0, d_mu_n = 0.0, d_mu_gamma = 0.0;
    for (int i=0; i<NCUT; i++) {
        gamma = 0.0012456*i +0.000012254;
	cout << "Gamma is " << gamma << endl;
	chemi(gamma, mu_local, d_mu_n, d_mu_gamma); // we get the local chemical and its derivative
	cout << "mu(n) is " << mu_local << endl;
	cout << "d_mu(n) is " << d_mu_n << endl;
    }
    time(&now);
    when = *localtime(&now);
    cout << "current time is " << asctime(&when) << flush;
    getchar();//system("pause");
}

This seems to be a question about host code, and quite independent of CUDA. My hypothesis is your code either uses an incorrect sequence of malloc/free operations, or is writing beyond the bounds of allocated memory, thereby corrupting the heap manager data structures. Your platform appears to be Linux. Try the following to debug the issue:

(1) Export the environment variable MALLOC_CHECK_=1 to log malloc/free/realloc activity. You can also try MALLOC_CHECK_=2 to bail at the first sign that things are out of whack.

(2) Run the code under valgrind. Note that this can cause massive slowdown, so it may or may not be an option. At least with older versions of valgrind there were also some issue with false positives reported on host data structures into which the GPU transfers data by DMA.

Thank you njuffa for the reply.
yes, I run it on Mac. I’m a new learner, may you please tell me how to ‘Export the environment variable MALLOC_CHECK_=1 to log malloc/free/realloc activity. You can also try MALLOC_CHECK_=2 to bail at the first sign that things are out of whack.’
i don’t understand it.
Thanks a lot.

malloc really has nothing to do with CUDA but these lines don’t look right to me:

x = (double *)malloc(M);
y = (double *)malloc(M);
n = (int *)malloc(1);
yp1 = (double *)malloc(1);
ypn = (double *)malloc(1);
y2 = (double *)malloc(M);

Perhaps you think that malloc returns storage for the number of requested elements of the data type. It does not. It returns storage for the number of bytes requested. So this:

yp1 = (double *)malloc(1);

returns storage for 1 byte, which is not enough to hold even one double quantity. With this type of coding error, a subsequent code line like this:

double yp1 = 1.91564450000017e+000, ypn = 250.032500701103e-003; //1st derivative at n=1,n

will almost certainly corrupt the program stack. Furthermore, the above line of code has various other errors inherent in it. If you fix your malloc statements, that line of code should simply be:

*yp1 = 1.91564450000017e+000; *ypn = 250.032500701103e-003; //1st derivative at n=1,n

You should remove the double declaration; you should not be redefining those variables at this point.

Probably you should scale all your malloc operations by the size of the datatype:

x = (double *)malloc(M*sizeof(double));
y = (double *)malloc(M*sizeof(double));
n = (int *)malloc(1*sizeof(int));
yp1 = (double *)malloc(1*sizeof(double));
ypn = (double *)malloc(1*sizeof(double));
y2 = (double *)malloc(M*sizeof(double));

Your cudaMemcpy operations are similarly incorrect. The transfer size parameter for cudaMemcpy is in terms of bytes. So something like this:

cudaMemcpy( d_yp1, &yp1, 1, cudaMemcpyHostToDevice );

should be changed to:

cudaMemcpy( d_yp1, &yp1, 1*sizeof(double), cudaMemcpyHostToDevice );

and similarly for other lines like that in your code.

And similarly, you have the same problem with your cudaMalloc statements, which also expect an allocation size parameter in bytes.

The types of errors in this code and in your last question suggest to me you have little experience with C/C++ coding. CUDA depends heavily on C/C++ language concepts. If you acquire better C/C++ skills, you’ll be a better CUDA programmer. It strikes me that your code seems to have a great many of these types of basic coding errors in it, so I’m not at all confident that this addresses them all.

txbob, great thanks.
It’s working now, you are right, I misunderstand the size definition.
Besides, here in the code, i can not delete the double, since this declare for the yp1, ypn
It says

error: a value of type "double" cannot be assigned to an entity of type "double *"

after i delete it.

double yp1 = 1.91564450000017e+000, ypn = 250.032500701103e-003; //1st derivative at n=1,n

That line of code should be:

*yp1 = 1.91564450000017e+000; *ypn = 250.032500701103e-003; //1st derivative at n=1,n

not:

yp1 = 1.91564450000017e+000; ypn = 250.032500701103e-003; //1st derivative at n=1,n

again, there is a distinction between ordinary variables and pointers. I overlooked this in my original response as I am not actively compiling your code and looking at the compiler output.