CUDA with Fermi: Array of structs or arrays? Which is more efficient for memory access?
So I'm wondering, with Fermi's cache memory architecture, which kernel would be more efficient?
Arrays:
[codebox]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum)
{
int i = blockIdx.x*512+threadIdx.x;
g1[i]=g1[i]*0.99f;
g2[i]=g2[i]*0.99f;
g3[i]=g3[i]*0.99f;
g4[i]=g1[i]*0.99f;
sum[i]=g1[i]+g2[i]+g3[i]+g4[i];
g1[i]=g1[i]+(g1[i]<=0.5f)*(0.7f-g1[i]);
g2[i]=g2[i]+(g2[i]<=0.5f)*(0.7f-g2[i]);
g3[i]=g3[i]+(g3[i]<=0.5f)*(0.7f-g3[i]);
g4[i]=g4[i]+(g4[i]<=0.5f)*(0.7f-g4[i]);
}[/codebox]
Also, what if I break up the above kernel into multiple kernels each handling a single operation? Would that be more efficient?

or Array of structs:
[codebox]struct TestStruct
{
float g1, g2, g3, g4, sum;
};

__global__ void test1b(TestStruct *structArr)
{
int i = blockIdx.x*512+threadIdx.x;
structArr[i].g1=structArr[i].g1*0.99f;
structArr[i].g2=structArr[i].g2*0.99f;
structArr[i].g3=structArr[i].g3*0.99f;
structArr[i].g4=structArr[i].g4*0.99f;
structArr[i].sum=structArr[i].g1+structArr[i].g2+structArr[i
].g3+structArr[i].g4;
structArr[i].g1=structArr[i].g1+(structArr[i].g1<=0.5f)*(0.7f-structArr[i].g1);
structArr[i].g2=structArr[i].g2+(structArr[i].g2<=0.5f)*(0.7f-structArr[i].g2);
structArr[i].g3=structArr[i].g3+(structArr[i].g3<=0.5f)*(0.7f-structArr[i].g3);
structArr[i].g4=structArr[i].g4+(structArr[i].g4<=0.5f)*(0.7f-structArr[i].g4);
}[/codebox]
I can pad the struct with dummy fields if that makes the memory access more efficient.

I've searched all over the place for this, but I still can't find a clear answer, and somehow the nvidia forum search engine is not working for me right now. So apologies if this has been discussed before.

Thanks,

hammer
So I'm wondering, with Fermi's cache memory architecture, which kernel would be more efficient?

Arrays:

[codebox]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum)

{

int i = blockIdx.x*512+threadIdx.x;

g1[i]=g1[i]*0.99f;

g2[i]=g2[i]*0.99f;

g3[i]=g3[i]*0.99f;

g4[i]=g1[i]*0.99f;

sum[i]=g1[i]+g2[i]+g3[i]+g4[i];

g1[i]=g1[i]+(g1[i]<=0.5f)*(0.7f-g1[i]);

g2[i]=g2[i]+(g2[i]<=0.5f)*(0.7f-g2[i]);

g3[i]=g3[i]+(g3[i]<=0.5f)*(0.7f-g3[i]);

g4[i]=g4[i]+(g4[i]<=0.5f)*(0.7f-g4[i]);

}[/codebox]

Also, what if I break up the above kernel into multiple kernels each handling a single operation? Would that be more efficient?



or Array of structs:

[codebox]struct TestStruct

{

float g1, g2, g3, g4, sum;

};



__global__ void test1b(TestStruct *structArr)

{

int i = blockIdx.x*512+threadIdx.x;

structArr[i].g1=structArr[i].g1*0.99f;

structArr[i].g2=structArr[i].g2*0.99f;

structArr[i].g3=structArr[i].g3*0.99f;

structArr[i].g4=structArr[i].g4*0.99f;

structArr[i].sum=structArr[i].g1+structArr[i].g2+structArr[i

].g3+structArr[i].g4;

structArr[i].g1=structArr[i].g1+(structArr[i].g1<=0.5f)*(0.7f-structArr[i].g1);

structArr[i].g2=structArr[i].g2+(structArr[i].g2<=0.5f)*(0.7f-structArr[i].g2);

structArr[i].g3=structArr[i].g3+(structArr[i].g3<=0.5f)*(0.7f-structArr[i].g3);

structArr[i].g4=structArr[i].g4+(structArr[i].g4<=0.5f)*(0.7f-structArr[i].g4);

}[/codebox]

I can pad the struct with dummy fields if that makes the memory access more efficient.



I've searched all over the place for this, but I still can't find a clear answer, and somehow the nvidia forum search engine is not working for me right now. So apologies if this has been discussed before.



Thanks,



hammer

#1
Posted 10/28/2010 05:39 PM   
So I'm wondering, with Fermi's cache memory architecture, which kernel would be more efficient?
Arrays:
[codebox]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum)
{
int i = blockIdx.x*512+threadIdx.x;
g1[i]=g1[i]*0.99f;
g2[i]=g2[i]*0.99f;
g3[i]=g3[i]*0.99f;
g4[i]=g1[i]*0.99f;
sum[i]=g1[i]+g2[i]+g3[i]+g4[i];
g1[i]=g1[i]+(g1[i]<=0.5f)*(0.7f-g1[i]);
g2[i]=g2[i]+(g2[i]<=0.5f)*(0.7f-g2[i]);
g3[i]=g3[i]+(g3[i]<=0.5f)*(0.7f-g3[i]);
g4[i]=g4[i]+(g4[i]<=0.5f)*(0.7f-g4[i]);
}[/codebox]
Also, what if I break up the above kernel into multiple kernels each handling a single operation? Would that be more efficient?

or Array of structs:
[codebox]struct TestStruct
{
float g1, g2, g3, g4, sum;
};

__global__ void test1b(TestStruct *structArr)
{
int i = blockIdx.x*512+threadIdx.x;
structArr[i].g1=structArr[i].g1*0.99f;
structArr[i].g2=structArr[i].g2*0.99f;
structArr[i].g3=structArr[i].g3*0.99f;
structArr[i].g4=structArr[i].g4*0.99f;
structArr[i].sum=structArr[i].g1+structArr[i].g2+structArr[i
].g3+structArr[i].g4;
structArr[i].g1=structArr[i].g1+(structArr[i].g1<=0.5f)*(0.7f-structArr[i].g1);
structArr[i].g2=structArr[i].g2+(structArr[i].g2<=0.5f)*(0.7f-structArr[i].g2);
structArr[i].g3=structArr[i].g3+(structArr[i].g3<=0.5f)*(0.7f-structArr[i].g3);
structArr[i].g4=structArr[i].g4+(structArr[i].g4<=0.5f)*(0.7f-structArr[i].g4);
}[/codebox]
I can pad the struct with dummy fields if that makes the memory access more efficient.

I've searched all over the place for this, but I still can't find a clear answer, and somehow the nvidia forum search engine is not working for me right now. So apologies if this has been discussed before.

Thanks,

hammer
So I'm wondering, with Fermi's cache memory architecture, which kernel would be more efficient?

Arrays:

[codebox]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum)

{

int i = blockIdx.x*512+threadIdx.x;

g1[i]=g1[i]*0.99f;

g2[i]=g2[i]*0.99f;

g3[i]=g3[i]*0.99f;

g4[i]=g1[i]*0.99f;

sum[i]=g1[i]+g2[i]+g3[i]+g4[i];

g1[i]=g1[i]+(g1[i]<=0.5f)*(0.7f-g1[i]);

g2[i]=g2[i]+(g2[i]<=0.5f)*(0.7f-g2[i]);

g3[i]=g3[i]+(g3[i]<=0.5f)*(0.7f-g3[i]);

g4[i]=g4[i]+(g4[i]<=0.5f)*(0.7f-g4[i]);

}[/codebox]

Also, what if I break up the above kernel into multiple kernels each handling a single operation? Would that be more efficient?



or Array of structs:

[codebox]struct TestStruct

{

float g1, g2, g3, g4, sum;

};



__global__ void test1b(TestStruct *structArr)

{

int i = blockIdx.x*512+threadIdx.x;

structArr[i].g1=structArr[i].g1*0.99f;

structArr[i].g2=structArr[i].g2*0.99f;

structArr[i].g3=structArr[i].g3*0.99f;

structArr[i].g4=structArr[i].g4*0.99f;

structArr[i].sum=structArr[i].g1+structArr[i].g2+structArr[i

].g3+structArr[i].g4;

structArr[i].g1=structArr[i].g1+(structArr[i].g1<=0.5f)*(0.7f-structArr[i].g1);

structArr[i].g2=structArr[i].g2+(structArr[i].g2<=0.5f)*(0.7f-structArr[i].g2);

structArr[i].g3=structArr[i].g3+(structArr[i].g3<=0.5f)*(0.7f-structArr[i].g3);

structArr[i].g4=structArr[i].g4+(structArr[i].g4<=0.5f)*(0.7f-structArr[i].g4);

}[/codebox]

I can pad the struct with dummy fields if that makes the memory access more efficient.



I've searched all over the place for this, but I still can't find a clear answer, and somehow the nvidia forum search engine is not working for me right now. So apologies if this has been discussed before.



Thanks,



hammer

#2
Posted 10/28/2010 05:39 PM   
as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.
so here i'd say it'd definetly benefit from separate arrays.
and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.
and loop so that the array size is not limited by/tied to the active thread count.

[code]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum, int n) {
int is = blockDim.x*gridDim.x;
for( int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += is) {
float _g1=g1[i]*0.99f;
float _g2=g2[i]*0.99f;
float _g3=g3[i]*0.99f;
float _g4=g4[i]*0.99f;
sum[i]=_g1+_g2+_g3+_g4;
g1[i]=_g1+(_g1<=0.5f)*(0.7f-_g1);
g2[i]=_g2+(_g2<=0.5f)*(0.7f-_g2);
g3[i]=_g3+(_g3<=0.5f)*(0.7f-_g3);
g4[i]=_g4+(_g4<=0.5f)*(0.7f-_g4);
}
}[/code]
as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.

so here i'd say it'd definetly benefit from separate arrays.

and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.

and loop so that the array size is not limited by/tied to the active thread count.



__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum, int n) {

int is = blockDim.x*gridDim.x;

for( int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += is) {

float _g1=g1[i]*0.99f;

float _g2=g2[i]*0.99f;

float _g3=g3[i]*0.99f;

float _g4=g4[i]*0.99f;

sum[i]=_g1+_g2+_g3+_g4;

g1[i]=_g1+(_g1<=0.5f)*(0.7f-_g1);

g2[i]=_g2+(_g2<=0.5f)*(0.7f-_g2);

g3[i]=_g3+(_g3<=0.5f)*(0.7f-_g3);

g4[i]=_g4+(_g4<=0.5f)*(0.7f-_g4);

}

}

Mobo: ASUS M2N-SLI

|-North bridge

| |-CPU: AMD Athlon II X4 640 (3.0Ghz)

| |-GPU: MSI 460GTX 1GB OC'd to 900Mhz

| \-MEM: 8GB DDR3

\-South bridge

. |-HDD: Onboard SATA RAID (0)

. | \- 4x WD 320GB 7200 RPM

. |-Audio: Onboard CMedia 7.1 channel

. \-NIC: Onboard Intel 1Gb Ethernet

#3
Posted 10/28/2010 07:19 PM   
as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.
so here i'd say it'd definetly benefit from separate arrays.
and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.
and loop so that the array size is not limited by/tied to the active thread count.

[code]__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum, int n) {
int is = blockDim.x*gridDim.x;
for( int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += is) {
float _g1=g1[i]*0.99f;
float _g2=g2[i]*0.99f;
float _g3=g3[i]*0.99f;
float _g4=g4[i]*0.99f;
sum[i]=_g1+_g2+_g3+_g4;
g1[i]=_g1+(_g1<=0.5f)*(0.7f-_g1);
g2[i]=_g2+(_g2<=0.5f)*(0.7f-_g2);
g3[i]=_g3+(_g3<=0.5f)*(0.7f-_g3);
g4[i]=_g4+(_g4<=0.5f)*(0.7f-_g4);
}
}[/code]
as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.

so here i'd say it'd definetly benefit from separate arrays.

and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.

and loop so that the array size is not limited by/tied to the active thread count.



__global__ void test1a(float *g1, float *g2, float *g3, float *g4, float *sum, int n) {

int is = blockDim.x*gridDim.x;

for( int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += is) {

float _g1=g1[i]*0.99f;

float _g2=g2[i]*0.99f;

float _g3=g3[i]*0.99f;

float _g4=g4[i]*0.99f;

sum[i]=_g1+_g2+_g3+_g4;

g1[i]=_g1+(_g1<=0.5f)*(0.7f-_g1);

g2[i]=_g2+(_g2<=0.5f)*(0.7f-_g2);

g3[i]=_g3+(_g3<=0.5f)*(0.7f-_g3);

g4[i]=_g4+(_g4<=0.5f)*(0.7f-_g4);

}

}

Mobo: ASUS M2N-SLI

|-North bridge

| |-CPU: AMD Athlon II X4 640 (3.0Ghz)

| |-GPU: MSI 460GTX 1GB OC'd to 900Mhz

| \-MEM: 8GB DDR3

\-South bridge

. |-HDD: Onboard SATA RAID (0)

. | \- 4x WD 320GB 7200 RPM

. |-Audio: Onboard CMedia 7.1 channel

. \-NIC: Onboard Intel 1Gb Ethernet

#4
Posted 10/28/2010 07:19 PM   
[quote name='happyjack272' post='1138372' date='Oct 28 2010, 02:19 PM']as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.
so here i'd say it'd definetly benefit from separate arrays.
and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.
and loop so that the array size is not limited by/tied to the active thread count.

[codebox]struct TestStruct
{
float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;
};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.
[quote name='happyjack272' post='1138372' date='Oct 28 2010, 02:19 PM']as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.

so here i'd say it'd definetly benefit from separate arrays.

and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.

and loop so that the array size is not limited by/tied to the active thread count.



[codebox]struct TestStruct

{

float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;

};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.

#5
Posted 10/28/2010 07:27 PM   
[quote name='happyjack272' post='1138372' date='Oct 28 2010, 02:19 PM']as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.
so here i'd say it'd definetly benefit from separate arrays.
and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.
and loop so that the array size is not limited by/tied to the active thread count.

[codebox]struct TestStruct
{
float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;
};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.
[quote name='happyjack272' post='1138372' date='Oct 28 2010, 02:19 PM']as written the struct is probably going to result in bank conflicts / uncoalesced access, and since it's 5 floats it's not going to align.

so here i'd say it'd definetly benefit from separate arrays.

and then i'd read the data into registers and operate on them there instead of doing so many memory reads and writes.

and loop so that the array size is not limited by/tied to the active thread count.



[codebox]struct TestStruct

{

float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;

};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.

#6
Posted 10/28/2010 07:27 PM   
[quote name='hammer256' post='1138379' date='Oct 28 2010, 08:27 PM']Thanks a lot. What If I define the struct like the following: [codebox]struct TestStruct
{
float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;
};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.[/quote]

sure, that'll align the memory access, but it wastes bandwidth and memory.

that's why i usually go for arrays over structs, esp. when the structs don't align well.
if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.

perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.
[quote name='hammer256' post='1138379' date='Oct 28 2010, 08:27 PM']Thanks a lot. What If I define the struct like the following: [codebox]struct TestStruct

{

float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;

};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.



sure, that'll align the memory access, but it wastes bandwidth and memory.



that's why i usually go for arrays over structs, esp. when the structs don't align well.

if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.



perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.

Mobo: ASUS M2N-SLI

|-North bridge

| |-CPU: AMD Athlon II X4 640 (3.0Ghz)

| |-GPU: MSI 460GTX 1GB OC'd to 900Mhz

| \-MEM: 8GB DDR3

\-South bridge

. |-HDD: Onboard SATA RAID (0)

. | \- 4x WD 320GB 7200 RPM

. |-Audio: Onboard CMedia 7.1 channel

. \-NIC: Onboard Intel 1Gb Ethernet

#7
Posted 10/28/2010 08:04 PM   
[quote name='hammer256' post='1138379' date='Oct 28 2010, 08:27 PM']Thanks a lot. What If I define the struct like the following: [codebox]struct TestStruct
{
float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;
};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.[/quote]

sure, that'll align the memory access, but it wastes bandwidth and memory.

that's why i usually go for arrays over structs, esp. when the structs don't align well.
if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.

perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.
[quote name='hammer256' post='1138379' date='Oct 28 2010, 08:27 PM']Thanks a lot. What If I define the struct like the following: [codebox]struct TestStruct

{

float g1, g2, g3, g4, sum, dummy1, dummy2, dummy3;

};[/codebox] would that resolve the memory access issues? But I guess your way make a lot of sense for this.



sure, that'll align the memory access, but it wastes bandwidth and memory.



that's why i usually go for arrays over structs, esp. when the structs don't align well.

if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.



perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.

Mobo: ASUS M2N-SLI

|-North bridge

| |-CPU: AMD Athlon II X4 640 (3.0Ghz)

| |-GPU: MSI 460GTX 1GB OC'd to 900Mhz

| \-MEM: 8GB DDR3

\-South bridge

. |-HDD: Onboard SATA RAID (0)

. | \- 4x WD 320GB 7200 RPM

. |-Audio: Onboard CMedia 7.1 channel

. \-NIC: Onboard Intel 1Gb Ethernet

#8
Posted 10/28/2010 08:04 PM   
[quote name='happyjack272' post='1138398' date='Oct 28 2010, 03:04 PM']sure, that'll align the memory access, but it wastes bandwidth and memory.

that's why i usually go for arrays over structs, esp. when the structs don't align well.
if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.

perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.[/quote]

Cool, I'll try that and see what kind of performance I get.
[quote name='happyjack272' post='1138398' date='Oct 28 2010, 03:04 PM']sure, that'll align the memory access, but it wastes bandwidth and memory.



that's why i usually go for arrays over structs, esp. when the structs don't align well.

if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.



perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.



Cool, I'll try that and see what kind of performance I get.

#9
Posted 10/28/2010 09:18 PM   
[quote name='happyjack272' post='1138398' date='Oct 28 2010, 03:04 PM']sure, that'll align the memory access, but it wastes bandwidth and memory.

that's why i usually go for arrays over structs, esp. when the structs don't align well.
if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.

perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.[/quote]

Cool, I'll try that and see what kind of performance I get.
[quote name='happyjack272' post='1138398' date='Oct 28 2010, 03:04 PM']sure, that'll align the memory access, but it wastes bandwidth and memory.



that's why i usually go for arrays over structs, esp. when the structs don't align well.

if they align well, on the other hand, using structs might make for more efficient use of the cache or texture fetching.



perhaps a good compromise might be to make g1,2,3, & 4 a struct or at least a 2d array or array of 4-vectors, and then have sum a separate array of its own.



Cool, I'll try that and see what kind of performance I get.

#10
Posted 10/28/2010 09:18 PM   
Scroll To Top