I was going to run this both with “-g” and with “-O3” optimization on a TX2 (R28.2):
# With debug symbols
gcc <b>-Wall -g</b> bitBang.c -o bitBang
# Optimized (not useful, this causes the register to not actually be touched)
gcc <u><b>-Wall -O3</b></u> bitBang.c -o bitBang2
# Further down a profiled version is used:
gcc <u><b>-Wall -g -pg -O0</b></u> bitBang.c -o bitBang3
I also thought of running this in a profiler and with some improvements for debugging. I was reminded that optimizing certain kinds of code used with registers or bare metal may not have the effect intended. The “-O3” optimized version had to be left out of testing…it seems the optimizer thought the register changes for high and low could be removed. The “-g” and “-pg” versions should be ok though (or “-O0” to disable optimizing even if debug symbols are not kept).
I separated out the loop of setting high and low into function “exercise” (see the source below…segregation of the actual test loop was for use in profiling). “exercise” itself has separated out “setHigh” and “setLow” for allowing profiling to see if high and low settings differed in performance (the two were very close in time, but setting low outperformed setting high).
So then I tried the same thing on the TX1 (R28.2), and discovered this was true there as well. Optimizing causes the loop to exit instantly. Normally I would only expect to see bugs change between an optimized and debug version only if there is a stack overflow or stack corruption. I did not confirm where the error actually is, but I am wondering if there is some sort of very simple error causing the bit flipping to not be tied to the GPIO pins.
Try running this version with both the “-Wall -g” options and the “-Wall -O3” options on both the TX1 and TX2. See if the optimized version is instant. This will demo why optimizing doesn’t work. Assuming you see the same behavior, try putting a 1 or 2 second sleep between the high and low change, and see if the pins in question really do go high or low when they should (GPIO pins won’t change…the relevant code was optimized out because the compiler didn’t know the register really needs to be poked).
/*----------------------------------------------------------------------*/
/* Include dependant headers. */
/*----------------------------------------------------------------------*/
#include <assert.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include <time.h>
#include <errno.h>
/*----------------------------------------------------------------------*/
/* Declare local/private types. */
/*----------------------------------------------------------------------*/
/*! GPIO register base. */
static volatile uint8_t * TegraGpio_X1RegBase = NULL;
static volatile uint8_t * TegraMainGpio_X2RegBase = NULL;
static volatile uint8_t * TegraAonGpio_X2RegBase = NULL;
static const unsigned int ClkGpioPinNumX2 = 388; // GPIO8
//static const unsigned int DataGpioPinNumX2 = 298; // GPIO9
static const unsigned int ClkGpioPinNumX1 = 187; // GPIO8
//static const unsigned int DataGpioPinNumX1 = 186; // GPIO9
static int TegraGpio_MemFd = -1;
static int DeviceType = 0 ; //!< 0: n/a 1: TX1 2: TX2
/*----------------------------------------------------------------------*/
/* Define local/private data. */
/*----------------------------------------------------------------------*/
#define GPIO_BASE (0x6000D000)
#define GPIO_CONTROLLER (0x0100)
#define GPIO_PORTS_PER_CONTROLLER (0x04)
#define GPIO_BITS_PER_PORT (0x08)
#define GPIO_BITS_PER_CONTROLLER (GPIO_BITS_PER_PORT * GPIO_PORTS_PER_CONTROLLER)
#define GPIO_CNF (0x80)
#define GPIO_OE (0x90)
#define GPIO_OUT (0xA0)
#define GPIO_IN (0x30)
// X2 definitions
#define MAIN_GPIO_BASE_X2 (0x02200000 + 0x10000)
#define AON_GPIO_BASE_X2 (0x0C2F0000 + 0x01000)
#define X2_CNF_OFFSET 0x00
#define X2_IN_OFFSET 0x08
#define X2_OE_OFFSET 0x0C
#define X2_OUT_OFFSET 0x10
#define X2_CONF_OUTPUT 0x03
#define X2_CONF_INPUT 0x01
#define X2_OUTPUT_DRIVE 0x00
#define X2_OUTPUT_FLOAT 0x01
#define sysFs_Main_PinNumOffset 320
#define sysFs_Aon_PinNumOffset 256
/*! @brief Read the Tegra_Id_Fuse value
*
* @return 33 if TX1, 24 if TX2
*/
int tegraFuseValue()
{
int val = 0;
FILE * tegraChipId = fopen("/sys/module/tegra_fuse/parameters/tegra_chip_id","r");
assert( tegraChipId );
int scan_ret = fscanf(tegraChipId, "%d",&val );
switch (scan_ret)
{
case EOF:
printf("Scanned EOF\n");
break;
case EAGAIN:
printf("The file descriptor underlying stream is marked nonblocking, and the read operation would block.\n");
break;
case EBADF:
printf("The file descriptor underlying stream is invalid, or not open for reading.\n");
break;
case EILSEQ:
printf("Input byte sequence does not form a valid character.\n");
break;
case EINTR:
printf("The read operation was interrupted by a signal; see signal(7).\n");
break;
case EINVAL:
printf("Not enough arguments; or format is NULL.\n");
break;
case ENOMEM:
printf("Out of memory.\n");
break;
case ERANGE:
printf("The result of an integer conversion would exceed the size that can be stored in the corresponding integer type.\n");
break;
default:
printf("fscanf returned %i (content %d).\n", scan_ret, val);
}
fclose(tegraChipId);
return val;
}
/*! @brief LookupTable for TX2 sysfs pins
*
* @return register address for the sysFsPin Id, NULL for pin Number = 0.
*/
volatile uint8_t *
TegraGpio_BaseAddressX2(
unsigned int inSysFsPinNum //!< Pin Number used in TegraGpio_PinLookupX
)
{
//!@pre Not open.
if( 0 == inSysFsPinNum)
{
return NULL;
}
assert( (sysFs_Aon_PinNumOffset < inSysFsPinNum ) && (( sysFs_Main_PinNumOffset + 5 * 8 * 8 ) > inSysFsPinNum ) );
unsigned int sysFsPinOffset = (inSysFsPinNum - sysFs_Aon_PinNumOffset) % 8 ;
volatile uint8_t * baseRegisterAddress = NULL ;
if( inSysFsPinNum < sysFs_Main_PinNumOffset)
{
#define TEGRA_AON_GPIO_PORT_S 0
#define TEGRA_AON_GPIO_PORT_U 1
#define TEGRA_AON_GPIO_PORT_V 2
#define TEGRA_AON_GPIO_PORT_W 3
#define TEGRA_AON_GPIO_PORT_Z 4
#define TEGRA_AON_GPIO_PORT_AA 5
#define TEGRA_AON_GPIO_PORT_EE 6
#define TEGRA_AON_GPIO_PORT_FF 7
#define TEGRA_AON_GPIO_PORT_S_ORDER 1
#define TEGRA_AON_GPIO_PORT_U_ORDER 2
#define TEGRA_AON_GPIO_PORT_V_ORDER 4
#define TEGRA_AON_GPIO_PORT_W_ORDER 5
#define TEGRA_AON_GPIO_PORT_Z_ORDER 7
#define TEGRA_AON_GPIO_PORT_AA_ORDER 6
#define TEGRA_AON_GPIO_PORT_EE_ORDER 3
#define TEGRA_AON_GPIO_PORT_FF_ORDER 0
unsigned int sysFsPortNum = (inSysFsPinNum - sysFs_Aon_PinNumOffset) / 8;
assert(sysFsPortNum < 8);
int AON_LOOKUP[8];
AON_LOOKUP[TEGRA_AON_GPIO_PORT_S] = TEGRA_AON_GPIO_PORT_S_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_U] = TEGRA_AON_GPIO_PORT_U_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_V] = TEGRA_AON_GPIO_PORT_V_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_W] = TEGRA_AON_GPIO_PORT_W_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_Z] = TEGRA_AON_GPIO_PORT_Z_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_AA] = TEGRA_AON_GPIO_PORT_AA_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_EE] = TEGRA_AON_GPIO_PORT_EE_ORDER;
AON_LOOKUP[TEGRA_AON_GPIO_PORT_FF] = TEGRA_AON_GPIO_PORT_FF_ORDER;
baseRegisterAddress = TegraAonGpio_X2RegBase + 0x200 * AON_LOOKUP[sysFsPortNum] + sysFsPinOffset * 0x20;
}
else
{
#define TEGRA_MAIN_GPIO_PORT_A 0
#define TEGRA_MAIN_GPIO_PORT_B 1
#define TEGRA_MAIN_GPIO_PORT_C 2
#define TEGRA_MAIN_GPIO_PORT_D 3
#define TEGRA_MAIN_GPIO_PORT_E 4
#define TEGRA_MAIN_GPIO_PORT_F 5
#define TEGRA_MAIN_GPIO_PORT_G 6
#define TEGRA_MAIN_GPIO_PORT_H 7
#define TEGRA_MAIN_GPIO_PORT_I 8
#define TEGRA_MAIN_GPIO_PORT_J 9
#define TEGRA_MAIN_GPIO_PORT_K 10
#define TEGRA_MAIN_GPIO_PORT_L 11
#define TEGRA_MAIN_GPIO_PORT_M 12
#define TEGRA_MAIN_GPIO_PORT_N 13
#define TEGRA_MAIN_GPIO_PORT_O 14
#define TEGRA_MAIN_GPIO_PORT_P 15
#define TEGRA_MAIN_GPIO_PORT_Q 16
#define TEGRA_MAIN_GPIO_PORT_R 17
#define TEGRA_MAIN_GPIO_PORT_T 18
#define TEGRA_MAIN_GPIO_PORT_X 19
#define TEGRA_MAIN_GPIO_PORT_Y 20
#define TEGRA_MAIN_GPIO_PORT_BB 21
#define TEGRA_MAIN_GPIO_PORT_CC 22
#define TEGRA_MAIN_GPIO_PORT_DD 23
#define TEGRA_MAIN_GPIO_PORT_N_ORDER 0
#define TEGRA_MAIN_GPIO_PORT_O_ORDER 1
#define TEGRA_MAIN_GPIO_PORT_Q_ORDER 2
#define TEGRA_MAIN_GPIO_PORT_T_ORDER 3
#define TEGRA_MAIN_GPIO_PORT_I_ORDER 4
#define TEGRA_MAIN_GPIO_PORT_R_ORDER 5
#define TEGRA_MAIN_GPIO_PORT_H_ORDER 8
#define TEGRA_MAIN_GPIO_PORT_L_ORDER 9
#define TEGRA_MAIN_GPIO_PORT_X_ORDER 10
#define TEGRA_MAIN_GPIO_PORT_Y_ORDER 11
#define TEGRA_MAIN_GPIO_PORT_A_ORDER 16
#define TEGRA_MAIN_GPIO_PORT_E_ORDER 17
#define TEGRA_MAIN_GPIO_PORT_F_ORDER 18
#define TEGRA_MAIN_GPIO_PORT_BB_ORDER 19
#define TEGRA_MAIN_GPIO_PORT_B_ORDER 24
#define TEGRA_MAIN_GPIO_PORT_C_ORDER 25
#define TEGRA_MAIN_GPIO_PORT_D_ORDER 26
#define TEGRA_MAIN_GPIO_PORT_P_ORDER 32
#define TEGRA_MAIN_GPIO_PORT_G_ORDER 33
#define TEGRA_MAIN_GPIO_PORT_J_ORDER 40
#define TEGRA_MAIN_GPIO_PORT_K_ORDER 41
#define TEGRA_MAIN_GPIO_PORT_CC_ORDER 42
#define TEGRA_MAIN_GPIO_PORT_M_ORDER 43
#define TEGRA_MAIN_GPIO_PORT_DD_ORDER 44
unsigned int sysFsPortNum = (inSysFsPinNum - sysFs_Main_PinNumOffset) / 8;
assert(sysFsPortNum < 24);
int MAIN_LOOKUP[24];
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_N] = TEGRA_MAIN_GPIO_PORT_N_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_O] = TEGRA_MAIN_GPIO_PORT_O_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_Q] = TEGRA_MAIN_GPIO_PORT_Q_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_T] = TEGRA_MAIN_GPIO_PORT_T_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_I] = TEGRA_MAIN_GPIO_PORT_I_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_R] = TEGRA_MAIN_GPIO_PORT_R_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_H] = TEGRA_MAIN_GPIO_PORT_H_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_L] = TEGRA_MAIN_GPIO_PORT_L_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_X] = TEGRA_MAIN_GPIO_PORT_X_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_Y] = TEGRA_MAIN_GPIO_PORT_Y_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_A] = TEGRA_MAIN_GPIO_PORT_A_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_E] = TEGRA_MAIN_GPIO_PORT_E_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_F] = TEGRA_MAIN_GPIO_PORT_F_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_BB] = TEGRA_MAIN_GPIO_PORT_BB_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_B] = TEGRA_MAIN_GPIO_PORT_B_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_C] = TEGRA_MAIN_GPIO_PORT_C_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_D] = TEGRA_MAIN_GPIO_PORT_D_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_P] = TEGRA_MAIN_GPIO_PORT_P_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_G] = TEGRA_MAIN_GPIO_PORT_G_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_J] = TEGRA_MAIN_GPIO_PORT_J_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_K] = TEGRA_MAIN_GPIO_PORT_K_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_CC] = TEGRA_MAIN_GPIO_PORT_CC_ORDER;
MAIN_LOOKUP[TEGRA_MAIN_GPIO_PORT_M] = TEGRA_MAIN_GPIO_PORT_M_ORDER;
baseRegisterAddress = TegraMainGpio_X2RegBase + 0x200 * MAIN_LOOKUP[sysFsPortNum] + sysFsPinOffset * 0x20;
}
assert(baseRegisterAddress);
return baseRegisterAddress ;
}
/*----------------------------------------------------------------------*/
/* Define external api. */
/*----------------------------------------------------------------------*/
unsigned long
gpioTx1PinOffset (
unsigned int inGpioNum
)
{
return (( (((inGpioNum % GPIO_BITS_PER_CONTROLLER) / GPIO_BITS_PER_PORT) * GPIO_PORTS_PER_CONTROLLER) + ((inGpioNum / GPIO_BITS_PER_CONTROLLER) * GPIO_CONTROLLER) ));
}
void setHigh(uint32_t* GpioClk_OutReg, unsigned int outHighValue)
{
*GpioClk_OutReg = outHighValue;
}
void setLow(uint32_t* GpioClk_OutReg, unsigned int outLowValue)
{
*GpioClk_OutReg = outLowValue;
}
void
exercise(const unsigned int count, uint32_t* GpioClk_OutReg, unsigned int outHighValue, unsigned int outLowValue)
{
printf("Begin exercise.\n");
assert(count > 0);
unsigned int iClk;
for (iClk = 0 ; iClk < count ; ++iClk)
{
setHigh(GpioClk_OutReg, outHighValue);
setLow(GpioClk_OutReg, outLowValue);
}
printf("End exercise (count %u).\n", iClk);
}
/*! @brief Instantiate a VDU class and a GPU Mgr class
*/
int main()
{
uint32_t * TegraGpioClk_CnfReg = NULL;
uint32_t * TegraGpioClk_OeReg = NULL;
uint32_t * TegraGpioClk_OutReg = NULL;
uint32_t * TegraGpioClk_InReg = NULL;
(void) TegraGpioClk_InReg;
unsigned int TegraGpioClk_OutHighValue = 0x01;
unsigned int TegraGpioClk_OutLowValue = 0x00;
//Open Gpios
if( (TegraGpio_MemFd = open( "/dev/mem", O_RDWR | O_SYNC)) >= 0 ) // RATS: ignore Filename constructed by us so known to be
switch( tegraFuseValue() ) // Is TX1 or TX2
{
case 33 : DeviceType = 1; printf("Device type TX1.\n");break;
case 24 : DeviceType = 2; printf("Device type TX2.\n"); break;
default: assert( 1 == DeviceType || 2 == DeviceType );
}
// Open TX1 pins and map registers.
if (1 == DeviceType)
{
printf("Mapping TX1 GPIO.\n");
// Use mmap to gain access to GPIO register via a memory pointer.
TegraGpio_X1RegBase = (uint8_t *)mmap(
NULL,
0x1000,
PROT_READ|PROT_WRITE,
MAP_SHARED,
TegraGpio_MemFd,
GPIO_BASE );
// Init lookup arrays.
assert( (long)TegraGpio_X1RegBase > 0 );
unsigned long clkGpioPinOffset = gpioTx1PinOffset( ClkGpioPinNumX1 );
TegraGpioClk_CnfReg = (uint32_t *)(TegraGpio_X1RegBase + clkGpioPinOffset + GPIO_CNF );
TegraGpioClk_OeReg = (uint32_t *)(TegraGpio_X1RegBase + clkGpioPinOffset + GPIO_OE );
TegraGpioClk_OutReg = (uint32_t *)(TegraGpio_X1RegBase + clkGpioPinOffset + GPIO_OUT );
TegraGpioClk_InReg = (uint32_t *)(TegraGpio_X1RegBase + clkGpioPinOffset + GPIO_IN );
TegraGpioClk_OutHighValue = (0x0101U << (ClkGpioPinNumX1 % 8 ) );
TegraGpioClk_OutLowValue = (0x0100U << (ClkGpioPinNumX1 % 8 ) );
}
if (2 == DeviceType)
{
printf("Mapping TX2 GPIO.\n");
// Use mmap to gain access to GPIO register via a memory pointer.
TegraMainGpio_X2RegBase = (uint8_t *)mmap(
NULL,
0x10000,
PROT_READ|PROT_WRITE,
MAP_SHARED,
TegraGpio_MemFd,
MAIN_GPIO_BASE_X2 );
TegraAonGpio_X2RegBase = (uint8_t *)mmap(
NULL,
0x2000,
PROT_READ|PROT_WRITE,
MAP_SHARED,
TegraGpio_MemFd,
AON_GPIO_BASE_X2 );
// Init lookup arrays.
assert( (long)TegraMainGpio_X2RegBase > 0 );
assert( (long)TegraAonGpio_X2RegBase > 0 );
volatile uint8_t* clkPinBaseAddress = TegraGpio_BaseAddressX2( ClkGpioPinNumX2);
TegraGpioClk_CnfReg = (uint32_t *)(clkPinBaseAddress + X2_CNF_OFFSET );
TegraGpioClk_OeReg = (uint32_t *)(clkPinBaseAddress + X2_OE_OFFSET );
TegraGpioClk_OutReg = (uint32_t *)(clkPinBaseAddress + X2_OUT_OFFSET );
TegraGpioClk_InReg = (uint32_t *)(clkPinBaseAddress + X2_IN_OFFSET );
TegraGpioClk_OutHighValue = 0x01;
TegraGpioClk_OutLowValue = 0x00;
}
// Set Clk as output
switch(DeviceType)
{
case 1:
printf("Set clock, TX1:\n");
*TegraGpioClk_CnfReg = (0x0101U << ( ClkGpioPinNumX1 % 8 )); // Gpio to change, set as GPIO.
*TegraGpioClk_OeReg = (0x0101U << ( ClkGpioPinNumX1 % 8 )); // Gpio to change, set as output.
break;
case 2:
printf("Set clock, TX2:\n");
*TegraGpioClk_CnfReg = (X2_CONF_OUTPUT ); // Gpio to change, set as GPIO.
*TegraGpioClk_OeReg = (X2_OUTPUT_DRIVE) ; // Gpio to change, set as output.
break;
default:
assert(!"Device type not supported.");
}
if (DeviceType == 1 || DeviceType == 2)
{
printf(" TegraGpioClk_CnfReg: 0x%x.\n", *TegraGpioClk_CnfReg);
printf(" TegraGpioClk_OeReg: 0x%x.\n", *TegraGpioClk_OeReg);
}
// Toggle Gpios
clock_t startTime = time(NULL);
const unsigned int numClockPeriods = 100 * 1000 * 1000;
printf("numClockPeriods: %i.\n", numClockPeriods);
assert(TegraGpioClk_OutLowValue != TegraGpioClk_OutHighValue);
exercise(numClockPeriods, TegraGpioClk_OutReg, TegraGpioClk_OutHighValue, TegraGpioClk_OutLowValue);
printf("\n");
clock_t finishTime = time(NULL);
double duration = ( finishTime - startTime );
printf("\nTime taken %.2f, Clock speed %.2f MHz\n", duration , numClockPeriods / (duration * 1000000));
// Close Gpios
if (1 == DeviceType)
{
int rc = munmap( (void *)TegraGpio_X1RegBase, GPIO_CONTROLLER );
assert( 0 == rc ); (void)rc;
TegraGpio_X1RegBase = NULL;
}
if(2 == DeviceType)
{
int rc = munmap( (void *)TegraMainGpio_X2RegBase, 0x10000 );
rc += munmap( (void *)TegraAonGpio_X2RegBase, 0x2000 );
assert( 0 == rc ); (void)rc;
TegraMainGpio_X2RegBase = NULL;
TegraAonGpio_X2RegBase = NULL;
}
close( TegraGpio_MemFd ); TegraGpio_MemFd = -1;
return(0);
}
About profiling…
If you compile with the “-pg” option, then each time you run the program a “gmon.out” file is generated. Running the program with gprof will then give you times in various functions, e.g.:
g++ -Wall -g -pg -O0 bitBang.c -o bitBang3
./bitBang3
gprof ./bitBang3
Remember that the gmon.out file changes each run.
I can’t tell you why the TX2 performance is so different versus the TX1. Profiling the test program at least gives a starting point to anyone who might know the details to answer.