4

I have a default raspberry pi running raspbian, assuming it’s running at 700MHz. I’d think that every instruction would take only one or a few cycles at most. In my experiments that doesn’t seem to be the case.

I use the 1MHz timer and count the number of instructions I can do in a given timeframe (e.g. 10ms). For my loop of 4 assembly instructions, that’s 146067, i.e. 14.6 loop-iterations per μs. @700MHz (700 cycles per μs) that’s 48 cycles per iteration or 12 cycles per assembly instruction.

Is that correct (i.e. is the PI actually much slower than I thought it would be), or is there an error in my experiments?

I made sure that context switches aren’t an issue and interrupts should be disabled.

Here’s the code I used for instrumentation:

// Build:
// CC=arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc
// $CC -Ofast instruction_timing.c -o instruction_timing

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define TIMER_BASE 0x20003000
#define INT_BASE 0x2000B000

#define ITERATIONS_US 10000

volatile unsigned *timer, *intrupt;

int setup(void);
int interrupts(int flag);

main() {
  sleep(1);  // 1 second delay
             // When the program starts, the interrupt
             // system may still be dealing with the
             // last Enter keystroke. This gives it
             // time to finish.

  setup();  // setup timer and interrupt pointers

  interrupts(0);  // Disable interrupts to ensure
                  // accurate timing.
                  // Re-enable via interrupts(1) as
                  // soon as accurate timing is no
                  // longer needed.

  // We perform this 20 times.
  // /proc/$pid/status says ~10 involuntary context-switches happen.
  // This way we make sure at least one of the loops was without
  // a context switch.
  unsigned int switches[20];
  unsigned int timend;
  unsigned int i;
  for (i = 0; i < 20; i++) {
    unsigned int switches_ = switches[i] = 0;
    // Count the iterations we can do within ITERATIONS_US
    timend = *timer + ITERATIONS_US;
    while (*timer < timend) switches_++;
    switches[i] = switches_;
  }

  interrupts(1);  // re-enable interrupts
  for (i = 0; i < 20; i++) {
    printf("[%02d] iterations/us: %.4f\n", i,
           ((double)switches[i]) / ITERATIONS_US);
  }
  // Sleep a bit so that our bash-script has time to record the number
  // of context switches.
  sleep(2);
  return;
}

// 14.6067 iterations / μs
//   104a8:       e5902000        ldr     r2, [r0]
//   104ac:       e2833001        add     r3, r3, #1
//   104b0:       e1510002        cmp     r1, r2
//   104b4:       8afffffb        bhi     104a8 <main+0x50>
// 4 assembly instructions per iteration
// @700MHz (700 cycles per μs) that’s 48 cycles per iteration or 12 cycles per
// instruction

// Shamelessly stolen from somewhere else
int interrupts(int flag) {
  static unsigned int sav132 = 0;
  static unsigned int sav133 = 0;
  static unsigned int sav134 = 0;

  if (flag == 0)  // disable
  {
    if (sav132 != 0) {
      // Interrupts already disabled so avoid printf
      return (0);
    }

    if ((*(intrupt + 128) | *(intrupt + 129) | *(intrupt + 130)) != 0) {
      printf("Pending interrupts\n");  // may be OK but probably
      return (0);                      // better to wait for the
    }                                  // pending interrupts to
                                       // clear

    sav134 = *(intrupt + 134);
    *(intrupt + 137) = sav134;
    sav132 = *(intrupt + 132);  // save current interrupts
    *(intrupt + 135) = sav132;  // disable active interrupts
    sav133 = *(intrupt + 133);
    *(intrupt + 136) = sav133;
  } else  // flag = 1 enable
  {
    if (sav132 == 0) {
      printf("Interrupts not disabled\n");
      return (0);
    }

    *(intrupt + 132) = sav132;  // restore saved interrupts
    *(intrupt + 133) = sav133;
    *(intrupt + 134) = sav134;
    sav132 = 0;  // indicates interrupts enabled
  }
  return (1);
}

int setup() {
  int memfd;
  void *timer_map, *int_map;

  memfd = open("/dev/mem", O_RDWR | O_SYNC);
  if (memfd < 0) {
    printf("Mem open error\n");
    return (0);
  }

  timer_map =
      mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, TIMER_BASE);

  int_map =
      mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, INT_BASE);

  close(memfd);

  if (timer_map == MAP_FAILED || int_map == MAP_FAILED) {
    printf("Map failed\n");
    return (0);
  }
  // interrupt pointer
  intrupt = (volatile unsigned *)int_map;
  // timer pointer
  timer = (volatile unsigned *)timer_map;
  ++timer;  // timer lo 4 bytes
            // timer hi 4 bytes available via *(timer+1)

  return (1);
}

Here’s how I run it:

$ sudo ./instruction_timing & sleep 2; cat /proc/$!/status | grep ctx
[1] 1061
[00] iterations/us: 14.6298
[01] iterations/us: 14.6346
[02] iterations/us: 14.6076
[03] iterations/us: 14.6075
[04] iterations/us: 14.6068
[05] iterations/us: 14.6076
[06] iterations/us: 14.6072
[07] iterations/us: 14.6075
[08] iterations/us: 14.6069
[09] iterations/us: 14.6071
[10] iterations/us: 14.6072
[11] iterations/us: 14.6074
[12] iterations/us: 14.6072
[13] iterations/us: 14.6074
[14] iterations/us: 14.6068
[15] iterations/us: 14.6074
[16] iterations/us: 14.6072
[17] iterations/us: 14.6076
[18] iterations/us: 14.6068
[19] iterations/us: 14.6071
voluntary_ctxt_switches:    5
nonvoluntary_ctxt_switches: 13
rumpel
  • 143
  • 1
  • 4

2 Answers2

1

You are not using a realtime OS but a general purpose OS. You are assuming that your raspberry pi is doing nothing else than just those commands. But in reality, it is logging and other things.

More info on RTOS here.

Dr_Bunsen
  • 496
  • 3
  • 11
1

Your problem may be here:

ldr     r2, [r0]

This instruction is accessing a memory address, so it may take an almost arbitrary number of cycles to complete (up to a few hundreds). Worse, you're accessing a memory-mapped peripheral (timer) instead of a regular RAM, so you're guaranteed to have a cache miss.

Try making a loop which only works with registers, and read the timer before and after the loop. You may get more realistic values that way.

Dmitry Grigoryev
  • 28,277
  • 6
  • 54
  • 147