I have a default raspberry pi running raspbian, assuming it’s running at 700MHz. I’d think that every instruction would take only one or a few cycles at most. In my experiments that doesn’t seem to be the case.
I use the 1MHz timer and count the number of instructions I can do in a given timeframe (e.g. 10ms). For my loop of 4 assembly instructions, that’s 146067, i.e. 14.6 loop-iterations per μs. @700MHz (700 cycles per μs) that’s 48 cycles per iteration or 12 cycles per assembly instruction.
Is that correct (i.e. is the PI actually much slower than I thought it would be), or is there an error in my experiments?
I made sure that context switches aren’t an issue and interrupts should be disabled.
Here’s the code I used for instrumentation:
// Build:
// CC=arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc
// $CC -Ofast instruction_timing.c -o instruction_timing
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#define TIMER_BASE 0x20003000
#define INT_BASE 0x2000B000
#define ITERATIONS_US 10000
volatile unsigned *timer, *intrupt;
int setup(void);
int interrupts(int flag);
main() {
sleep(1); // 1 second delay
// When the program starts, the interrupt
// system may still be dealing with the
// last Enter keystroke. This gives it
// time to finish.
setup(); // setup timer and interrupt pointers
interrupts(0); // Disable interrupts to ensure
// accurate timing.
// Re-enable via interrupts(1) as
// soon as accurate timing is no
// longer needed.
// We perform this 20 times.
// /proc/$pid/status says ~10 involuntary context-switches happen.
// This way we make sure at least one of the loops was without
// a context switch.
unsigned int switches[20];
unsigned int timend;
unsigned int i;
for (i = 0; i < 20; i++) {
unsigned int switches_ = switches[i] = 0;
// Count the iterations we can do within ITERATIONS_US
timend = *timer + ITERATIONS_US;
while (*timer < timend) switches_++;
switches[i] = switches_;
}
interrupts(1); // re-enable interrupts
for (i = 0; i < 20; i++) {
printf("[%02d] iterations/us: %.4f\n", i,
((double)switches[i]) / ITERATIONS_US);
}
// Sleep a bit so that our bash-script has time to record the number
// of context switches.
sleep(2);
return;
}
// 14.6067 iterations / μs
// 104a8: e5902000 ldr r2, [r0]
// 104ac: e2833001 add r3, r3, #1
// 104b0: e1510002 cmp r1, r2
// 104b4: 8afffffb bhi 104a8 <main+0x50>
// 4 assembly instructions per iteration
// @700MHz (700 cycles per μs) that’s 48 cycles per iteration or 12 cycles per
// instruction
// Shamelessly stolen from somewhere else
int interrupts(int flag) {
static unsigned int sav132 = 0;
static unsigned int sav133 = 0;
static unsigned int sav134 = 0;
if (flag == 0) // disable
{
if (sav132 != 0) {
// Interrupts already disabled so avoid printf
return (0);
}
if ((*(intrupt + 128) | *(intrupt + 129) | *(intrupt + 130)) != 0) {
printf("Pending interrupts\n"); // may be OK but probably
return (0); // better to wait for the
} // pending interrupts to
// clear
sav134 = *(intrupt + 134);
*(intrupt + 137) = sav134;
sav132 = *(intrupt + 132); // save current interrupts
*(intrupt + 135) = sav132; // disable active interrupts
sav133 = *(intrupt + 133);
*(intrupt + 136) = sav133;
} else // flag = 1 enable
{
if (sav132 == 0) {
printf("Interrupts not disabled\n");
return (0);
}
*(intrupt + 132) = sav132; // restore saved interrupts
*(intrupt + 133) = sav133;
*(intrupt + 134) = sav134;
sav132 = 0; // indicates interrupts enabled
}
return (1);
}
int setup() {
int memfd;
void *timer_map, *int_map;
memfd = open("/dev/mem", O_RDWR | O_SYNC);
if (memfd < 0) {
printf("Mem open error\n");
return (0);
}
timer_map =
mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, TIMER_BASE);
int_map =
mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, INT_BASE);
close(memfd);
if (timer_map == MAP_FAILED || int_map == MAP_FAILED) {
printf("Map failed\n");
return (0);
}
// interrupt pointer
intrupt = (volatile unsigned *)int_map;
// timer pointer
timer = (volatile unsigned *)timer_map;
++timer; // timer lo 4 bytes
// timer hi 4 bytes available via *(timer+1)
return (1);
}
Here’s how I run it:
$ sudo ./instruction_timing & sleep 2; cat /proc/$!/status | grep ctx
[1] 1061
[00] iterations/us: 14.6298
[01] iterations/us: 14.6346
[02] iterations/us: 14.6076
[03] iterations/us: 14.6075
[04] iterations/us: 14.6068
[05] iterations/us: 14.6076
[06] iterations/us: 14.6072
[07] iterations/us: 14.6075
[08] iterations/us: 14.6069
[09] iterations/us: 14.6071
[10] iterations/us: 14.6072
[11] iterations/us: 14.6074
[12] iterations/us: 14.6072
[13] iterations/us: 14.6074
[14] iterations/us: 14.6068
[15] iterations/us: 14.6074
[16] iterations/us: 14.6072
[17] iterations/us: 14.6076
[18] iterations/us: 14.6068
[19] iterations/us: 14.6071
voluntary_ctxt_switches: 5
nonvoluntary_ctxt_switches: 13