I’m experimenting with different ways to program an ESP, including native code, MicroPython, Lua on RTOS, JavaScript, and WebAssembly. I want to compare these options in terms of execution time, CPU load, memory usage, and storage footprint.
What’s the best way to set up practical benchmarks across these environments so the results are fair and comparable? Is there an ideal approach or toolchain for measuring these metrics reliably on the ESP?
ESP Performance Benchmarks
Re: ESP Performance Benchmarks
Below you'll find the benchmark I use for the ESP32-P4 in C, along with my results at 400 MHz. Maybe this will give you some inspiration for what you can use to start checking.
As for memory footprint, probably the best thing to do is ask the RTOS for the task/thread stack watermark, as long as all your variables are local. I went ahead and put an example in here at the end of the test.
(3584-byte main task stack − 1988 bytes remaining at the high-water mark = 1596 bytes of stack used)
As for memory footprint, probably the best thing to do is ask the RTOS for the task/thread stack watermark, as long as all your variables are local. I went ahead and put an example in here at the end of the test.
Code: Select all
/* Original thread: https://esp32.com/viewtopic.php?p=136103#p136103 */
#include <stdio.h>
#include <esp_attr.h>
#include <esp_timer.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include "riscv/rv_utils.h"
#include "hal/clk_tree_ll.h"
#include "hal/regi2c_ctrl_ll.h"
/* Eight working values shared by every benchmark: timeit() seeds them with 1,
 * each TEST-generated function loads them into locals and writes them back. */
static double tv[8];
/* Total operation count per benchmark run; timeit() divides it by the elapsed
 * microseconds to report MOP/S, and the TEST loop runs N/16 iterations. */
const int N = 3200000;
/* Fallback definition in case the included headers do not provide it.
 * 0xb02 is the RISC-V machine-mode "instructions retired" counter CSR (minstret). */
#ifndef CSR_MINSTRET
#define CSR_MINSTRET 0xb02
#endif
/*
 * TEST(type, name, ops) defines a benchmark function `void name(void)`.
 *
 *   type - arithmetic type of the eight working variables f0..f7
 *   name - identifier of the generated function
 *   ops  - statement sequence executed in the loop body (addops, mulops, ...)
 *
 * The generated function copies tv[0..7] into locals f0..f7 (converting from
 * double to `type`), executes `ops` N/16 times, then stores f0..f7 back into
 * tv[]. The ops macros defined in this file each expand to 16 arithmetic
 * operations, so one call performs N operations in total.
 * NOTE(review): the write-back to tv[] presumably keeps the compiler from
 * discarding the loop as dead code - confirm against the generated assembly.
 * The function is tagged IRAM_ATTR (from esp_attr.h).
 */
#define TEST(type,name,ops) void IRAM_ATTR name (void) {\
type f0 = tv[0],f1 = tv[1],f2 = tv[2],f3 = tv[3];\
type f4 = tv[4],f5 = tv[5],f6 = tv[6],f7 = tv[7];\
for (uint32_t j = N/16; j > 0; j--) {\
ops \
}\
tv[0] = f0;tv[1] = f1;tv[2] = f2;tv[3] = f3;\
tv[4] = f4;tv[5] = f5;tv[6] = f6;tv[7] = f7;\
}
/*
 * fops(op1, op2) expands to eight statements of the form
 *     fA op1= fB op2 fC;
 * rotating through f0..f7 so every variable is written once per expansion.
 * `op1##=` pastes op1 onto `=`: an empty op1 yields plain assignment
 * (f0 = f1 + f2), while `+` yields a compound assignment (f0 += f1 * f2).
 */
#define fops(op1,op2) f0 op1##=f1 op2 f2;f1 op1##=f2 op2 f3;\
f2 op1##=f3 op2 f4;f3 op1##=f4 op2 f5;\
f4 op1##=f5 op2 f6;f5 op1##=f6 op2 f7;\
f6 op1##=f7 op2 f0;f7 op1##=f0 op2 f1;
/* Loop bodies for TEST. The single-operator bodies expand fops twice
 * (2 x 8 = 16 operations per iteration); muladdops expands it once, where each
 * of the 8 statements is a multiply plus an accumulate (also 16 operations). */
#define addops fops(,+) fops(,+)
#define divops fops(,/) fops(,/)
#define mulops fops(,*) fops(,*)
#define muladdops fops(+,*)
/*
 * Benchmark functions, one per (type, operation) pair. Each expands to
 * `void <name>(void)` via the TEST macro and is passed to timeit() by name.
 *
 * addint and muladdint use `unsigned` rather than `int`: starting from 1, the
 * add / multiply-accumulate chains grow past INT_MAX within the first few
 * dozen of the N/16 iterations. Signed overflow is undefined behaviour in C,
 * whereas unsigned arithmetic is defined to wrap. The integer multiply and
 * divide tests stay at 1 (1*1, 1/1) and therefore keep the signed type.
 */
TEST(int,mulint,mulops)
TEST(float,mulfloat,mulops)
TEST(double,muldouble,mulops)
TEST(unsigned,addint,addops)
TEST(float,addfloat,addops)
TEST(double,adddouble,addops)
TEST(int,divint,divops)
TEST(float,divfloat,divops)
TEST(double,divdouble,divops)
TEST(unsigned,muladdint,muladdops)
TEST(float,muladdfloat,muladdops)
TEST(double,muladddouble,muladdops)
/*
 * Run one benchmark function and print its throughput and cycles per
 * instruction.
 *
 *   name - label printed at the start of the result line (not modified)
 *   fn   - benchmark body to time; the MOP/S figure assumes it performs
 *          N operations in total
 *
 * Prints one line: "<name> \t <rate> MOP/S \tCPI=<cpi>".
 * If the elapsed time or the retired-instruction delta is zero, the
 * corresponding figure is reported as 0 instead of dividing by zero.
 */
void timeit(const char *name, void fn(void)) {
    /* Block for one tick before measuring.
     * NOTE(review): presumably so the run starts right after a tick boundary - confirm. */
    vTaskDelay(1);

    /* Every run starts from the same working values. */
    for (int i = 0; i < 8; i++) {
        tv[i] = 1;
    }

    /* esp_timer_get_time() reports the time since boot in microseconds. */
    uint64_t start_us = esp_timer_get_time();
    uint32_t cycles_before = rv_utils_get_cycle_count();
    uint32_t instrs_before = RV_READ_CSR(CSR_MINSTRET);

    fn();

    uint32_t cycles_after = rv_utils_get_cycle_count();
    uint32_t instrs_after = RV_READ_CSR(CSR_MINSTRET);
    uint64_t elapsed_us = esp_timer_get_time() - start_us;

    /* Unsigned subtraction still yields the correct delta if a 32-bit
     * counter wrapped once during the run. */
    uint32_t cycles = cycles_after - cycles_before;
    uint32_t instrs = instrs_after - instrs_before;

    float cpi = (instrs != 0) ? (float)cycles / instrs : 0.0f;
    /* N operations divided by microseconds equals millions of operations per second. */
    float mops = (elapsed_us != 0) ? (float)N / elapsed_us : 0.0f;

    printf ("%s \t %f MOP/S \tCPI=%f\n", name, mops, cpi);
}
void app_main() {
/* Set the ESP32-P4 clock to 400mhz at your own risk */
_regi2c_ctrl_ll_master_enable_clock(true);
clk_ll_cpll_set_config(400, 40);
timeit("Integer Addition",addint);
timeit("Integer Multiply",mulint);
timeit("Integer Division",divint);
timeit("Integer Multiply-Add",muladdint);
timeit("Float Addition ", addfloat);
timeit("Float Multiply ", mulfloat);
timeit("Float Division ", divfloat);
timeit("Float Multiply-Add", muladdfloat);
timeit("Double Addition", adddouble);
timeit("Double Multiply", muldouble);
timeit("Double Division", divdouble);
timeit("Double Multiply-Add", muladddouble);
BaseType_t stack_remaining = uxTaskGetStackHighWaterMark(NULL);
printf("%d stack remaining", stack_remaining);
}
Code: Select all
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Memory Type/Section ┃ Used [bytes] ┃ Used [%] ┃ Remain [bytes] ┃ Total [bytes] ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ Flash │ 144310 │ │ │ │
│ .text │ 101270 │ │ │ │
│ .rodata │ 42556 │ │ │ │
│ .appdesc │ 256 │ │ │ │
│ .init_array │ 228 │ │ │ │
│ DIRAM │ 76003 │ 13.18 │ 500461 │ 576464 │
│ .text │ 62734 │ 10.88 │ │ │
│ .data │ 7265 │ 1.26 │ │ │
│ .bss │ 6004 │ 1.04 │ │ │
│ HP core RAM │ 66 │ 0.81 │ 8126 │ 8192 │
│ .data │ 56 │ 0.68 │ │ │
│ .text │ 10 │ 0.12 │ │ │
│ LP RAM │ 24 │ 0.07 │ 32744 │ 32768 │
│ .rtc_reserved │ 24 │ 0.07 │ │ │
└─────────────────────┴──────────────┴──────────┴────────────────┴───────────────┘
Total image size: 214147 bytes (.bin may be padded larger)
Code: Select all
Integer Addition 355.318665 MOP/S CPI=1.000510
Integer Multiply 355.239777 MOP/S CPI=1.000500
Integer Division 355.279236 MOP/S CPI=1.000178
Integer Multiply-Add 355.437073 MOP/S CPI=1.000152
Float Addition 354.021454 MOP/S CPI=1.003911
Float Multiply 355.358124 MOP/S CPI=1.000268
Float Division 99.925056 MOP/S CPI=3.554537
Float Multiply-Add 639.360657 MOP/S CPI=1.000451
Double Addition 12.666194 MOP/S CPI=1.180594
Double Multiply 4.638723 MOP/S CPI=1.069303
Double Division 1.668304 MOP/S CPI=1.043895
Double Multiply-Add 10.660661 MOP/S CPI=1.167640
1988 stack remaining
-
MicroController
- Posts: 2661
- Joined: Mon Oct 17, 2022 7:38 pm
- Location: Europe, Germany
Re: ESP Performance Benchmarks
Sticking to the de-facto standard CoreMark would be a good idea. Not sure though how well it translates to interpreted languages (no function pointers).
Who is online
Users browsing this forum: PerplexityBot, PetalBot, Qwantbot, YisouSpider and 8 guests