ESP Performance Benchmarks

gabrielt
Posts: 5
Joined: Tue Jun 24, 2025 4:09 pm

ESP Performance Benchmarks

Postby gabrielt » Sun Aug 24, 2025 3:17 pm

I’m experimenting with different ways to program an ESP, including native code, MicroPython, Lua on RTOS, JavaScript, and WebAssembly. I want to compare these options in terms of execution time, CPU load, memory usage, and storage footprint.

What’s the best way to set up practical benchmarks across these environments so the results are fair and comparable? Is there an ideal approach or toolchain for measuring these metrics reliably on the ESP?

User avatar
thefury
Posts: 38
Joined: Thu Sep 05, 2019 5:25 pm

Re: ESP Performance Benchmarks

Postby thefury » Tue Aug 26, 2025 2:53 pm

Below you'll find the benchmark I use for the ESP32-P4 in C, along with my results at 400 MHz. Maybe this will give you some inspiration for where to start measuring.

As for memory footprint, probably the best thing to do is to ask the RTOS for the task/thread stack high-water mark; this only captures your memory use as long as all your variables are local (i.e. live on the stack). I went ahead and put an example of this at the end of the test.

Code: Select all

/* Original thread: https://esp32.com/viewtopic.php?p=136103#p136103 */
#include <stdio.h>
#include <esp_attr.h>
#include <esp_timer.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include "riscv/rv_utils.h"
#include "hal/clk_tree_ll.h"
#include "hal/regi2c_ctrl_ll.h"

/* Operand/result storage shared by timeit() and the benchmark kernels:
   timeit() sets every element to 1 before a run, each kernel loads the eight
   values into locals and writes the final values back when it finishes. */
static double tv[8];
/* Number of operations used for the MOP/S figure; each kernel loops N/16
   times. */
const int N = 3200000;

/* Fallback definition for toolchains/headers that do not provide it.
   0xb02 is the address of the machine-mode retired-instruction counter
   (minstret) in the RISC-V privileged specification; timeit() reads it to
   count executed instructions. */
#ifndef CSR_MINSTRET
#define CSR_MINSTRET 0xb02
#endif

/*
 * TEST(type, name, ops) defines a benchmark kernel `void name(void)`.
 *
 *   type  arithmetic type of the eight working variables f0..f7
 *   name  name of the generated function
 *   ops   statements executed in every loop iteration (see fops and the
 *         *ops macros); they operate on f0..f7
 *
 * The kernel loads tv[0..7] into f0..f7 (converting from double to `type`),
 * runs `ops` N/16 times, then stores f0..f7 back into tv[]. The write-back
 * presumably also keeps the results observable so the loop is not optimized
 * away. IRAM_ATTR comes from esp_attr.h (ESP-IDF attribute for placing the
 * function in internal RAM).
 */
#define TEST(type,name,ops) void IRAM_ATTR name (void) {\
    type f0 = tv[0],f1 = tv[1],f2 = tv[2],f3 = tv[3];\
    type f4 = tv[4],f5 = tv[5],f6 = tv[6],f7 = tv[7];\
    for (uint32_t j = N/16; j > 0; j--) {\
        ops \
    }\
    tv[0] = f0;tv[1] = f1;tv[2] = f2;tv[3] = f3;\
    tv[4] = f4;tv[5] = f5;tv[6] = f6;tv[7] = f7;\
    }
    
/*
 * fops(op1, op2) expands to eight statements of the form
 *     f[i] op1= f[i+1] op2 f[i+2];      (indices taken modulo 8)
 * With an empty op1 each statement is a plain assignment (f0 = f1 op2 f2);
 * with op1 = + it becomes a compound assignment (f0 += f1 op2 f2).
 * The last two statements read f0 and f1, which were already updated earlier
 * in the same expansion.
 */
#define fops(op1,op2) f0 op1##=f1 op2 f2;f1 op1##=f2 op2 f3;\
    f2 op1##=f3 op2 f4;f3 op1##=f4 op2 f5;\
    f4 op1##=f5 op2 f6;f5 op1##=f6 op2 f7;\
    f6 op1##=f7 op2 f0;f7 op1##=f0 op2 f1;

/* Loop bodies passed to TEST. Each one amounts to 16 arithmetic operations
   per iteration, matching the N/16 iteration count in TEST:
   - addops/divops/mulops expand fops twice: 2 x 8 statements, one operation
     each;
   - muladdops expands fops once: 8 statements, each a multiply plus an add. */
#define addops fops(,+) fops(,+)
#define divops fops(,/) fops(,/)
#define mulops fops(,*) fops(,*)
#define muladdops fops(+,*)

/*
 * Instantiate one benchmark kernel per (type, operation) pair.
 *
 * The integer add and multiply-add kernels use unsigned int rather than int:
 * starting from operands of 1 their values grow on every iteration and would
 * overflow a signed int, which is undefined behaviour in C. Unsigned
 * arithmetic wraps around instead, which is well defined.
 *
 * The integer multiply and divide kernels keep int: with all operands starting
 * at 1 (as set by timeit()), 1*1 and 1/1 stay at 1, so they cannot overflow or
 * divide by zero.
 */
TEST(int,mulint,mulops)
TEST(float,mulfloat,mulops)
TEST(double,muldouble,mulops)
TEST(unsigned int,addint,addops)
TEST(float,addfloat,addops)
TEST(double,adddouble,addops)
TEST(int,divint,divops)
TEST(float,divfloat,divops)
TEST(double,divdouble,divops)
TEST(unsigned int,muladdint,muladdops)
TEST(float,muladdfloat,muladdops)
TEST(double,muladddouble,muladdops)

/**
 * Run one benchmark kernel and print its throughput and cycles per
 * instruction (CPI).
 *
 * Resets tv[0..7] to 1, calls fn() once, and prints one line containing the
 * label, N divided by the elapsed microseconds (i.e. millions of operations
 * per second, assuming fn performs N operations), and the ratio of elapsed
 * CPU cycles to retired instructions.
 *
 * @param name  Label printed at the start of the result line.
 * @param fn    Kernel to measure.
 */
void timeit(const char *name, void fn(void)) {
    /* Block for one tick before measuring (presumably so the run starts just
       after a tick boundary and other tasks get a chance to run first). */
    vTaskDelay(1);
    tv[0]=tv[1]=tv[2]=tv[3]=tv[4]=tv[5]=tv[6]=tv[7]=1;

    /* Wall-clock reference: time since boot in microseconds. */
    uint64_t start_us = esp_timer_get_time();
    unsigned ccount = rv_utils_get_cycle_count();
    unsigned icount = RV_READ_CSR(CSR_MINSTRET);
    fn();
    unsigned ccount_new = rv_utils_get_cycle_count();
    unsigned icount_new = RV_READ_CSR(CSR_MINSTRET);
    uint64_t elapsed_us = esp_timer_get_time() - start_us;

    /* Unsigned subtraction gives the right delta even if a counter wrapped
       once during the run. */
    unsigned cycles = ccount_new - ccount;
    unsigned instrs = icount_new - icount;

    /* Guard the denominators so a degenerate run prints 0 instead of
       inf/NaN. */
    float cpi = instrs ? (float)cycles / (float)instrs : 0.0f;
    float mops = elapsed_us ? (float)N / (float)elapsed_us : 0.0f;
    printf ("%s \t %f MOP/S   \tCPI=%f\n", name, mops, cpi);
}

void app_main() {
    /* Set the ESP32-P4 clock to 400mhz at your own risk */
    _regi2c_ctrl_ll_master_enable_clock(true);
    clk_ll_cpll_set_config(400, 40);

    timeit("Integer Addition",addint);
    timeit("Integer Multiply",mulint);
    timeit("Integer Division",divint);
    timeit("Integer Multiply-Add",muladdint);

    timeit("Float Addition ", addfloat);
    timeit("Float Multiply ", mulfloat);
    timeit("Float Division ", divfloat);
    timeit("Float Multiply-Add", muladdfloat);

    timeit("Double Addition", adddouble);
    timeit("Double Multiply", muldouble);
    timeit("Double Division", divdouble);
    timeit("Double Multiply-Add", muladddouble);
    
    BaseType_t stack_remaining = uxTaskGetStackHighWaterMark(NULL);
    printf("%d stack remaining", stack_remaining);
}

Code: Select all

┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Memory Type/Section ┃ Used [bytes] ┃ Used [%] ┃ Remain [bytes] ┃ Total [bytes] ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ Flash               │       144310 │          │                │               │
│    .text            │       101270 │          │                │               │
│    .rodata          │        42556 │          │                │               │
│    .appdesc         │          256 │          │                │               │
│    .init_array      │          228 │          │                │               │
│ DIRAM               │        76003 │    13.18 │         500461 │        576464 │
│    .text            │        62734 │    10.88 │                │               │
│    .data            │         7265 │     1.26 │                │               │
│    .bss             │         6004 │     1.04 │                │               │
│ HP core RAM         │           66 │     0.81 │           8126 │          8192 │
│    .data            │           56 │     0.68 │                │               │
│    .text            │           10 │     0.12 │                │               │
│ LP RAM              │           24 │     0.07 │          32744 │         32768 │
│    .rtc_reserved    │           24 │     0.07 │                │               │
└─────────────────────┴──────────────┴──────────┴────────────────┴───────────────┘
Total image size: 214147 bytes (.bin may be padded larger)

Code: Select all

Integer Addition 	 355.318665 MOP/S   	CPI=1.000510
Integer Multiply 	 355.239777 MOP/S   	CPI=1.000500
Integer Division 	 355.279236 MOP/S   	CPI=1.000178
Integer Multiply-Add 	 355.437073 MOP/S   	CPI=1.000152
Float Addition  	 354.021454 MOP/S   	CPI=1.003911
Float Multiply  	 355.358124 MOP/S   	CPI=1.000268
Float Division  	 99.925056 MOP/S   	CPI=3.554537
Float Multiply-Add 	 639.360657 MOP/S   	CPI=1.000451
Double Addition 	 12.666194 MOP/S   	CPI=1.180594
Double Multiply 	 4.638723 MOP/S   	CPI=1.069303
Double Division 	 1.668304 MOP/S   	CPI=1.043895
Double Multiply-Add 	 10.660661 MOP/S   	CPI=1.167640
1988 stack remaining
(3584 main task stack size = 1596 bytes stack used)

MicroController
Posts: 2661
Joined: Mon Oct 17, 2022 7:38 pm
Location: Europe, Germany

Re: ESP Performance Benchmarks

Postby MicroController » Tue Aug 26, 2025 7:44 pm

Sticking to the de-facto standard CoreMark would be a good idea. Not sure though how well it translates to interpreted languages (no function pointers).

Who is online

Users browsing this forum: Applebot, ChatGPT-User, PerplexityBot and 11 guests