ESP32-P4 Cache Preload API Test Results
Summary
Thank you MicroController for the suggestion to use the asynchronous pattern. I implemented your recommended approach and compared it against optimized CPU copy as a baseline. Unfortunately, the results confirm that Cache_Start_L2_Cache_Preload() does not perform actual PSRAM data transfer.
Test Setup
- Hardware: ESP32-P4
- ESP-IDF: v5.5
- Buffer Size: 128 KB in PSRAM
- Eviction Buffer: 256 KB (to ensure cold cache)
- Iterations: 10
- Cache Configuration: L2 256 KB, line size 128B
Test Code
main/app_main.cpp
Code: Select all
#include <stdio.h>
#include <stdint.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_heap_caps.h"
#include "esp_timer.h"
#include "rom/cache.h"
extern "C" {
void dl_esp32p4_memcpy(void *dst, const void *src, const size_t n);
}
#define BUFFER_SIZE (128 * 1024)
#define EVICT_SIZE (256 * 1024)
#define ALIGNMENT 128
#define NUM_ITERATIONS 10
#define OUTPUT_SIZE (8 * 1024)
static volatile uint64_t g_prevent_optimization = 0;
static void force_cache_eviction(volatile uint8_t *evict_buf)
{
volatile uint64_t sum = 0;
for (size_t i = 0; i < EVICT_SIZE; i += 64) {
sum += evict_buf[i];
}
g_prevent_optimization += sum;
asm volatile("fence" ::: "memory");
}
static void test_comparison()
{
char *output = (char *)heap_caps_malloc(OUTPUT_SIZE, MALLOC_CAP_INTERNAL);
if (!output) {
printf("ERROR: Cannot allocate output buffer\n");
return;
}
int pos = 0;
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"\n========================================\n"
"ESP32-P4 Cache Preload vs CPU Copy Test\n"
"========================================\n"
"Buffer: 128 KB PSRAM\n"
"Iterations: %d\n"
"PSRAM theoretical max: ~400 MB/s\n\n", NUM_ITERATIONS);
volatile uint8_t *psram_src = (volatile uint8_t *)heap_caps_aligned_alloc(
ALIGNMENT, BUFFER_SIZE, MALLOC_CAP_SPIRAM);
uint8_t *internal_dst = (uint8_t *)heap_caps_aligned_alloc(
ALIGNMENT, BUFFER_SIZE, MALLOC_CAP_INTERNAL);
volatile uint8_t *evict_buf = (volatile uint8_t *)heap_caps_aligned_alloc(
ALIGNMENT, EVICT_SIZE, MALLOC_CAP_SPIRAM);
if (!psram_src || !internal_dst || !evict_buf) {
pos += snprintf(output + pos, OUTPUT_SIZE - pos, "ERROR: Allocation failed\n");
printf("%s", output);
heap_caps_free(output);
return;
}
for (size_t i = 0; i < BUFFER_SIZE; i++) {
psram_src[i] = (uint8_t)(i & 0xFF);
}
for (size_t i = 0; i < EVICT_SIZE; i++) {
evict_buf[i] = (uint8_t)(i & 0xFF);
}
// TEST 1: Hardware Preload API (following async pattern)
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"TEST 1: Hardware Preload API\n"
"=============================\n");
uint64_t total_preload_us = 0;
uint32_t total_polls = 0;
for (int i = 0; i < NUM_ITERATIONS; i++) {
force_cache_eviction(evict_buf);
Cache_Invalidate_Addr(CACHE_MAP_L2_CACHE, (uint32_t)psram_src, BUFFER_SIZE);
vTaskDelay(pdMS_TO_TICKS(10));
uint32_t restore = 0;
int64_t start = esp_timer_get_time();
if (Cache_L2_Cache_Preload_Done()) {
restore = Cache_Start_L2_Cache_Preload((uint32_t)psram_src, BUFFER_SIZE, 0);
}
uint32_t polls = 0;
while (!Cache_L2_Cache_Preload_Done()) {
polls++;
}
int64_t end = esp_timer_get_time();
if (restore) {
Cache_End_L2_Cache_Preload(restore);
}
volatile uint64_t check = 0;
for (size_t j = 0; j < BUFFER_SIZE; j += 64) {
check += psram_src[j];
}
g_prevent_optimization += check;
total_preload_us += (end - start);
total_polls += polls;
}
float avg_preload_us = (float)total_preload_us / NUM_ITERATIONS;
float preload_bw = (BUFFER_SIZE / (1024.0f * 1024.0f)) / (avg_preload_us / 1e6f);
float avg_polls = (float)total_polls / NUM_ITERATIONS;
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"Average time: %.1f us\n"
"Average polls: %.0f\n"
"Bandwidth: %.1f MB/s\n\n",
avg_preload_us, avg_polls, preload_bw);
// TEST 2: Optimized CPU Copy (baseline)
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"TEST 2: Optimized CPU Copy (PSRAM→RAM)\n"
"=======================================\n");
uint64_t total_cpu_us = 0;
for (int i = 0; i < NUM_ITERATIONS; i++) {
force_cache_eviction(evict_buf);
Cache_Invalidate_Addr(CACHE_MAP_L2_CACHE, (uint32_t)psram_src, BUFFER_SIZE);
vTaskDelay(pdMS_TO_TICKS(10));
asm volatile("fence" ::: "memory");
int64_t start = esp_timer_get_time();
dl_esp32p4_memcpy(internal_dst, (const void *)psram_src, BUFFER_SIZE);
asm volatile("fence" ::: "memory");
int64_t end = esp_timer_get_time();
volatile uint64_t check = 0;
for (size_t j = 0; j < BUFFER_SIZE; j += 64) {
check += internal_dst[j];
}
g_prevent_optimization += check;
total_cpu_us += (end - start);
}
float avg_cpu_us = (float)total_cpu_us / NUM_ITERATIONS;
float cpu_bw = (BUFFER_SIZE / (1024.0f * 1024.0f)) / (avg_cpu_us / 1e6f);
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"Average time: %.1f us\n"
"Bandwidth: %.1f MB/s\n\n",
avg_cpu_us, cpu_bw);
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"COMPARISON\n"
"==========\n"
"Expected for real PSRAM preload:\n"
" - Time: 320-640 us\n"
" - Bandwidth: 200-400 MB/s\n"
" - Polls: 1000+ (if hardware fetching)\n\n"
"Actual results:\n"
" Hardware preload: %.1f us, %.1f MB/s, %.0f polls\n"
" Optimized CPU: %.1f us, %.1f MB/s\n\n",
avg_preload_us, preload_bw, avg_polls,
avg_cpu_us, cpu_bw);
if (avg_preload_us < 100.0f && avg_polls < 10.0f) {
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"CONCLUSION:\n"
"-----------\n"
"Hardware preload completes instantly without\n"
"fetching from PSRAM.\n\n"
"Optimized CPU copy shows real PSRAM bandwidth\n"
"of %.0f MB/s, which matches hardware specs.\n",
cpu_bw);
} else {
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"CONCLUSION:\n"
"-----------\n"
"Hardware preload is working correctly.\n");
}
pos += snprintf(output + pos, OUTPUT_SIZE - pos,
"\nPrevent optimization: %llu\n"
"========================================\n",
g_prevent_optimization);
printf("%s", output);
heap_caps_free((void *)psram_src);
heap_caps_free(internal_dst);
heap_caps_free((void *)evict_buf);
heap_caps_free(output);
}
extern "C" void app_main(void)
{
vTaskDelay(pdMS_TO_TICKS(100));
test_comparison();
}
main/dl_esp32p4_memcpy.S
Optimized RISC-V assembly using ESP32-P4 SIMD extensions (from ESP-DL library):
Code: Select all
.text
.align 2
.global dl_esp32p4_memcpy
.type dl_esp32p4_memcpy, @function
.balign 4
.option norvc
dl_esp32p4_memcpy:
esp.ld.128.usar.ip q0, a1, 0
esp.movx.r.sar.bytes t6
esp.ld.128.usar.ip q1, a0, 0
esp.movx.r.sar.bytes t4
li a6, 16
sub a6, a6, t4
li t0, 16
beq a6, t0, 13f
blt a6, a2, dl_esp32p4_memcpy_done_min
mv a6, a2
dl_esp32p4_memcpy_done_min:
srli t6, a6, 2
slli a7, t6, 2
sub a7, a6, a7
mv t0, t6
blez t0, 10f
9:
lw a3, 0(a1)
addi a1, a1, 4
sw a3, 0(a0)
addi a0, a0, 4
addi t0, t0, -1
bgtz t0, 9b
10:
mv t0, a7
blez t0, 12f
11:
lbu a3, 0(a1)
addi a1, a1, 1
sb a3, 0(a0)
addi a0, a0, 1
addi t0, t0, -1
bgtz t0, 11b
12:
sub a2, a2, a6
esp.ld.128.usar.ip q0, a1, 0
esp.movx.r.sar.bytes t6
13:
beqz t6, 1f
srli a3, a2, 4
slli a4, a3, 4
sub a4, a2, a4
srli a5, a3, 1
slli t3, a5, 1
sub t3, a3, t3
srli t4, a4, 2
slli t5, t4, 2
sub t5, a4, t5
mv t0, a5
blez t0, 15f
14:
esp.ld.128.usar.ip q0, a1, 16
esp.ld.128.usar.ip q1, a1, 16
esp.ld.128.usar.ip q2, a1, 0
esp.src.q q0, q0, q1
esp.src.q q1, q1, q2
esp.vst.128.ip q0, a0, 16
esp.vst.128.ip q1, a0, 16
addi t0, t0, -1
bgtz t0, 14b
15:
beqz t3, 4f
esp.ld.128.usar.ip q0, a1, 16
esp.ld.128.usar.ip q1, a1, 0
esp.src.q q0, q0, q1
esp.vst.128.ip q0, a0, 16
bnez t3, 4f
1:
srli a3, a2, 4
slli a4, a3, 4
sub a4, a2, a4
srli a5, a3, 1
slli t3, a5, 1
sub t3, a3, t3
srli t4, a4, 2
slli t5, t4, 2
sub t5, a4, t5
mv t0, a5
blez t0, 3f
2:
esp.vld.128.ip q0, a1, 16
esp.vld.128.ip q1, a1, 16
esp.vst.128.ip q0, a0, 16
esp.vst.128.ip q1, a0, 16
addi t0, t0, -1
bgtz t0, 2b
3:
beqz t3, 4f
esp.vld.128.ip q0, a1, 16
esp.vst.128.ip q0, a0, 16
4:
mv t0, t4
blez t0, 6f
5:
lw a3, 0(a1)
addi a1, a1, 4
sw a3, 0(a0)
addi a0, a0, 4
addi t0, t0, -1
bgtz t0, 5b
6:
mv t0, t5
blez t0, 8f
7:
lbu a3, 0(a1)
addi a1, a1, 1
sb a3, 0(a0)
addi a0, a0, 1
addi t0, t0, -1
bgtz t0, 7b
8:
ret
main/CMakeLists.txt
Code: Select all
set(srcs "app_main.cpp"
"dl_esp32p4_memcpy.S")
idf_component_register(
SRCS ${srcs}
REQUIRES esp_timer
)
Test Results
Code: Select all
========================================
ESP32-P4 Cache Preload vs CPU Copy Test
========================================
Buffer: 128 KB PSRAM
Iterations: 10
PSRAM theoretical max: ~400 MB/s
TEST 1: Hardware Preload API
=============================
Average time: 1.5 us
Average polls: 0
Bandwidth: 83333.3 MB/s
TEST 2: Optimized CPU Copy (PSRAM→RAM)
=======================================
Average time: 669.2 us
Bandwidth: 186.8 MB/s
COMPARISON
==========
Expected for real PSRAM preload:
- Time: 320-640 us
- Bandwidth: 200-400 MB/s
- Polls: 1000+ (if hardware fetching)
Actual results:
Hardware preload: 1.5 us, 83333.3 MB/s, 0 polls
Optimized CPU: 669.2 us, 186.8 MB/s
CONCLUSION:
-----------
Hardware preload completes instantly without
fetching from PSRAM.
Optimized CPU copy shows real PSRAM bandwidth
of 187 MB/s, which matches hardware specs.
Analysis
Key Findings:
- Hardware Preload API:
- Completes in 1.5 μs for 128 KB
- Shows 0 polls (immediate completion)
- Reports 83,333 MB/s (physically impossible)
- 446× faster than actual PSRAM speed
- Optimized CPU Copy:
- Takes 669 μs for 128 KB
- Achieves 187 MB/s bandwidth
- Matches expected PSRAM @ 200 MHz performance
- 47% efficiency (typical for real hardware)
Why This Proves the API Doesn't Work:
For a real PSRAM transfer at 200 MHz (Octal SPI):
- Theoretical max: 8 lines × 200 MHz × 2 (DDR) = 400 MB/s
- Real-world: ~180-200 MB/s (accounting for protocol overhead)
- Expected time for 128 KB: 640-700 μs
The hardware preload reports:
- Time: 1.5 μs (446× faster than physical limit)
- Polls: 0 (no wait for hardware operation)
- Bandwidth: 83 GB/s (208× faster than theoretical maximum)
This is
physically impossible for PSRAM data transfer.
Thread Safety Note:
The cache.h header states:
/**
* Please do not call this function in your SDK application.
*/
uint32_t Cache_Start_L2_Cache_Preload(...);
Combined with the test results, this suggests the API is:
- Intended for ROM bootloader internal use only
- Not designed for application-level PSRAM bandwidth optimization
- Possibly just manipulates cache metadata without DMA
Questions for Espressif:
- Is Cache_Start_L2_Cache_Preload() intended for application use on ESP32-P4?
- If yes, what is the correct usage pattern to achieve actual PSRAM preloading?
- If no, what is the recommended approach for optimizing PSRAM access patterns?
- Should ESP-DL (Espressif's ML framework) implement P4 preload, as it does for S3?
Workaround
For now, software cache warming provides predictable results:
Code: Select all
void software_preload(volatile uint8_t *addr, size_t size) {
volatile uint64_t sum = 0;
for (size_t i = 0; i < size; i += 64) { // 64B cache line
sum += addr[i];
}
asm volatile("" : "+r"(sum) :: "memory");
}
This achieves ~187 MB/s (matching hardware limits) and loads data into L1/L2 cache predictably.
---
Thank you for your help investigating this issue. I hope Espressif can clarify the intended usage of these ROM functions.