How to configure raw_opus_decoder to play audio from a simple Python server?
Posted: Sat May 10, 2025 6:45 am
The server loads a WAV file, encodes it into raw Opus frames with opuslib_next.Encoder, and sends the frames to the ESP32, which should decode and play them with raw_opus_decoder.
Problem: Whether or not I add a big-endian length prefix on the server side, and however I set the parameters on the ESP32, no audio is played.
Details:
- device:esp32s3
- idf: 5.3.2
- adf:2.4
What I've checked:
1. The speaker can play audio in another example: https://github.com/espressif/esp-adf/bl ... /README.md
2. The ESP32 has received the Opus frames.
3. Whether or not the server adds the big-endian length prefix, and whether opus_cfg.enable_frame_length_prefix on the ESP32 side is true or false, the device does not play the audio.
4. In another test, Opus encoded by the esp-adf raw_opus_encoder could be decoded with opuslib_next.Decoder on the server side, so I believe raw_opus_decoder should be able to decode Opus frames created by opuslib_next.Encoder.
Thanks a lot!
Essential codes:
opus_play_server.py:
esp32:
Full codes:
opus_play_server.py:
esp32:
Problem: Whether or not I add a big-endian length prefix on the server side, and however I set the parameters on the ESP32, no audio is played.
Details:
- device:esp32s3
- idf: 5.3.2
- adf:2.4
What I've checked:
1. The speaker can play audio in another example: https://github.com/espressif/esp-adf/bl ... /README.md
2. The ESP32 has received the Opus frames.
3. Whether or not the server adds the big-endian length prefix, and whether opus_cfg.enable_frame_length_prefix on the ESP32 side is true or false, the device does not play the audio.
4. In another test, Opus encoded by the esp-adf raw_opus_encoder could be decoded with opuslib_next.Decoder on the server side, so I believe raw_opus_decoder should be able to decode Opus frames created by opuslib_next.Encoder.
Thanks a lot!
Essential codes:
opus_play_server.py:
Code: Select all
def encode_opus(pcm_data):
"""encode pcm data into raw opus frames"""
encoder = opuslib_next.Encoder(SAMPLE_RATE, CHANNELS, APPLICATION)
# bytes to int16
samples = np.frombuffer(pcm_data, dtype=np.int16)
# segment data according to FRAME_SIZE
frames = [samples[i:i+FRAME_SIZE] for i in range(0, len(samples), FRAME_SIZE)]
# encode pcm frames into opus
encoded_frames = []
for frame in frames:
# zero padding for the last frame
if len(frame) < FRAME_SIZE:
frame = np.pad(frame, (0, FRAME_SIZE - len(frame)), 'constant')
encoded = encoder.encode(frame.tobytes(), FRAME_SIZE)
frame_length = len(encoded)
# add big endian length prefix, 2 bytes
# length_prefix = struct.pack('>H', frame_length)
# length_prefix = frame_length.to_bytes(2, byteorder='big')
# encoded_frames.append(length_prefix + encoded)
return encoded_frames
Code: Select all
// RAW_OPUS_DECODER with length prefix
raw_opus_dec_cfg_t opus_cfg = RAW_OPUS_DEC_CONFIG_DEFAULT();
opus_cfg.sample_rate = SAMPLE_RATE;
opus_cfg.channels = CHANNELS;
opus_cfg.dec_frame_size = 960; // 60ms @ 16kHz = 960 samples
opus_cfg.enable_frame_length_prefix = true; // with length prefix
opus_decoder = raw_opus_decoder_init(&opus_cfg);
Full codes:
opus_play_server.py:
Code: Select all
import os
import random
import wave
import numpy as np
import opuslib_next
import time
import struct
import asyncio
import websockets
import logging
import json
from threading import Thread
# Configure logging once at import time; every record carries a timestamp,
# the logger name and the level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("opus_server")

# Directory scanned for source audio.
WAV_DIR = 'recordings'
# Presumably the listen address/port of the WebSocket server; the code that
# consumes them is outside this listing.
HOST = '0.0.0.0'
PORT = 8000

# Audio format shared by the WAV validation and the Opus encoder.
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_SIZE = 960  # samples per Opus frame: 60ms @ 16kHz, 16000 * (60 / 1000) = 960
APPLICATION = opuslib_next.APPLICATION_VOIP

# Only files whose name ends in '16000hz.wav' are offered to clients.
# A missing directory must not crash the import: handle_client already reports
# an empty list to the client, so fall back to "no files" and log the reason.
if os.path.isdir(WAV_DIR):
    wav_files = [f for f in os.listdir(WAV_DIR) if f.endswith('16000hz.wav')]
else:
    logger.warning("wav directory not found: %s", WAV_DIR)
    wav_files = []
def read_wav_file(file_path, expected_rate=None, expected_channels=None):
    """Load a .wav file and return its raw PCM bytes.

    Args:
        file_path: Path of the WAV file to read.
        expected_rate: Required sample rate in Hz. Defaults to SAMPLE_RATE.
        expected_channels: Required channel count. Defaults to CHANNELS.

    Returns:
        Every audio frame of the file as one bytes object of 16-bit PCM.

    Raises:
        ValueError: If the sample rate, channel count or sample width of the
            file does not match what is expected.
    """
    # Resolve the defaults lazily so explicit arguments never touch the
    # module-level configuration.
    if expected_rate is None:
        expected_rate = SAMPLE_RATE
    if expected_channels is None:
        expected_channels = CHANNELS

    with wave.open(file_path, 'rb') as wav_file:
        # check sample rate and channels
        if wav_file.getframerate() != expected_rate:
            raise ValueError(f"sample rate doesn't match, expected: {expected_rate}Hz")
        if wav_file.getnchannels() != expected_channels:
            raise ValueError(f"channels doesn't match, expected: {expected_channels}")
        # encode_opus reinterprets these bytes as int16, so any other sample
        # width would be encoded as noise instead of failing loudly.
        sample_width = wav_file.getsampwidth()
        if sample_width != 2:
            raise ValueError(
                f"sample width doesn't match, expected: 2 bytes, got: {sample_width}"
            )
        # read every frame of the file
        return wav_file.readframes(wav_file.getnframes())
def encode_opus(pcm_data, with_length_prefix=True):
    """Encode 16-bit PCM into a list of raw Opus packets.

    The PCM is cut into frames of FRAME_SIZE samples per channel; the last
    frame is zero-padded to full length so the encoder always receives a
    complete frame.

    Args:
        pcm_data: Interleaved int16 PCM bytes at SAMPLE_RATE / CHANNELS.
        with_length_prefix: When true, each packet is preceded by its length
            as a 2-byte big-endian integer. Keep this in sync with
            ``enable_frame_length_prefix`` on the decoder side.

    Returns:
        A list of bytes objects, one per encoded frame (empty for empty input).

    Raises:
        struct.error: If a packet is too long for the 2-byte length prefix.
    """
    encoder = opuslib_next.Encoder(SAMPLE_RATE, CHANNELS, APPLICATION)
    # bytes to int16
    samples = np.frombuffer(pcm_data, dtype=np.int16)
    # One frame holds FRAME_SIZE samples for every channel (interleaved).
    samples_per_frame = FRAME_SIZE * CHANNELS

    encoded_frames = []
    for start in range(0, len(samples), samples_per_frame):
        frame = samples[start:start + samples_per_frame]
        # zero padding for the last frame
        if len(frame) < samples_per_frame:
            frame = np.pad(frame, (0, samples_per_frame - len(frame)), 'constant')
        encoded = encoder.encode(frame.tobytes(), FRAME_SIZE)
        if with_length_prefix:
            # big endian length prefix, 2 bytes
            encoded = struct.pack('>H', len(encoded)) + encoded
        # Previously the only append was commented out, so the function
        # always returned an empty list and nothing was ever sent.
        encoded_frames.append(encoded)
    return encoded_frames
async def handle_client(websocket):
"""handle WebSocket client connections"""
client_ip = websocket.remote_address[0]
logger.info(f"client connected: {client_ip}")
try:
# send hello
await websocket.send(json.dumps({
"type": "hello",
"status": "ok",
"sample_rate": SAMPLE_RATE,
"channels": CHANNELS
}))
while True:
# get local wav files
if not wav_files:
logger.warning("no wav files are found")
await websocket.send(json.dumps({
"type": "error",
"message": "no available wav files"
}))
await asyncio.sleep(5)
continue
# pick a wav file
wav_file = random.choice(wav_files)
wav_path = os.path.join(WAV_DIR, wav_file)
logger.info(f"sending file: {wav_file}")
await websocket.send(json.dumps({
"type": "file_start",
"filename": wav_file
}))
try:
# load and encode
pcm_data = read_wav_file(wav_path)
encoded_frames = encode_opus(pcm_data)
# send encoded frames
for frame in encoded_frames:
logger.info(len(frame))
await websocket.send(frame)
await asyncio.sleep(0.02) # send a frame every 20ms
logger.info(f"done sending {wav_file}")
# send file end
await websocket.send(json.dumps({
"type": "file_end",
"filename": wav_file
Code: Select all
/* test different settings of opus decoder to decode audio from WebSocket server */
#include <string.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "freertos/event_groups.h"
#include "esp_log.h"
#include "esp_wifi.h"
#include "nvs_flash.h"
#include "sdkconfig.h"
#include "audio_element.h"
#include "audio_pipeline.h"
#include "audio_event_iface.h"
#include "audio_common.h"
#include "esp_peripherals.h"
#include "periph_wifi.h"
#include "board.h"
#include "i2s_stream.h"
#include "raw_stream.h"
#include "filter_resample.h"
#include "ringbuf.h"
// select different type of opus decoder
// 1 - RAW_OPUS_DECODER (with length prefix)
// 2 - RAW_OPUS_DECODER (without length prefix)
// 3 - OPUS_DECODER
#define DECODER_TYPE 1
#if (DECODER_TYPE == 1)
#include "raw_opus_decoder.h"
static const char *TAG = "RAW_OPUS_WITH_PREFIX";
#elif (DECODER_TYPE == 2)
#include "raw_opus_decoder.h"
static const char *TAG = "RAW_OPUS_NO_PREFIX";
#elif (DECODER_TYPE == 3)
#include "opus_decoder.h"
static const char *TAG = "OPUS_DECODER";
#else
#error "select a valid decoder first"
#endif
#include "esp_websocket_client.h"
// Address of the WebSocket server that streams the opus frames.
#define WEBSOCKET_URI "ws://192.168.31.132:8000"
// NOTE(review): not referenced anywhere in the visible code.
#define BUFFER_SIZE 1024
// Pipeline and its three elements: [raw] --> [opus] --> [i2s] (linked in app_main).
static audio_pipeline_handle_t pipeline;
static audio_element_handle_t raw_write;
static audio_element_handle_t opus_decoder;
static audio_element_handle_t i2s_writer;
// WebSocket client handle, created and started in app_main.
static esp_websocket_client_handle_t client;
// NOTE(review): these two ring buffer handles are not referenced in the visible code.
ringbuf_handle_t raw_in_rb;
ringbuf_handle_t opus_in_rb;
// Event group used by wifi_event_handler to signal init_wifi.
static EventGroupHandle_t s_wifi_event_group = NULL;
#define WIFI_CONNECTED_BIT BIT0
#define WIFI_FAIL_BIT BIT1
// Audio format used for the decoder configuration and the I2S clock.
#define SAMPLE_RATE 16000
#define CHANNELS 1
/*
 * WebSocket client event callback.
 *
 * Binary frames are treated as encoded audio and written into the raw stream
 * element that feeds the decoder; text frames are only logged. Any other
 * opcode (ping, pong, close, continuation) is reported at debug level instead
 * of being printed as if it were text.
 *
 * handler_args and base are unused; event_data points to an
 * esp_websocket_event_data_t.
 */
static void websocket_event_handler(void *handler_args, esp_event_base_t base, int32_t event_id, void *event_data)
{
    /* WebSocket frame opcodes (RFC 6455, section 5.2). */
    enum {
        WS_OPCODE_TEXT   = 0x1,
        WS_OPCODE_BINARY = 0x2,
    };

    esp_websocket_event_data_t *data = (esp_websocket_event_data_t *)event_data;

    switch (event_id) {
        case WEBSOCKET_EVENT_CONNECTED:
            ESP_LOGI(TAG, "WEBSOCKET_EVENT_CONNECTED");
            break;

        case WEBSOCKET_EVENT_DISCONNECTED:
            ESP_LOGI(TAG, "WEBSOCKET_EVENT_DISCONNECTED");
            break;

        case WEBSOCKET_EVENT_DATA:
            if (data == NULL) {
                break;
            }
            if (data->op_code == WS_OPCODE_BINARY) {
                ESP_LOGI(TAG, "receive binary data, len=%d", data->data_len);
                // print the first 8 bytes to debug
                if (data->data_len >= 8) {
                    // View the payload as unsigned so bytes >= 0x80 always print as two hex digits.
                    const unsigned char *bytes = (const unsigned char *)data->data_ptr;
                    ESP_LOGI(TAG, "frist 8 bytes: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x",
                             bytes[0], bytes[1], bytes[2], bytes[3],
                             bytes[4], bytes[5], bytes[6], bytes[7]);
                }
                // write into opus decoder
                if (data->data_len > 0) {
                    int bytes_written = raw_stream_write(raw_write, (char *)data->data_ptr, data->data_len);
                    ESP_LOGI(TAG, "write to raw_stream, written: %d bytes", bytes_written);
                    // A short (or negative) result means the payload did not fully reach the decoder.
                    if (bytes_written < data->data_len) {
                        ESP_LOGE(TAG, "error writing data into raw_stream,maybe the buffer is full");
                    }
                }
            } else if (data->op_code == WS_OPCODE_TEXT) {
                // text data
                ESP_LOGI(TAG, "receive text data, len=%d, content: %.*s",
                         data->data_len, data->data_len, (char *)data->data_ptr);
            } else {
                // Control or continuation frame: its payload is not text, so only note it.
                ESP_LOGD(TAG, "ignore frame, op_code=0x%x, len=%d", data->op_code, data->data_len);
            }
            break;

        case WEBSOCKET_EVENT_ERROR:
            ESP_LOGE(TAG, "WEBSOCKET_EVENT_ERROR");
            break;

        default:
            break;
    }
}
/*
 * Wi-Fi / IP event callback that drives the station connection.
 *
 * - STA_START:        start the first connection attempt.
 * - STA_DISCONNECTED: clear WIFI_CONNECTED_BIT and retry. WIFI_FAIL_BIT is
 *                     only raised after several consecutive failures, so a
 *                     single transient disconnect no longer makes the waiter
 *                     in init_wifi give up while a retry is still in flight.
 *                     Reconnection keeps being attempted afterwards.
 * - GOT_IP:           reset the failure counter, clear WIFI_FAIL_BIT and set
 *                     WIFI_CONNECTED_BIT.
 *
 * arg is unused; event_data is only read for IP_EVENT_STA_GOT_IP.
 */
static void wifi_event_handler(void *arg, esp_event_base_t event_base, int32_t event_id, void *event_data)
{
    /* Consecutive failed attempts tolerated before WIFI_FAIL_BIT is reported. */
    enum { WIFI_MAX_RETRY_BEFORE_FAIL = 5 };
    static int s_retry_count = 0;

    if (event_base == WIFI_EVENT && event_id == WIFI_EVENT_STA_START) {
        esp_wifi_connect();
    } else if (event_base == WIFI_EVENT && event_id == WIFI_EVENT_STA_DISCONNECTED) {
        ESP_LOGI(TAG, "WiFi disconnected,trying to reconnect...");
        xEventGroupClearBits(s_wifi_event_group, WIFI_CONNECTED_BIT);
        esp_wifi_connect();
        s_retry_count++;
        if (s_retry_count >= WIFI_MAX_RETRY_BEFORE_FAIL) {
            xEventGroupSetBits(s_wifi_event_group, WIFI_FAIL_BIT);
        }
    } else if (event_base == IP_EVENT && event_id == IP_EVENT_STA_GOT_IP) {
        ip_event_got_ip_t *event = (ip_event_got_ip_t *)event_data;
        ESP_LOGI(TAG, "Got IP address:" IPSTR, IP2STR(&event->ip_info.ip));
        s_retry_count = 0;
        // A stale failure flag from an earlier outage must not outlive a successful connection.
        xEventGroupClearBits(s_wifi_event_group, WIFI_FAIL_BIT);
        xEventGroupSetBits(s_wifi_event_group, WIFI_CONNECTED_BIT);
    }
}
static void init_wifi(void)
{
s_wifi_event_group = xEventGroupCreate();
ESP_ERROR_CHECK(esp_netif_init());
ESP_ERROR_CHECK(esp_event_loop_create_default());
esp_netif_create_default_wifi_sta();
wifi_init_config_t cfg = WIFI_INIT_CONFIG_DEFAULT();
ESP_ERROR_CHECK(esp_wifi_init(&cfg));
ESP_ERROR_CHECK(esp_event_handler_register(WIFI_EVENT, ESP_EVENT_ANY_ID, &wifi_event_handler, NULL));
ESP_ERROR_CHECK(esp_event_handler_register(IP_EVENT, IP_EVENT_STA_GOT_IP, &wifi_event_handler, NULL));
wifi_config_t wifi_config = {
.sta = {
.ssid = CONFIG_WIFI_SSID,
.password = CONFIG_WIFI_PASSWORD,
},
};
ESP_ERROR_CHECK(esp_wifi_set_mode(WIFI_MODE_STA));
ESP_ERROR_CHECK(esp_wifi_set_config(WIFI_IF_STA, &wifi_config));
ESP_ERROR_CHECK(esp_wifi_start());
ESP_LOGI(TAG, "WiFi initialized, waiting for connections...");
// wait for connections
EventBits_t bits = xEventGroupWaitBits(s_wifi_event_group,
WIFI_CONNECTED_BIT | WIFI_FAIL_BIT,
pdFALSE,
pdFALSE,
portMAX_DELAY);
if (bits & WIFI_CONNECTED_BIT) {
ESP_LOGI(TAG, "WiFi Connected");
} else if (bits & WIFI_FAIL_BIT) {
ESP_LOGE(TAG, "fail to connect to WiFi");
} else {
ESP_LOGE(TAG, "unexpected error");
}
}
void app_main(void)
{
esp_log_level_set("*", ESP_LOG_INFO);
esp_log_level_set(TAG, ESP_LOG_DEBUG);
ESP_LOGI(TAG, "[ 1 ] initializing NVS");
esp_err_t err = nvs_flash_init();
if (err == ESP_ERR_NVS_NO_FREE_PAGES) {
ESP_ERROR_CHECK(nvs_flash_erase());
err = nvs_flash_init();
}
ESP_ERROR_CHECK(err);
ESP_LOGI(TAG, "[ 2 ] initializing peripherals");
// esp_periph_config_t periph_cfg = DEFAULT_ESP_PERIPH_SET_CONFIG();
// esp_periph_set_handle_t set = esp_periph_set_init(&periph_cfg);
ESP_LOGI(TAG, "[ 3 ] initializing Wi-Fi");
init_wifi();
ESP_LOGI(TAG, "[ 4 ] setup audio pipeline");
audio_pipeline_cfg_t pipeline_cfg = DEFAULT_AUDIO_PIPELINE_CONFIG();
pipeline = audio_pipeline_init(&pipeline_cfg);
mem_assert(pipeline);
// setup raw_stream as a data entry
raw_stream_cfg_t raw_cfg = RAW_STREAM_CFG_DEFAULT();
raw_cfg.type = AUDIO_STREAM_WRITER;
raw_write = raw_stream_init(&raw_cfg);
ESP_LOGI(TAG, "[ 5 ] setup decoder");
#if (DECODER_TYPE == 1)
// RAW_OPUS_DECODER with length prefix
raw_opus_dec_cfg_t opus_cfg = RAW_OPUS_DEC_CONFIG_DEFAULT();
opus_cfg.sample_rate = SAMPLE_RATE;
opus_cfg.channels = CHANNELS;
opus_cfg.dec_frame_size = 960; // 60ms @ 16kHz = 960 samples
opus_cfg.enable_frame_length_prefix = true; // with length prefix
opus_decoder = raw_opus_decoder_init(&opus_cfg);
ESP_LOGI(TAG, "using RAW_OPUS_DECODER (with length prefix)");
#elif (DECODER_TYPE == 2)
// RAW_OPUS_DECODER without length prefix
raw_opus_dec_cfg_t opus_cfg = RAW_OPUS_DEC_CONFIG_DEFAULT();
opus_cfg.sample_rate = SAMPLE_RATE;
opus_cfg.channels = CHANNELS;
opus_cfg.dec_frame_size = 960; // 60ms @ 16kHz = 960 samples
opus_cfg.enable_frame_length_prefix = false; // disable length prefix
opus_cfg.self_delimited = true;
opus_decoder = raw_opus_decoder_init(&opus_cfg);
ESP_LOGI(TAG, "using RAW_OPUS_DECODER (without length prefix)");
#elif (DECODER_TYPE == 3)
// standard OPUS_DECODER(ogg)
opus_decoder_cfg_t opus_cfg = DEFAULT_OPUS_DECODER_CONFIG();
opus_decoder = decoder_opus_init(&opus_cfg);
ESP_LOGI(TAG, "using standard OPUS_DECODER(ogg)");
#endif
ESP_LOGI(TAG, "[ 6 ] setup i2s stream");
i2s_stream_cfg_t i2s_cfg = I2S_STREAM_CFG_DEFAULT();
i2s_cfg.type = AUDIO_STREAM_WRITER;
// dma buffer setup
i2s_cfg.chan_cfg.dma_desc_num = 8;
i2s_cfg.chan_cfg.dma_frame_num = 1024;
// sample rate、bit width、channel
i2s_cfg.std_cfg.clk_cfg.sample_rate_hz = SAMPLE_RATE;
i2s_cfg.std_cfg.slot_cfg.data_bit_width = I2S_DATA_BIT_WIDTH_16BIT;
i2s_cfg.std_cfg.slot_cfg.slot_mode = I2S_SLOT_MODE_MONO;
i2s_cfg.out_rb_size = 16 * 1024;
i2s_writer = i2s_stream_init(&i2s_cfg);
ESP_LOGI(TAG, "[ 7 ] registering all elements to pipeline");
audio_pipeline_register(pipeline, raw_write, "raw");
audio_pipeline_register(pipeline, opus_decoder, "opus");
audio_pipeline_register(pipeline, i2s_writer, "i2s");
ESP_LOGI(TAG, "[ 8 ] linking elements: [raw] --> [opus] --> [i2s]");
const char *link_tag[3] = {"raw", "opus", "i2s"};
audio_pipeline_link(pipeline, &link_tag[0], 3);
ESP_LOGI(TAG, "[ 9 ] setup event listener");
audio_event_iface_cfg_t evt_cfg = AUDIO_EVENT_IFACE_DEFAULT_CFG();
evt_cfg.queue_set_size = 20;
audio_event_iface_handle_t evt = audio_event_iface_init(&evt_cfg);
audio_pipeline_set_listener(pipeline, evt);
ESP_LOGI(TAG, "[ 10 ] launch the audio pipeline");
audio_pipeline_run(pipeline);
ESP_LOGI(TAG, "[ 11 ] initializing WebSocket client");
esp_websocket_client_config_t websocket_cfg = {
.uri = WEBSOCKET_URI,
};
client = esp_websocket_client_init(&websocket_cfg);
esp_websocket_register_events(client, WEBSOCKET_EVENT_ANY, websocket_event_handler, NULL);
ESP_LOGI(TAG, "[ 12 ] starting WebSocket client");
esp_websocket_client_start(client);
ESP_LOGI(TAG, "[ 13 ] listen for events");
while (1) {
audio_event_iface_msg_t msg;
esp_err_t ret = audio_event_iface_listen(evt, &msg, 100 / portTICK_PERIOD_MS);
if (ret != ESP_OK) {
ESP_LOGE(TAG, "[ * ] event iface error: %d", ret);
// small delay to prevent cpu overload
vTaskDelay(pdMS_TO_TICKS(10));
continue;
}
if (msg.source_type == AUDIO_ELEMENT_TYPE_ELEMENT && msg.source == (void *)opus_decoder
&& msg.cmd == AEL_MSG_CMD_REPORT_MUSIC_INFO) {
audio_element_info_t music_info = {0};
audio_element_getinfo(opus_decoder, &music_info);
ESP_LOGI(TAG, "[ * ] audio info received,sample rate: %d, channels: %d",
music_info.sample_rates, music_info.channels);
i2s_stream_set_clk(i2s_writer, music_info.sample_rates, music_info.bits, music_info.channels);
continue;
}
/* handle pipeline event */
if (msg.source_type == AUDIO_ELEMENT_TYPE_ELEMENT && msg.cmd == AEL_MSG_CMD_REPORT_STATUS) {
audio_element_state_t el_state = audio_element_get_state(msg.source);
if (el_state == AEL_STATE_FINISHED) {
ESP_LOGI(TAG, "[ * ] element done processing: %s", (char *)msg.data);
}
}
}
}