Google TTS only plays a few words

Post by **trizzant** » Wed Jan 01, 2025 9:27 am

Hello. I've been working on an issue for several days and cannot find a way to fix it.
Expected result: the full text is spoken via text-to-speech.
Actual result: only a few words are spoken, e.g. "hello good morning". If additional words are added, only a blip of static is played.

Code: Untitled.txt Select all




Hardware:



ESP32-S3-DevKitC-1-N8R2

Partition Scheme: "Default 8M w/ spiffs (3MB APP/1.5MB spiffs)"

QSPI PSRAM

Flash: QIO 80MHz



INMP441

Max98357

Code: Untitled.cpp Select all




//Arduino v2.3.4



#include <Arduino.h>

#include <WiFi.h>

#include <HTTPClient.h>

#include "driver/i2s.h"

#include <ArduinoJson.h>

#include <mbedtls/base64.h>



// I2S pins
#define I2S_BCLK          GPIO_NUM_14   // bit clock
#define I2S_LRCLK         GPIO_NUM_40   // word select (left/right clock)
#define I2S_DOUT          GPIO_NUM_9    // data out to the amplifier
#define I2S_DIN           GPIO_NUM_17   // data in (not referenced by the playback code below)
#define I2S_AMP_SD        GPIO_NUM_21   // amplifier shutdown pin, driven HIGH by TTSPlayer

// Sample rate, in Hz, requested from the TTS service and configured on the I2S bus.
#define SAMPLE_RATE       16000

// Capacity of the playback buffer in BYTES (not samples).
// 16-bit mono audio at SAMPLE_RATE consumes SAMPLE_RATE * 2 bytes per second,
// so the previous value of 100000 held only ~3.1 s of speech and anything
// longer was rejected. 256 KiB holds ~8.2 s; the buffer is allocated from
// PSRAM by TTSPlayer, so raise this further if longer sentences are needed
// and memory allows.
#define MAX_AUDIO_DATA    (256 * 1024)



class TTSPlayer {

private:

    HTTPClient http;

    const char* apiKey;

    int16_t* audioBuffer = nullptr;

    size_t audioDataSize = 0;

    float gain = 2.0f;



    void initI2S() {

        i2s_config_t i2s_config = {

            .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),

            .sample_rate = SAMPLE_RATE,

            .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,

            .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,

            .communication_format = I2S_COMM_FORMAT_I2S,

            .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,

            .dma_buf_count = 8,

            .dma_buf_len = 64,

            .use_apll = false,

            .tx_desc_auto_clear = true,

            .fixed_mclk = 0

        };



        i2s_pin_config_t pin_config = {

            .bck_io_num = I2S_BCLK,

            .ws_io_num = I2S_LRCLK,

            .data_out_num = I2S_DOUT,

            .data_in_num = I2S_PIN_NO_CHANGE

        };



        i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);

        i2s_set_pin(I2S_NUM_0, &pin_config);

        i2s_zero_dma_buffer(I2S_NUM_0);

    }



    void playAudio() {

        Serial.println("Starting playback...");

        i2s_zero_dma_buffer(I2S_NUM_0);

        size_t totalBytesWritten = 0;

        int16_t* bufferPtr = audioBuffer;

        size_t samplesToWrite = audioDataSize / 2;



        while (totalBytesWritten < samplesToWrite) {

            // Process 512 samples at a time (like your working code)

            size_t chunkSize = 512;

            if (samplesToWrite - totalBytesWritten < chunkSize) {

                chunkSize = samplesToWrite - totalBytesWritten;

            }



            // Temporary buffer for processed samples

            int16_t tempBuffer[512];  // Fixed size buffer



            // Apply gain to samples

            for (size_t i = 0; i < chunkSize; i++) {

                int32_t sample = (int32_t)(bufferPtr[i] * gain);



                // Prevent clipping

                if (sample > INT16_MAX) {

                    sample = INT16_MAX;

                } else if (sample < INT16_MIN) {

                    sample = INT16_MIN;

                }



                tempBuffer[i] = (int16_t)sample;

            }



            // Write to I2S

            size_t bytesToWrite = chunkSize * sizeof(int16_t);

            size_t numBytesWritten = 0;

            esp_err_t result = i2s_write(I2S_NUM_0, tempBuffer, bytesToWrite, &numBytesWritten, portMAX_DELAY);

            

            if (result != ESP_OK) {

                Serial.println("I2S Write Error");

                break;

            }



            bufferPtr += chunkSize;

            totalBytesWritten += chunkSize;

        }

        i2s_zero_dma_buffer(I2S_NUM_0);

        Serial.println("Playback complete!");

    }



public:

    TTSPlayer(const char* apiKeyParam) : apiKey(apiKeyParam) {

        pinMode(I2S_AMP_SD, OUTPUT);

        digitalWrite(I2S_AMP_SD, HIGH);

        initI2S();



        // Allocate audio buffer in PSRAM

        audioBuffer = (int16_t*)heap_caps_malloc(MAX_AUDIO_DATA, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);

        if (audioBuffer == NULL) {

            Serial.println("Failed to allocate audio buffer");

        }

    }



    ~TTSPlayer() {

        if (audioBuffer) {

            free(audioBuffer);

        }

    }



    void playTTS(const char* text) {

        const char* tts_endpoint = "https://texttospeech.googleapis.com/v1/text:synthesize";

        String url = String(tts_endpoint) + "?key=" + apiKey;

        

        http.begin(url);

        http.addHeader("Content-Type", "application/json");



        DynamicJsonDocument doc(1024);

        doc["input"]["text"] = text;

        doc["voice"]["languageCode"] = "en-US";

        doc["audioConfig"]["audioEncoding"] = "LINEAR16";

        doc["audioConfig"]["sampleRateHertz"] = SAMPLE_RATE;

        doc["audioConfig"]["volumeGainDb"] = 0.0;

        

        String requestBody;

        serializeJson(doc, requestBody);



        Serial.println("Sending request to Google Cloud TTS...");

        int httpCode = http.POST(requestBody);

        

        if (httpCode > 0) {

            Serial.printf("HTTP Response code: %d\n", httpCode);

            

            if (httpCode == HTTP_CODE_OK) {

                String response = http.getString();

                DynamicJsonDocument responseDoc(32768);

                DeserializationError error = deserializeJson(responseDoc, response);

                

                if (!error) {

                    const char* audioContent = responseDoc["audioContent"];

                    if (audioContent && strlen(audioContent) > 0) {

                        Serial.println("Decoding audio data...");

                        

                        size_t decodedLength = strlen(audioContent) * 3 / 4;

                        uint8_t* decodedAudio = (uint8_t*)malloc(decodedLength);

                        

                        if (decodedAudio) {

                            size_t outputLength;

                            int decodeResult = mbedtls_base64_decode(

                                decodedAudio,

                                decodedLength,

                                &outputLength,

                                (const unsigned char*)audioContent,

                                strlen(audioContent)

                            );

                            

                            if (decodeResult == 0) {

                                Serial.printf("Decoded %d bytes of audio data\n", outputLength);

                                

                                // Clear the audio buffer first

                                memset(audioBuffer, 0, MAX_AUDIO_DATA);

                                

                                // Copy decoded data to audio buffer

                                if (outputLength <= MAX_AUDIO_DATA) {

                                    memcpy(audioBuffer, decodedAudio, outputLength);

                                    audioDataSize = outputLength;

                                    

                                    // Play the audio

                                    playAudio();

                                } else {

                                    Serial.println("Audio data too large for buffer");

                                }

                            }

                            free(decodedAudio);

                        }

                    }

                }

            }

        }

        http.end();

    }

};



// Network and service credentials (placeholders - replace before flashing).
const char* ssid{"ssid"};          // WiFi network name passed to WiFi.begin()
const char* password{"password"};  // WiFi passphrase passed to WiFi.begin()
const char* apiKey{"apiKey"};      // key handed to the TTSPlayer constructor



// Single player instance; created in setup() once WiFi is connected.
TTSPlayer* player = nullptr;



// Arduino entry point: opens the serial port, joins the WiFi network, then
// creates the player and speaks one test phrase.
void setup() {
    Serial.begin(115200);

    // Poll every 500 ms, printing a progress dot, until WiFi.status()
    // reports WL_CONNECTED. There is no timeout.
    WiFi.begin(ssid, password);
    for (;;) {
        if (WiFi.status() == WL_CONNECTED) {
            break;
        }
        delay(500);
        Serial.print(".");
    }
    Serial.println("\nWiFi connected");

    // The player is created only now, after the serial port and WiFi are up.
    player = new TTSPlayer(apiKey);
    player->playTTS("Hello good morning");
}



// Arduino main loop: nothing to do after setup(), so just idle one second per pass.
void loop() {
    const uint32_t idleMs = 1000;
    delay(idleMs);
}

Google TTS only plays a few words

Google TTS only plays a few words

Who is online

About Us

Extra

Information