Google TTS only plays a few words

trizzant
Posts: 1
Joined: Wed Jan 01, 2025 9:11 am

Google TTS only plays a few words

Postby trizzant » Wed Jan 01, 2025 9:27 am

Hello. I've been working on an issue for several days and cannot find a way to fix it.
Expected result: the full text-to-speech audio is spoken.
Actual result: only a few words are spoken, e.g. "hello good morning". If additional words are added, only a blip of static is played.


Code: Untitled.txt Select all



Hardware:

ESP32-S3-DevKitC-1-N8R2
Partition Scheme: "Default 8M w/ spiffs (3MB APP/1.5MB spiffs)"
QSPI PSRAM
Flash: QIO 80MHz

INMP441
MAX98357

Code: Untitled.cpp Select all



//Arduino v2.3.4

#include <Arduino.h>
#include <WiFi.h>
#include <HTTPClient.h>
#include "driver/i2s.h"
#include <ArduinoJson.h>
#include <mbedtls/base64.h>

// I2S Pins
#define I2S_BCLK GPIO_NUM_14
#define I2S_LRCLK GPIO_NUM_40
#define I2S_DOUT GPIO_NUM_9
#define I2S_DIN GPIO_NUM_17
#define I2S_AMP_SD GPIO_NUM_21

#define SAMPLE_RATE 16000
#define MAX_AUDIO_DATA 100000

class TTSPlayer {
private:
HTTPClient http;
const char* apiKey;
int16_t* audioBuffer = nullptr;
size_t audioDataSize = 0;
float gain = 2.0f;

void initI2S() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 64,
.use_apll = false,
.tx_desc_auto_clear = true,
.fixed_mclk = 0
};

i2s_pin_config_t pin_config = {
.bck_io_num = I2S_BCLK,
.ws_io_num = I2S_LRCLK,
.data_out_num = I2S_DOUT,
.data_in_num = I2S_PIN_NO_CHANGE
};

i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);
i2s_set_pin(I2S_NUM_0, &pin_config);
i2s_zero_dma_buffer(I2S_NUM_0);
}

void playAudio() {
Serial.println("Starting playback...");
i2s_zero_dma_buffer(I2S_NUM_0);
size_t totalBytesWritten = 0;
int16_t* bufferPtr = audioBuffer;
size_t samplesToWrite = audioDataSize / 2;

while (totalBytesWritten < samplesToWrite) {
// Process 512 samples at a time (like your working code)
size_t chunkSize = 512;
if (samplesToWrite - totalBytesWritten < chunkSize) {
chunkSize = samplesToWrite - totalBytesWritten;
}

// Temporary buffer for processed samples
int16_t tempBuffer[512]; // Fixed size buffer

// Apply gain to samples
for (size_t i = 0; i < chunkSize; i++) {
int32_t sample = (int32_t)(bufferPtr[i] * gain);

// Prevent clipping
if (sample > INT16_MAX) {
sample = INT16_MAX;
} else if (sample < INT16_MIN) {
sample = INT16_MIN;
}

tempBuffer[i] = (int16_t)sample;
}

// Write to I2S
size_t bytesToWrite = chunkSize * sizeof(int16_t);
size_t numBytesWritten = 0;
esp_err_t result = i2s_write(I2S_NUM_0, tempBuffer, bytesToWrite, &numBytesWritten, portMAX_DELAY);

if (result != ESP_OK) {
Serial.println("I2S Write Error");
break;
}

bufferPtr += chunkSize;
totalBytesWritten += chunkSize;
}
i2s_zero_dma_buffer(I2S_NUM_0);
Serial.println("Playback complete!");
}

public:
TTSPlayer(const char* apiKeyParam) : apiKey(apiKeyParam) {
pinMode(I2S_AMP_SD, OUTPUT);
digitalWrite(I2S_AMP_SD, HIGH);
initI2S();

// Allocate audio buffer in PSRAM
audioBuffer = (int16_t*)heap_caps_malloc(MAX_AUDIO_DATA, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (audioBuffer == NULL) {
Serial.println("Failed to allocate audio buffer");
}
}

~TTSPlayer() {
if (audioBuffer) {
free(audioBuffer);
}
}

void playTTS(const char* text) {
const char* tts_endpoint = "https://texttospeech.googleapis.com/v1/text:synthesize";
String url = String(tts_endpoint) + "?key=" + apiKey;

http.begin(url);
http.addHeader("Content-Type", "application/json");

DynamicJsonDocument doc(1024);
doc["input"]["text"] = text;
doc["voice"]["languageCode"] = "en-US";
doc["audioConfig"]["audioEncoding"] = "LINEAR16";
doc["audioConfig"]["sampleRateHertz"] = SAMPLE_RATE;
doc["audioConfig"]["volumeGainDb"] = 0.0;

String requestBody;
serializeJson(doc, requestBody);

Serial.println("Sending request to Google Cloud TTS...");
int httpCode = http.POST(requestBody);

if (httpCode > 0) {
Serial.printf("HTTP Response code: %d\n", httpCode);

if (httpCode == HTTP_CODE_OK) {
String response = http.getString();
DynamicJsonDocument responseDoc(32768);
DeserializationError error = deserializeJson(responseDoc, response);

if (!error) {
const char* audioContent = responseDoc["audioContent"];
if (audioContent && strlen(audioContent) > 0) {
Serial.println("Decoding audio data...");

size_t decodedLength = strlen(audioContent) * 3 / 4;
uint8_t* decodedAudio = (uint8_t*)malloc(decodedLength);

if (decodedAudio) {
size_t outputLength;
int decodeResult = mbedtls_base64_decode(
decodedAudio,
decodedLength,
&outputLength,
(const unsigned char*)audioContent,
strlen(audioContent)
);

if (decodeResult == 0) {
Serial.printf("Decoded %d bytes of audio data\n", outputLength);

// Clear the audio buffer first
memset(audioBuffer, 0, MAX_AUDIO_DATA);

// Copy decoded data to audio buffer
if (outputLength <= MAX_AUDIO_DATA) {
memcpy(audioBuffer, decodedAudio, outputLength);
audioDataSize = outputLength;

// Play the audio
playAudio();
} else {
Serial.println("Audio data too large for buffer");
}
}
free(decodedAudio);
}
}
}
}
}
http.end();
}
};

// WiFi credentials passed to WiFi.begin() in setup(); placeholder values as posted.
const char* ssid = "ssid";
const char* password = "password";
// API key handed to the TTSPlayer constructor in setup(); placeholder value as posted.
const char* apiKey = "apiKey";

// Assigned in setup() after WiFi connects; never deleted in the code shown.
TTSPlayer* player;

// One-time startup: open the serial port, block until WiFi is associated,
// then create the player and speak a single test phrase.
void setup() {
  Serial.begin(115200);

  WiFi.begin(ssid, password);
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    // Not connected yet: wait half a second and print a progress dot.
    delay(500);
    Serial.print(".");
  }
  Serial.println("\nWiFi connected");

  player = new TTSPlayer(apiKey);
  player->playTTS("Hello good morning");
}

// Nothing to do after setup(); sleep one second per pass.
void loop()
{
  const unsigned long idleMs = 1000;
  delay(idleMs);
}

Who is online

Users browsing this forum: Semrush [Bot] and 1 guest