Description
This script provides a simple example for generating Text-To-Speech (TTS) using the Gemini API within Google Apps Script. The Gemini API generates audio data in the audio/L16;codec=pcm;rate=24000
format, which is not directly playable. Since there’s no built-in method to convert this to a standard audio/wav
format, this sample script includes a custom function to handle the conversion.
Limitations and Considerations
- The provided
convertL16ToWav_
function is specifically designed for the audio/L16;codec=pcm;rate=24000
MIME type. Using it with other audio formats will result in an error. - The script uses a hardcoded WAV header. This header assumes specific audio parameters (e.g., sample rate, bit depth, number of channels) that match the Gemini API’s output for this format. If the Gemini API’s output format changes, this header might need adjustment.
Sample Script
Before running, replace "###" with your actual Gemini API key in the myFunction function.
/**
 * Converts byte data of "audio/L16" (16-bit linear PCM) to byte data of
 * "audio/wav" by prepending a standard 44-byte RIFF/WAVE header.
 *
 * @param {Byte[]} inputData Input data (raw audio/L16 PCM samples).
 * @param {string} [mimeType="audio/L16;codec=pcm;rate=24000"] MIME type of the
 *     input. Parameters (codec, rate) may appear in any order.
 * @param {number} [numChannels=1] Mono and stereo are 1 and 2, respectively.
 * @return {Byte[]} Converted data as byte data (WAV header + PCM payload).
 * @throws {Error} If the MIME type is not "audio/L16" with codec=pcm, or the
 *     sample rate is missing or not a positive number.
 */
function convertL16ToWav_(inputData, mimeType = "audio/L16;codec=pcm;rate=24000", numChannels = 1) {
  // Parse "type;key=value;key=value" without assuming a fixed parameter order.
  const segments = mimeType.split(";").map((e) => e.trim());
  const type = segments[0];
  const params = {};
  segments.slice(1).forEach((segment) => {
    const [key, value] = segment.split("=");
    params[key.toLowerCase()] = value;
  });
  const sampleRate = Number(params.rate);
  if (type != "audio/L16" || params.codec != "pcm" || !Number.isFinite(sampleRate) || sampleRate <= 0) {
    throw new Error(`Sorry. This function can be used only for "audio/L16;codec=pcm;rate=###".`);
  }
  const bitsPerSample = 16; // L16 is 16-bit PCM by definition.
  const blockAlign = (numChannels * bitsPerSample) / 8; // Bytes per sample frame.
  const byteRate = sampleRate * blockAlign;
  const dataSize = inputData.length;
  const fileSize = 36 + dataSize; // RIFF chunk size = 44-byte header - 8 + data.
  const header = new ArrayBuffer(44);
  const view = new DataView(header);
  // Helper: write an ASCII tag at the given byte offset.
  const writeTag = (offset, tag) =>
    [...tag].forEach((c, i) => view.setUint8(offset + i, c.charCodeAt(0)));
  writeTag(0, "RIFF");
  view.setUint32(4, fileSize, true);
  writeTag(8, "WAVE");
  writeTag(12, "fmt ");
  view.setUint32(16, 16, true); // "fmt " sub-chunk size for PCM.
  view.setUint16(20, 1, true); // Audio format: 1 = uncompressed PCM.
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeTag(36, "data");
  view.setUint32(40, dataSize, true);
  return [...new Uint8Array(header), ...inputData];
}
/**
 * Generates multi-speaker Text-To-Speech audio with the Gemini API and saves
 * the result as "sample.wav" in the root folder of Google Drive.
 *
 * Side effects: one HTTP request to the Gemini API and one file created in
 * Drive. Throws with the API's error body on a non-200 response, or when the
 * response contains no audio data (e.g. a blocked or empty candidate).
 */
function myFunction() {
  const apiKey = "###"; // Please set your API key here.
  const text = [
    "Create Text-To-Speech the following conversation.",
    "User A: Hey there! How are you doing today?",
    "User B: Hi! I'm doing well, thanks. How about you?",
    "User A: I'm good too, thanks for asking!",
  ].join("\n");
  const url = `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key=${apiKey}`;
  const payload = {
    contents: [{ role: "user", parts: [{ text }] }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        multiSpeakerVoiceConfig: {
          // Map each "User X:" speaker label in the text to a prebuilt voice.
          speakerVoiceConfigs: [
            {
              speaker: "User A",
              voiceConfig: { prebuiltVoiceConfig: { voiceName: "Kore" } },
            },
            {
              speaker: "User B",
              voiceConfig: { prebuiltVoiceConfig: { voiceName: "Leda" } },
            },
          ],
        },
      },
    },
  };
  const res = UrlFetchApp.fetch(url, {
    contentType: "application/json",
    payload: JSON.stringify(payload),
    muteHttpExceptions: true, // Surface the API's error body instead of an opaque exception.
  });
  if (res.getResponseCode() != 200) {
    throw new Error(`Gemini API request failed (${res.getResponseCode()}): ${res.getContentText()}`);
  }
  const obj = JSON.parse(res.getContentText());
  // Guarded extraction: a blocked or empty candidate has no inlineData, and
  // indexing into it directly would raise a cryptic TypeError.
  const candidate = (obj.candidates || [])[0] || {};
  const parts = (candidate.content || {}).parts || [];
  const inlineData = (parts[0] || {}).inlineData;
  if (!inlineData) {
    throw new Error(`No audio data in Gemini API response: ${res.getContentText()}`);
  }
  const { data, mimeType } = inlineData;
  const blob = Utilities.newBlob(
    convertL16ToWav_(Utilities.base64Decode(data), mimeType),
    "audio/wav",
    "sample.wav"
  );
  DriveApp.createFile(blob);
}
Testing
Upon successful execution of myFunction, an audio file named sample.wav will be created in the root folder of your Google Drive. You can then click this file to play the generated speech.