Speak Text with Google's Text-to-Speech

I use this script a lot, especially since I can call it from other scripts. It does require a Google Cloud account and some configuration on your machine, but it's totally worth it for the much higher-quality voices.

Here's how you run this from another script. The number after "--voice" is an index into the list of available voices, so you can select a different voice by changing 5 to another index:


await run("speak-text", "I like tacos", "--voice", 5)

The script in full:

Install speak-text


// Menu: Speak Text
// Description: Speaks Text Using Google's Text-to-Speech
// Author: John Lindquist
// Twitter: @johnlindquist

// Requires a Google Cloud account and configuration:
// https://cloud.google.com/text-to-speech

let { playAudioFile } = await kit("audio")
let { format } = await npm("date-fns")

/** @type typeof import("@google-cloud/text-to-speech") */
let textToSpeech = await npm("@google-cloud/text-to-speech")
let client = new textToSpeech.TextToSpeechClient()

// Text to speak
let text = await arg("What should I say?")

// Voice choices are persisted in the "voices" db so that listVoices is only
// called when the stored list is empty
let voicesDB = db("voices", { voices: [] })
let voices = voicesDB.get("voices").value()

//cache voices
if (voices.length === 0) {
  let [{ voices: englishVoices }] = await client.listVoices({
    languageCode: "en",
  })

  let voiceChoices = englishVoices.map(voice => {
    // Use the language code reported for the voice. The previous
    // `voice.name.slice(0, 4)` cut the tag short (a name starting with
    // "en-US" yielded "en-U"). If no code is listed, fall back to the
    // first two dash-separated segments of the name.
    let languageCode =
      voice.languageCodes?.[0] ??
      voice.name.split("-").slice(0, 2).join("-")

    return {
      // Label shown in the "Select voice" prompt
      name: `${voice.ssmlGender} - ${voice.name}`,
      // Value later used as the `voice` field of the synthesize request
      value: {
        ...voice,
        languageCode,
      },
    }
  })

  // NOTE: only an empty cache is (re)built here; entries stored by an
  // earlier run are left as they are.
  voicesDB.set("voices", voiceChoices).write()
  voices = voicesDB.get("voices").value()
}

// From the terminal or run
// speak-text "I like tacos" --voice 5
// await run("speak-text", "I like tacos", "--voice", "5")
//
// --voice is an index into the cached `voices` list. Both a number and a
// numeric string (as in the run example above) are accepted. If the flag is
// missing, not an integer, or does not point at an existing entry, the
// "Select voice" prompt is shown instead of crashing on `undefined.value`.
let requestedVoice = arg?.voice
let voiceIndex =
  typeof requestedVoice === "string" && requestedVoice.trim() !== ""
    ? Number(requestedVoice)
    : requestedVoice
let indexedChoice = Number.isInteger(voiceIndex)
  ? voices[voiceIndex]
  : undefined
let voice = indexedChoice
  ? indexedChoice.value
  : await arg("Select voice", voices)

// Effects profile ids sent in the request's audioConfig
let effectsProfileId = ["headphone-class-device"]

/**
 * Assemble the payload handed to `client.synthesizeSpeech`.
 *
 * @param {object} voice - voice selection, passed through unchanged
 * @param {string} text - plain text to speak
 * @returns {{input: {text: string}, voice: object, audioConfig: object}}
 *   request asking for MP3 audio at a speaking rate of 1
 */
let createRequest = (voice, text) => {
  let audioConfig = {
    audioEncoding: "MP3",
    effectsProfileId,
    speakingRate: 1,
  }
  let input = { text }

  return { input, voice, audioConfig }
}

// Request payload for the chosen voice and text
let request = createRequest(voice, text)

// Slug for the file name: the first 10 characters of the text, lowercased,
// with everything other than letters and digits replaced by "-"
let safeFileName = text
  .slice(0, 10)
  .replace(/[^a-z0-9]/gi, "-")
  .toLowerCase()

// "HH" is the 24-hour clock. The previous "hh" is a 12-hour clock with no
// AM/PM marker, so morning and evening runs produced the same timestamp.
let date = format(new Date(), "yyyy-MM-dd-HH-mm-ss")
let fileName = `${date}-${safeFileName}.mp3`

// Send the text-to-speech request; the response is the first element of
// the resolved array
let [response] = await client.synthesizeSpeech(request)

// Save the returned audio as an .mp3 at the path given by tmp(fileName)
let textAudioPath = tmp(fileName)
let { audioContent } = response
await writeFile(textAudioPath, audioContent, "binary")

// Play the file that was just written
playAudioFile(textAudioPath)