Source: create_audio.mjs

/**
 * @fileOverview Provides functionalities for generating audio content from text and JSON data using OpenAI's APIs. 
 * 
 * This module contains functions for creating speech from text input, saving audio files, extracting text from JSON for speech synthesis, and converting JSON data to audio files in a specified directory. It leverages OpenAI's text-to-speech and GPT-4 models to process and convert textual content into spoken audio, supporting various customization options like voice model and speech speed. 
 * 
 * Functions exposed from [cli](module-Ipynb2web_cli.html) and [node](module-Ipynb2web_node.html).
 * @module create_audio
 * @exports {Object} - Exports functions like createSpeech, saveSpeech, getTextFromJson, and speechFromDir for audio processing and generation.
 * @author Charles Karpati
 */



import fs from "fs";
import path from 'path';

/**
 * Creates an audio speech from text using the OpenAI text-to-speech REST endpoint.
 *
 * The request is sent with the global `fetch`; the MP3 bytes of a successful
 * response are returned as a Node.js Buffer. Failures (missing key, network
 * error, non-2xx HTTP status) never throw: they are logged when `verbose` is
 * true and reported to the caller as `null`.
 *
 * @async
 * @public
 * @param {string} input - The text to be converted into speech (the API limits this to 4096 characters).
 * @param {string} [apikey] - The OpenAI API key. If not provided, it will use the environment variable 'OPENAI_API_KEY'.
 * @param {string} [voice='echo'] - The voice model to use.
 * @param {number} [speed=1.0] - The speed of the speech (0.25 to 4.0).
 * @param {string} [model='tts-1'] - The speech model to use.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @returns {Promise<Buffer|null>} The audio data as a Buffer, or null if an error occurs or no API key is provided.
 * @memberof module:create_audio
 */
async function createSpeech(input, apikey, voice = 'echo', speed = 1.0, model = 'tts-1', verbose = false) {
    const key = apikey || process.env.OPENAI_API_KEY;
    if (!key) { verbose && console.log('No API Key provided and \"env.OPENAI_API_KEY\" not found.'); return null; }
    try {
        const mp3Response = await fetch('https://api.openai.com/v1/audio/speech', {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${key}`,
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({
                model, input, voice, speed, response_format: 'mp3'
            })
        });
        // A non-2xx response carries a JSON error description, not audio;
        // returning it would end up being saved as a corrupt .mp3 file.
        if (!mp3Response.ok) {
            verbose && console.log('createSpeech error', `HTTP ${mp3Response.status} ${mp3Response.statusText}`);
            return null;
        }
        // The standard fetch Response has no `.buffer()`; convert the ArrayBuffer instead.
        return Buffer.from(await mp3Response.arrayBuffer());
    } catch (e) {
        verbose && console.log('createSpeech error', e);
        return null;
    }
}

/**
 * Writes an audio buffer to disk at the given path.
 *
 * Write failures are not propagated to the caller: they are caught and, when
 * `verbose` is true, printed to the console.
 *
 * @async
 * @param {string} mp3SaveFilePath - Destination path of the MP3 file.
 * @param {Buffer} buffer - The audio bytes to write.
 * @param {boolean} [verbose=false] - When true, logs the saved path on success or the error on failure.
 * @returns {Promise<void>} Resolves once the write has finished or failed; carries no value.
 * @memberof module:create_audio
 */
async function saveSpeech(mp3SaveFilePath, buffer, verbose = false) {
    const report = (...parts) => {
        if (verbose) { console.log(...parts); }
    };
    try {
        await fs.promises.writeFile(mp3SaveFilePath, buffer);
        report(`Audio saved to ${mp3SaveFilePath}`);
    } catch (writeError) {
        report('saveSpeech error', writeError);
    }
}

/**
 * Sends a page's JSON to the OpenAI chat completions endpoint and asks the
 * model to rewrite it as text suitable for reading aloud.
 *
 * The prompt is assembled from `json.title` and `json.summary` (each only when
 * truthy) followed by the JSON-serialised `json.content`. Failures (missing
 * key, network error, non-2xx HTTP status, unexpected response shape) never
 * throw: they are logged when `verbose` is true and reported as `null`.
 *
 * @async
 * @param {Object} json - The JSON object containing the data to extract text from.
 * @param {string} [apikey] - The OpenAI API key. If not provided, it will use the environment variable 'OPENAI_API_KEY'.
 * @param {string} [model='gpt-4o-mini'] - The text model to use.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @returns {Promise<string|null>} The extracted text from the JSON object, or null if an error occurs or no API key is provided.
 * @memberof module:create_audio
 */

async function getTextFromJson(json, apikey, model='gpt-4o-mini', verbose = false) {
    const key = apikey || process.env.OPENAI_API_KEY;
    if (!key) { verbose && console.log('No API Key provided and \"env.OPENAI_API_KEY\" not found.'); return null; }
    try {
        let text = !json.title ? '' : `Title:   ${json.title} \n `;
        text += !json.summary ? '' : `Summary: ${json.summary} \n `;
        text += `Content: ${JSON.stringify(json.content)}`;

        const requestBody = {
            model,
            messages: [
                {
                    "role": "system", "content": `
You are an assistant to a webpage to audio service. 
You will be given a webpage you must convert it to a form of text ready for reading aloud.
Start every conversion with a statement "You are listening to the audio version of this webpage" followed by the title and summary.
Under no circumstances should code be read and should be paraphrased or skipped. 
`
                },
                { "role": "user", "content": text }
            ]
        };

        const response = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${key}`,
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(requestBody)
        });
        // Error responses have no `choices`; report the HTTP status rather
        // than an opaque "cannot read properties of undefined".
        if (!response.ok) {
            verbose && console.log('getTextFromJson error', `HTTP ${response.status} ${response.statusText}`);
            return null;
        }
        const responseData = await response.json();
        // Returned from inside the try block: the result must not be read
        // from a block-scoped variable after the block has ended.
        return responseData.choices[0].message.content ?? null;
    } catch (e) {
        verbose && console.log('getTextFromJson error', e);
        return null;
    }
}

/**
 * Converts all JSON files in a directory to speech files.
 * Recursively processes sub-directories and skips non-JSON files.
 *
 * Only files whose extension is exactly `.json` and whose parsed content has a
 * truthy `meta.audio` are converted. Every MP3 is written directly into
 * `toFolder` (the source sub-directory layout is not reproduced) under the
 * JSON file's base name. Files are processed one at a time, and the returned
 * promise settles only after every save has finished.
 *
 * Errors never propagate: a failure while listing, reading or parsing stops
 * processing of the current directory and is logged when `verbose` is true.
 * A file whose text extraction or speech synthesis yields nothing is skipped.
 *
 * @async
 * @param {string} fromFolder - The directory containing JSON files.
 * @param {string} toFolder - The directory where the resulting MP3 files will be saved.
 * @param {string} [apikey] - The OpenAI API key. If not provided, it will use the environment variable 'OPENAI_API_KEY'.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @returns {Promise<void>} Does not resolve to a value; MP3 files are written as a side effect.
 * @memberof module:create_audio
 */

async function speechFromDir(fromFolder, toFolder, apikey, verbose = false) {
    const key = apikey || process.env.OPENAI_API_KEY;
    if (!key) { verbose && console.log('No API Key provided and \"env.OPENAI_API_KEY\" not found.'); return; }
    try {
        // List every entry of the source folder.
        for (const entry of fs.readdirSync(fromFolder)) {
            const filename = path.join(fromFolder, entry);
            if (fs.lstatSync(filename).isDirectory()) {
                // Await the recursion and forward the key/verbosity so nested
                // folders behave exactly like the top-level one.
                await speechFromDir(filename, toFolder, key, verbose);
                continue;
            }
            // Match on the real extension: a substring test would also accept
            // names such as "page.json.bak" or any file below "data.json/".
            if (path.extname(entry) !== '.json') { continue; }

            const json = JSON.parse(fs.readFileSync(filename, 'utf8'));
            if (!json?.meta?.audio) { continue; }

            const text = await getTextFromJson(json, key, undefined, verbose);
            if (!text) {
                verbose && console.log('speechFromDir: no text extracted, skipping', filename);
                continue;
            }
            const buffer = await createSpeech(text, key, undefined, undefined, undefined, verbose);
            if (!buffer) {
                verbose && console.log('speechFromDir: no audio generated, skipping', filename);
                continue;
            }
            const savePath = path.join(toFolder, `${path.basename(entry, '.json')}.mp3`);
            await saveSpeech(savePath, buffer, verbose);
        }
    }
    catch (e) {
        verbose && console.log('speechFromDir error', e);
        return;
    }
}

export { createSpeech, saveSpeech, getTextFromJson, speechFromDir }