How to Generate an AI Voiceover plus Sound for Your YouTube Video

Learn how to turn text into a lifelike spoken voiceover for your YouTube video.

Here is an example of how you can use AudioStack to create a voiceover plus soundtrack and attach it to your video.

📘

Pre-Requisites for this API Recipe

You will need to have set up your development environment and installed AudioStack to complete this tutorial. Follow the steps here to get set up.

You'll also need to install FFmpeg - this can be a little complicated. We've put together a step-by-step guide to help.

How to generate your own voiceover using text to speech and add a soundtrack

🚧

Make sure you have the rights to the video you use!

The first step is to download a video to work with. You may want to try out Pexels for royalty-free content that can be used in most commercial settings. In this example, let's download a video of a parrot (I used this Rainbow Lorikeet) and save it locally (on your computer) as parrot.mp4.

Then, create a new Python file called "videovoiceover.py" and copy and paste the following code into the file. Replace the "APIKEY" placeholder (in the audiostack.api_key line near the top) with your own API key, then save the file.

import audiostack
import os

audiostack.api_key = "APIKEY"  # placeholder: replace with your own AudioStack API key

def create_voiceover(req):
    """Generate a voiceover for one request and download the encoded audio.

    Runs the whole pipeline in order: create a script from the request text,
    synthesize speech, condense the speech to the target length, mix it,
    encode the mix as WAV, and download the result.

    Args:
        req: Mapping with the keys "name" (file name passed to the download
            call), "text" (script body), "voice", "speed" and "targetLength"
            (passed to the speech reduction call and reused as the mix's
            ``forceLength``).

    Returns:
        The item returned by ``Encoder.encode_mix``, after its ``download``
        method has been called, so the caller can print or inspect it.
    """
    # The request text is wrapped in a single "main" section of script markup.
    script_text = f"""
    <as:section name="main" soundsegment="main"> 
    {req["text"]}
    </as:section>"""

    print("Generating your script...")
    script = audiostack.Content.Script.create(scriptText=script_text, scriptName="test")

    print("Synthesizing speech...")
    tts = audiostack.Speech.TTS.create(scriptItem=script, voice=req["voice"], speed=req["speed"])

    print("Condensing speech to match target duration...")
    tts = audiostack.Speech.TTS.reduce(speechId=tts.speechId, targetLength=req["targetLength"])

    # Force the mix to the same length the speech was condensed to, with the
    # speech starting immediately and no fades.
    timeline_properties = {
        "forceLength": req["targetLength"],
        "speechStart": 0,
        "fadeIn": 0,
        "fadeOut": 0,
    }

    print("Applying auto mixing and mastering")
    mix = audiostack.Production.Mix.create(
        speechItem=tts,
        exportSettings={"ttsTrack": True},
        masteringPreset="",
        timelineProperties=timeline_properties,
    )

    print("Preparing for download...")
    encoder = audiostack.Delivery.Encoder.encode_mix(
        productionItem=mix,
        preset="custom",
        sampleRate=44100,
        bitDepth=16,
        public=False,
        format="wav",
        channels=2,
        loudnessPreset="podcast",
    )

    encoder.download(fileName=req["name"])
    # Return the encoder instead of printing it here: the caller prints the
    # return value, which used to be None.
    return encoder

if __name__ == "__main__":
    # One entry per voiceover to render; each entry is handed to
    # create_voiceover() unchanged.
    script_content = [
        dict(
            name="parrots_voiceover",
            text="Parrots are highly intelligent and social birds known for their vibrant plumage and remarkable ability to mimic sounds, including human speech. These colorful avian companions are found in tropical regions around the world and are known for their playful and affectionate nature, often forming strong bonds with their human caregivers.  Parrots and generative audio both exhibit remarkable abilities for mimicry and creativity, with parrots mimicking sounds and voices, and generative audio systems producing compelling sound through imitation and adaptation. At AudioStack, we're big fans of parrots and take inspiration from them in everything from our voice cloning capabilities to our branding. Find out more at www.audiostack.ai",
            voice="cosmo",
            speed=1.00,
            targetLength=41,
        ),
    ]

    # Render each request in turn and print whatever create_voiceover returns.
    for req in script_content:
        res = create_voiceover(req)
        print(res)


def combine_audio(video, audio):
    """Replace a video's audio track with the given audio file using ffmpeg.

    The video stream is copied without re-encoding, the audio stream is taken
    from ``audio``, and the output stops at the shorter of the two inputs.
    The result is written to ``output_example.mp4``, overwriting any existing
    file of that name.

    Args:
        video: Path to the input video file.
        audio: Path to the audio file to attach. If falsy, nothing is done.

    Returns:
        None.
    """
    # Imported locally so this function is self-contained.
    import subprocess

    if not audio:
        return

    # Printed here rather than at module level so the message appears when
    # the combining actually happens, not when the file is loaded.
    print("Combining video with your voiceover...")

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        "ffmpeg",
        "-i", video,
        "-i", audio,
        "-c:v", "copy",
        "-map", "0:v",
        "-map", "1:a",
        "-shortest",
        "-y", "output_example.mp4",
    ]

    try:
        result = subprocess.run(command, check=False)
    except FileNotFoundError:
        # Raised when the ffmpeg executable cannot be found.
        print("ffmpeg was not found - make sure it is installed and on your PATH.")
        return

    if result.returncode != 0:
        # Report the failure instead of silently ignoring the exit status.
        print(f"ffmpeg exited with status {result.returncode}; output_example.mp4 may be missing or incomplete.")

def create_video():
    """Attach parrots_voiceover.wav to parrot.mp4, printing any error raised.

    Convenience wrapper around ``combine_audio`` with this tutorial's file
    names. Errors are printed rather than propagated.
    """
    try:
        combine_audio("parrot.mp4", "parrots_voiceover.wav")
    except Exception as e:
        # Deliberately broad: report the problem and carry on.
        print(e)

if __name__ == "__main__":
    # File names used throughout this example: the source video and the
    # voiceover audio to attach to it.
    source_video = "parrot.mp4"
    voiceover_audio = "parrots_voiceover.wav"
    combine_audio(source_video, voiceover_audio)

📘

What is Target Length?

This example uses the AudioStack API's smart duration reduction feature to "condense" your script to fit a set duration (see the audiostack.Speech.TTS.reduce call in create_voiceover). By specifying the desired length in seconds, you can control the duration of your voice-over to ensure it fits perfectly with your video content. Find out more.

Check out the resulting video below:

Add a soundtrack in the background

If you want to take your voice-over content further, you can add a sound template to bring your audio to life.

📘

What are Sound Templates?

Sound templates are short, curated sound designs, including sound effects and background music. You can find out more about AudioStack's sound templates here.

Check out this updated code example (and feel free to try it yourself). As always, remember to add your API key in line 5.

In the audiostack.Production.Mix.create call, you can see that there is now a sound template, called sound_affects, being applied in the mix.

import audiostack
import os

audiostack.api_base = "https://v2.api.audio"  # NOTE(review): the first example omits this setting - confirm whether it is required
audiostack.api_key = "APIKEY"  # placeholder: replace with your own AudioStack API key

def create_voiceover(req):
    """Generate a voiceover with a sound template and download the encoded audio.

    Runs the whole pipeline in order: create a script from the request text,
    synthesize speech, condense the speech to the target length, mix it with
    the "sound_affects" sound template, encode the mix as WAV, and download
    the result.

    Args:
        req: Mapping with the keys "name" (file name passed to the download
            call), "text" (script body), "voice", "speed" and "targetLength"
            (passed to the speech reduction call and reused as the mix's
            ``forceLength``).

    Returns:
        The item returned by ``Encoder.encode_mix``, after its ``download``
        method has been called, so the caller can print or inspect it.
    """
    # The request text is wrapped in a single "main" section of script markup.
    script_text = f"""
    <as:section name="main" soundsegment="main"> 
    {req["text"]}
    </as:section>"""

    print("Generating your script...")
    script = audiostack.Content.Script.create(scriptText=script_text, scriptName="test")

    print("Synthesizing speech...")
    tts = audiostack.Speech.TTS.create(scriptItem=script, voice=req["voice"], speed=req["speed"])

    print("Condensing speech to match target duration...")
    tts = audiostack.Speech.TTS.reduce(speechId=tts.speechId, targetLength=req["targetLength"])

    # Force the mix to the same length the speech was condensed to, with the
    # speech starting immediately and no fades.
    timeline_properties = {
        "forceLength": req["targetLength"],
        "speechStart": 0,
        "fadeIn": 0,
        "fadeOut": 0,
    }

    print("Combining sounds and applying auto mixing and mastering...")
    mix = audiostack.Production.Mix.create(
        speechItem=tts,
        soundTemplate="sound_affects",
        exportSettings={"ttsTrack": True},
        masteringPreset="balanced",
        timelineProperties=timeline_properties,
    )

    print("Preparing for download...")
    encoder = audiostack.Delivery.Encoder.encode_mix(
        productionItem=mix,
        preset="custom",
        sampleRate=44100,
        bitDepth=16,
        public=False,
        format="wav",
        channels=2,
        loudnessPreset="podcast",
    )

    encoder.download(fileName=req["name"])
    # Return the encoder instead of printing it here: the caller prints the
    # return value, which used to be None.
    return encoder

if __name__ == "__main__":
    # One entry per voiceover to render; each entry is handed to
    # create_voiceover() unchanged.
    script_content = [
        dict(
            name="parrots_voiceover",
            text="Parrots are highly intelligent and social birds known for their vibrant plumage and remarkable ability to mimic sounds, including human speech. These colorful avian companions are found in tropical regions around the world and are known for their playful and affectionate nature, often forming strong bonds with their human caregivers.  Parrots and generative audio both exhibit remarkable abilities for mimicry and creativity, with parrots mimicking sounds and voices, and generative audio systems producing compelling sound through imitation and adaptation. At AudioStack, we're big fans of parrots and take inspiration from them in everything from our voice cloning capabilities to our branding. Find out more at www.audiostack.ai",
            voice="cosmo",
            speed=1.00,
            targetLength=41,
        ),
    ]

    # Render each request in turn and print whatever create_voiceover returns.
    for req in script_content:
        res = create_voiceover(req)
        print(res)


def combine_audio(video, audio):
    """Replace a video's audio track with the given audio file using ffmpeg.

    The video stream is copied without re-encoding, the audio stream is taken
    from ``audio``, and the output stops at the shorter of the two inputs.
    The result is written to ``output_example.mp4``, overwriting any existing
    file of that name.

    Args:
        video: Path to the input video file.
        audio: Path to the audio file to attach. If falsy, nothing is done.

    Returns:
        None.
    """
    # Imported locally so this function is self-contained.
    import subprocess

    if not audio:
        return

    # Printed here rather than at module level so the message appears when
    # the combining actually happens, not when the file is loaded.
    print("Combining video with your voiceover...")

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        "ffmpeg",
        "-i", video,
        "-i", audio,
        "-c:v", "copy",
        "-map", "0:v",
        "-map", "1:a",
        "-shortest",
        "-y", "output_example.mp4",
    ]

    try:
        result = subprocess.run(command, check=False)
    except FileNotFoundError:
        # Raised when the ffmpeg executable cannot be found.
        print("ffmpeg was not found - make sure it is installed and on your PATH.")
        return

    if result.returncode != 0:
        # Report the failure instead of silently ignoring the exit status.
        print(f"ffmpeg exited with status {result.returncode}; output_example.mp4 may be missing or incomplete.")

def create_video():
    """Attach parrots_voiceover.wav to parrot.mp4, printing any error raised.

    Convenience wrapper around ``combine_audio`` with this tutorial's file
    names. Errors are printed rather than propagated.
    """
    try:
        combine_audio("parrot.mp4", "parrots_voiceover.wav")
    except Exception as e:
        # Deliberately broad: report the problem and carry on.
        print(e)

if __name__ == "__main__":
    # File names used throughout this example: the source video and the
    # voiceover audio to attach to it.
    source_video = "parrot.mp4"
    voiceover_audio = "parrots_voiceover.wav"
    combine_audio(source_video, voiceover_audio)


What's Next

There are so many different ways to create amazing video voice-overs with AudioStack. Try creating voice-overs in different languages, or changing the voices and sound templates in your code. You can also create custom sound design templates and control timing with Advanced Timing Parameters by checking out the detailed guides below: