Audio & Video to Text avatar
Audio & Video to Text

Pricing

Pay per event

Go to Store
Audio & Video to Text

Audio & Video to Text

Developed by

Donjuan

Donjuan

Maintained by Community

Transcribes video and audio files into plain text and subtitle formats (TXT, SRT, VTT, TSV, JSON) using OpenAI's Whisper model. Supports preloaded tiny, base, and small models.

0.0 (0)

Pricing

Pay per event

4

Total users

23

Monthly users

18

Runs succeeded

96%

Last modified

5 days ago

.actor/README.md

1---
2
3## 🎬 Video and Audio to Text Transcription
4
5### 🧠 Overview
6
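7This script is designed for the **Apify** platform and uses **OpenAI Whisper** to transcribe audio or video files available at a direct URL (e.g., a hosted MP4 file) into plain text and subtitle formats (SRT, VTT, etc.). YouTube links are not supported directly; download the video first.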
8
9---
10
11## 📥 Input
12
13### Parameters
14
15- **model**: *(string)* — Whisper model to use. Available options:
16 - `tiny` *(pre-installed)*
17 - `base` *(pre-installed)*
18 - `small` *(pre-installed)*
19 - `medium` *(requires download)*
20 - `large` *(requires download)*
21 - `turbo` *(requires download)*
22
23> **Note**: The `tiny`, `base`, and `small` models are already downloaded in the Docker image for faster, offline-ready processing.
24- **source_url**: *(string)* — Direct URL to the video/audio file (e.g., an MP4 file hosted online).
25 ⚠️ *YouTube links are not supported directly. You must download the video first.*
26
27### Example Input
28
29```json
30{
31 "model": "tiny",
32 "source_url": "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4"
33}
34```
35
36---
37
38## 📤 Output
39
40The output is a JSON array with one object, which includes multiple transcription formats:
41
42* `json`: Full Whisper output with segments, tokens, and metadata.
43* `srt`: SubRip subtitle format.
44* `tsv`: Tab-separated values (start, end, text).
45* `txt`: Plain text transcription.
46* `vtt`: WebVTT subtitle format.
47
48### Example Output (excerpt)
49
50```json
51[
52 {
53 "json": "{ ... Whisper segment data ... }",
54 "srt": "1\n00:00:00,000 --> 00:00:01,120\nWhat's your favorite drink?\n...",
55 "tsv": "start\tend\ttext\n0\t1120\tWhat's your favorite drink?\n...",
56 "txt": "What's your favorite drink?\nMy favorite drink is apple juice...\n",
57 "vtt": "WEBVTT\n\n00:00.000 --> 00:01.120\nWhat's your favorite drink?\n..."
58 }
59]
60```
61
62---
63
64## 🛠️ How to Use
65
661. Go to your **Apify** dashboard and create a new actor or task.
672. Paste this script into the actor’s source.
683. Provide the input in the required JSON format (see above).
694. Run the actor. It will download the media file, process it using Whisper, and return transcription in multiple formats.
70
71---
72
73## ⚠️ Disclaimer
74
75This script is provided **"as is"**, without warranties of any kind. Use it at your own risk.
76Ensure compliance with:
77
78* YouTube’s Terms of Service (if downloading/transcribing from YouTube).
79* Local and international copyright laws.
80
81---
82
83Let me know if you’d like the actual Apify actor code or instructions on downloading YouTube videos as `.mp4` files to use with this.

.actor/actor.json

{
"actorSpecification": 1,
"name": "video-to-text",
"title": "Video to Text",
"description": "Transcribes video or audio files into text and subtitle formats using Whisper (tiny, base, small models pre-installed).",
"version": "0.1.0",
"meta": {
"templateId": "ts-start-bun"
},
"readme": "./README.md",
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"views": {
"overview": {
"title": "Overview",
"transformation": {
"fields": [
"json",
"srt",
"tsv",
"txt",
"vtt"
]
},
"display": {
"component": "table",
"properties": {
"json": {
"label": "JSON",
"format": "text"
},
"srt": {
"label": "SRT",
"format": "text"
},
"tsv": {
"label": "TSV",
"format": "text"
},
"txt": {
"label": "Plain Text",
"format": "text"
},
"vtt": {
"label": "VTT",
"format": "text"
}
}
}
}
}
}
}
}

.actor/input_schema.json

{
"title": "Video Transcription",
"type": "object",
"schemaVersion": 1,
"properties": {
"source_url": {
"title": "Source URL",
"type": "string",
"description": "Direct URL to the video or audio file (e.g., .mp4, .wav). YouTube links are not supported.",
"editor": "textfield",
"prefill": "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4"
},
"model": {
"title": "Whisper Model",
"type": "string",
"description": "Whisper model to use: 'tiny', 'base', or 'small' (pre-installed). Other models require download.",
"editor": "textfield",
"prefill": "tiny"
}
},
"required": ["source_url", "model"]
}

store/dataset.go

package store
import (
"bytes"
"encoding/json"
"fmt"
"github.com/davesavic/clink"
"io"
)
// GetKVDatasetEndpoint returns the dataset items URL for the dataset
// identified by id, produced by filling id into the KVDatasetURL template.
func GetKVDatasetEndpoint(id string) string {
	endpoint := fmt.Sprintf(KVDatasetURL, id)
	return endpoint
}
// KVDatasetNew builds a KVDataset for the dataset id. It is backed by a newly
// created clink client whose "Content-Type" header entry is set to
// "application/json".
func KVDatasetNew(id string) KVDataset {
	httpClient := clink.NewClient()
	httpClient.Headers["Content-Type"] = "application/json"

	dataset := KVDataset{
		ID:     id,
		Client: httpClient,
	}
	return dataset
}
// KVDatasetDefault returns a KVDataset bound to the dataset named by the
// package-level KVDatabaseDefaultID.
func KVDatasetDefault() KVDataset {
	defaultID := KVDatabaseDefaultID
	return KVDatasetNew(defaultID)
}
// Put JSON-encodes payload and POSTs it to the items endpoint of the dataset
// kv.ID.
//
// Before the request, the package-level Token is written into the client's
// header map as a Bearer "Authorization" entry. The response body is drained
// and closed, and any status outside the 2xx range is returned as an error
// rather than being treated as success. Failures are logged through xlog and
// returned to the caller.
func (kv KVDataset) Put(payload any) error {
	url := GetKVDatasetEndpoint(kv.ID)
	kv.Client.Headers["Authorization"] = fmt.Sprintf("Bearer %s", Token)

	resp, err := kv.Client.Post(url, KVDatasetRequestFrom(payload))
	if err != nil {
		xlog.Error("failed to set dataset", "id", kv.ID, "error", err)
		return err
	}
	defer resp.Body.Close()
	// Drain whatever the server sent so the underlying connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		err = fmt.Errorf("dataset %q: unexpected status %s", kv.ID, resp.Status)
		xlog.Error("failed to set dataset", "id", kv.ID, "error", err)
		return err
	}
	return nil
}
// KVDatasetRequestFrom JSON-encodes v and returns a reader over the encoded
// bytes, suitable as a request body.
//
// A marshalling error is deliberately discarded: in that case the returned
// reader is empty.
func KVDatasetRequestFrom(v any) io.Reader {
	encoded, _ := json.Marshal(v) // on failure encoded is nil, giving an empty body
	var body io.Reader = bytes.NewReader(encoded)
	return body
}

store/file.go

package store
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"strings"
"github.com/davesavic/clink"
)
// Package-level configuration and shared handles for the Apify storage API.
var (
	// Token is the API token; init fills it from APIFY_TOKEN.
	Token string
	// KVStoreDefaultID is filled by init from APIFY_DEFAULT_KEY_VALUE_STORE_ID.
	KVStoreDefaultID string
	// KVDatabaseDefaultID is filled by init from APIFY_DEFAULT_DATASET_ID.
	KVDatabaseDefaultID string

	// KVStoreURL is the format template for a key-value store record URL; its
	// verbs are, in order, the store ID, the record key and the token.
	KVStoreURL = "https://api.apify.com/v2/key-value-stores/%s/records/%s?token=%s"
	// KVDatasetURL is the format template for a dataset items URL; its single
	// verb is the dataset ID.
	KVDatasetURL = "https://api.apify.com/v2/datasets/%s/items"

	// xlog is the package logger; it writes text-formatted records to stdout.
	xlog = slog.New(slog.NewTextHandler(os.Stdout, nil))

	// Dataset and Store are the default handles assigned by init.
	Dataset KVDataset
	Store   KVStore
)
// KVStoreValue is a JSON object decoded into a generic map.
type KVStoreValue map[string]any

// KVStore is a handle on one key-value store: its ID plus the HTTP client
// used to reach it.
type KVStore struct {
	ID     string
	Client *clink.Client
}

// KVDataset is a handle on one dataset: its ID plus the HTTP client used to
// reach it.
type KVDataset struct {
	ID     string
	Client *clink.Client
}
// init loads the API token and the default storage IDs from the environment
// and builds the package-level Dataset and Store handles.
//
// Missing settings are only logged, never fatal: the handles are still
// created. The dataset ID is validated as well, because Dataset depends on it
// just as Store depends on the key-value store ID. Values consisting only of
// whitespace count as missing.
func init() {
	Token = os.Getenv("APIFY_TOKEN")
	KVStoreDefaultID = os.Getenv("APIFY_DEFAULT_KEY_VALUE_STORE_ID")
	KVDatabaseDefaultID = os.Getenv("APIFY_DEFAULT_DATASET_ID")

	if strings.TrimSpace(Token) == "" || strings.TrimSpace(KVStoreDefaultID) == "" {
		xlog.Error("token or default kv store missing")
	}
	if strings.TrimSpace(KVDatabaseDefaultID) == "" {
		xlog.Error("default dataset id missing")
	}

	Dataset = KVDatasetDefault()
	Store = KVStoreDefault()
}
// GetKVStoreEndpoint returns the record URL for key in the key-value store id,
// produced by filling id, key and the package-level Token into the KVStoreURL
// template. Note that the token therefore becomes part of the URL.
func GetKVStoreEndpoint(id string, key string) string {
	endpoint := fmt.Sprintf(KVStoreURL, id, key, Token)
	return endpoint
}
// KVStoreNew builds a KVStore for the key-value store id. It is backed by a
// newly created clink client whose "Content-Type" header entry is set to
// "application/json".
func KVStoreNew(id string) KVStore {
	httpClient := clink.NewClient()
	httpClient.Headers["Content-Type"] = "application/json"

	kvStore := KVStore{
		ID:     id,
		Client: httpClient,
	}
	return kvStore
}
// KVStoreDefault returns a KVStore bound to the store named by the
// package-level KVStoreDefaultID.
func KVStoreDefault() KVStore {
	defaultID := KVStoreDefaultID
	return KVStoreNew(defaultID)
}
// Get fetches the record stored under key and returns it decoded as a
// KVStoreValue.
//
// A transport error or a response status outside the 2xx range is logged and
// returned as an error; in the latter case the body is closed without being
// decoded, so an API error document is never mistaken for the stored value.
// Decoding itself is delegated to KVStoreValueFromResponse.
func (kv KVStore) Get(key string) (KVStoreValue, error) {
	url := GetKVStoreEndpoint(kv.ID, key)

	resp, err := kv.Client.Get(url)
	if err != nil {
		xlog.Error("failed to get value", "key", key, "error", err)
		return nil, err
	}
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		resp.Body.Close()
		err = fmt.Errorf("get record %q: unexpected status %s", key, resp.Status)
		xlog.Error("failed to get value", "key", key, "error", err)
		return nil, err
	}
	return KVStoreValueFromResponse(resp), nil
}
// Put JSON-encodes payload and stores it under key with an HTTP PUT.
//
// The response body is drained and closed, and any status outside the 2xx
// range is returned as an error rather than being treated as success.
// Failures are logged through xlog and returned to the caller.
func (kv KVStore) Put(key string, payload any) error {
	url := GetKVStoreEndpoint(kv.ID, key)

	resp, err := kv.Client.Put(url, KVStoreRequestFrom(payload))
	if err != nil {
		xlog.Error("failed to set value", "key", key, "error", err)
		return err
	}
	defer resp.Body.Close()
	// Drain whatever the server sent so the underlying connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		err = fmt.Errorf("put record %q: unexpected status %s", key, resp.Status)
		xlog.Error("failed to set value", "key", key, "error", err)
		return err
	}
	return nil
}
// Delete removes the record stored under key with an HTTP DELETE.
//
// The response body is drained and closed, and any status outside the 2xx
// range is returned as an error rather than being treated as success.
// Failures are logged through xlog and returned to the caller.
func (kv KVStore) Delete(key string) error {
	url := GetKVStoreEndpoint(kv.ID, key)

	resp, err := kv.Client.Delete(url)
	if err != nil {
		xlog.Error("failed to delete key", "key", key, "error", err)
		return err
	}
	defer resp.Body.Close()
	// Drain whatever the server sent so the underlying connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		err = fmt.Errorf("delete record %q: unexpected status %s", key, resp.Status)
		xlog.Error("failed to delete key", "key", key, "error", err)
		return err
	}
	return nil
}
// KVStoreRequestFrom JSON-encodes v and returns a reader over the encoded
// bytes, suitable as a request body.
//
// A marshalling error is deliberately discarded: in that case the returned
// reader is empty.
func KVStoreRequestFrom(v any) io.Reader {
	encoded, _ := json.Marshal(v) // on failure encoded is nil, giving an empty body
	var body io.Reader = bytes.NewReader(encoded)
	return body
}
// KVStoreValueFromResponse decodes resp into a KVStoreValue using
// clink.ResponseToJson.
//
// A decoding failure is logged, not returned; the value is handed back in
// whatever state the decoder left it.
func KVStoreValueFromResponse(resp *http.Response) KVStoreValue {
	var decoded KVStoreValue
	err := clink.ResponseToJson(resp, &decoded)
	if err != nil {
		xlog.Error("failed to unmarshal response", "error", err)
	}
	return decoded
}

Dockerfile

# Base image. NOTE(review): presumably ships Python/pip and the Go toolchain,
# since both are used below — confirm. The "latest" tag is not pinned.
FROM theartist497/whisper-go-app:latest
# Upgrade pip, then install (or upgrade) whisper-ctranslate2, the command-line
# tool that main.go runs to produce the transcription.
RUN pip install --upgrade pip \
&& pip install -U whisper-ctranslate2
# Make /app the working directory for the remaining instructions.
WORKDIR /app
# Copy the whole build context into /app.
COPY . .
# Compile the Go package in /app into the ./main binary.
RUN go build -o main .
# Default container command: run the compiled binary.
CMD ["./main"]

go.mod

module audio_to_text
go 1.23.0
require github.com/davesavic/clink v1.0.2

require golang.org/x/time v0.5.0 // indirect

go.sum

github.com/davesavic/clink v1.0.2 h1:zpbyoWtx9ztI6nO4wHS72Du6aS3xvtFvdhwn8KOeyh8=
github.com/davesavic/clink v1.0.2/go.mod h1:t/riPX5tWlWKYcP7lvUpfsxFqC6WNnOOO+xvgei+uhg=
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=

main.go

package main
import (
"audio_to_text/store"
"bytes"
"fmt"
"io"
"log"
"log/slog"
"net/http"
"os"
"os/exec"
"strings"
"sync"
)
// xlog is the program's structured logger; it writes text-formatted records
// to stdout using the handler's default options.
var xlog = slog.New(slog.NewTextHandler(os.Stdout, nil))
// main reads the "INPUT" record from the default key-value store, validates
// its "source_url" and "model" fields and starts the transcription.
//
// Any problem with the input (record unavailable, field missing, not a
// string, or blank) is logged and ends the run without starting the
// transcription.
func main() {
	input, err := store.Store.Get("INPUT")
	if err != nil {
		xlog.Error("failed to get input from kv store", "error", err)
		return
	}

	source, ok := inputString(input, "source_url")
	if !ok {
		xlog.Error("not found 'source_url'")
		return
	}
	model, ok := inputString(input, "model")
	if !ok {
		xlog.Error("not found 'model'")
		return
	}

	transcription(source, model)
}

// inputString returns the string stored under key in input with surrounding
// whitespace removed. The boolean is false when the key is absent, the value
// is not a string, or the string is blank.
func inputString(input map[string]any, key string) (string, bool) {
	raw, _ := input[key].(string) // the comma-ok form avoids a panic on bad input
	value := strings.TrimSpace(raw)
	return value, value != ""
}
// Test runs the transcription pipeline on the sample video from the project
// repository with the "tiny" model.
func Test() {
	const (
		sampleURL   = "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4"
		sampleModel = "tiny"
	)
	transcription(sampleURL, sampleModel)
}
// transcription downloads the media at sourceUrl to video.mp4 and runs
// whisper-ctranslate2 on it with the given model.
//
// The command line is executed through a shell by runCmdWithProgress, and
// model comes from the actor input, so every interpolated value is
// single-quoted first: shell metacharacters in the input are passed to the
// tool as literal text instead of being interpreted.
func transcription(sourceUrl, model string) {
	downloadFile(sourceUrl)

	filename := "video.mp4"
	command := fmt.Sprintf(
		`whisper-ctranslate2 %s --batched true --batch_size 4 --max_words_per_line 8 --word_timestamps true --vad_filter true --model %s`,
		shellQuote(filename),
		shellQuote(model),
	)
	runCmdWithProgress(command)
}

// shellQuote wraps s in single quotes for use as one shell word; an embedded
// single quote is emitted as '\'' (close quote, escaped quote, reopen quote).
func shellQuote(s string) string {
	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
}
// runCmdWithProgress runs command through "bash -c", mirroring the child's
// stdout and stderr to this process's stdout and stderr while it runs, and
// uploads the results with sendResult once the command has succeeded.
//
// It returns true only when the command started, both output streams were
// copied without error and the command exited successfully. On any failure it
// logs the cause and returns false without uploading anything. The tail of
// the child's stderr is attached to the log record when the command fails.
func runCmdWithProgress(command string) bool {
	cmd := exec.Command("bash", "-c", command)

	stdoutIn, err := cmd.StdoutPipe()
	if err != nil {
		xlog.Error("failed to open stdout pipe", "error", err)
		return false
	}
	stderrIn, err := cmd.StderrPipe()
	if err != nil {
		xlog.Error("failed to open stderr pipe", "error", err)
		return false
	}

	// stderr is also kept in memory so a failure can be reported with the
	// tool's own diagnostics.
	var stderrBuf bytes.Buffer
	stderr := io.MultiWriter(os.Stderr, &stderrBuf)

	if err := cmd.Start(); err != nil {
		xlog.Error("cmd.Start() failed", "error", err)
		return false
	}

	var (
		wg                   sync.WaitGroup
		errStdout, errStderr error
	)
	wg.Add(1)
	go func() {
		defer wg.Done()
		_, errStdout = io.Copy(os.Stdout, stdoutIn)
	}()
	_, errStderr = io.Copy(stderr, stderrIn)
	// Both pipes must be fully read before Wait is called.
	wg.Wait()

	if err := cmd.Wait(); err != nil {
		const maxTail = 2048
		tail := stderrBuf.Bytes()
		if len(tail) > maxTail {
			tail = tail[len(tail)-maxTail:]
		}
		xlog.Error("command failed", "error", err, "stderr", string(tail))
		return false
	}
	if errStdout != nil || errStderr != nil {
		xlog.Error("failed to capture stdout or stderr", "stdout_error", errStdout, "stderr_error", errStderr)
		return false
	}

	sendResult()
	return true
}
func sendResult() {
if err := store.Dataset.Put(map[string]interface{}{
"json": string(mustReadFile("video.json")),
"srt": string(mustReadFile("video.srt")),
"tsv": string(mustReadFile("video.tsv")),
"txt": string(mustReadFile("video.txt")),
"vtt": string(mustReadFile("video.vtt")),
}); err != nil {
log.Fatal(err)
}
}
// mustReadFile returns the contents of filename, or nil when the file cannot
// be read.
//
// Despite its name it does not panic: a read failure is logged, so a missing
// output file is visible in the run log, and the caller receives nil.
func mustReadFile(filename string) []byte {
	data, err := os.ReadFile(filename)
	if err != nil {
		log.Printf("failed to read %q: %v", filename, err)
		return nil
	}
	return data
}
// downloadFile fetches fileURL with an HTTP GET and saves the response body
// as video.mp4 in the current directory.
//
// Any failure terminates the process through log.Fatalf. The output file is
// created only after the server has answered 200 OK, so a failed request does
// not truncate an existing video.mp4, and the error from closing the written
// file is checked so an incomplete write is not reported as success.
func downloadFile(fileURL string) {
	outputPath := "video.mp4"

	// Get the data
	resp, err := http.Get(fileURL)
	if err != nil {
		log.Fatalf("Failed to download file: %v", err)
	}
	defer resp.Body.Close()

	// Check server response
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("Bad status: %s", resp.Status)
	}

	// Create the file
	out, err := os.Create(outputPath)
	if err != nil {
		log.Fatalf("Failed to create file: %v", err)
	}

	// Write the body to file
	if _, err = io.Copy(out, resp.Body); err != nil {
		out.Close()
		log.Fatalf("Failed to save file: %v", err)
	}
	if err = out.Close(); err != nil {
		log.Fatalf("Failed to save file: %v", err)
	}

	log.Println("File downloaded successfully:", outputPath)
}