
Audio & Video to Text
Pricing
Pay per event
Go to Store

Audio & Video to Text
Transcribes video and audio files into plain text and subtitle formats (TXT, SRT, VTT, TSV, JSON) using OpenAI's Whisper model. Supports preloaded tiny, base, and small models.
0.0 (0)
Pricing
Pay per event
4
Total users
23
Monthly users
18
Runs succeeded
96%
Last modified
5 days ago
.actor/README.md
---

## 🎬 Video and Audio to Text Transcription

### 🧠 Overview

This script is designed for the **Apify** platform and uses **OpenAI Whisper** to transcribe audio or video files (e.g., MP4 files reachable through a direct URL) into text and other formats (SRT, VTT, etc.). YouTube links are not supported directly; download the video first and provide a direct file URL.

---
## 📥 Input

### Parameters

- **model**: *(string)* — Whisper model to use. Available options:
  - `tiny` ✅ *(pre-installed)*
  - `base` ✅ *(pre-installed)*
  - `small` ✅ *(pre-installed)*
  - `medium` *(requires download)*
  - `large` *(requires download)*
  - `turbo` *(requires download)*

> ✅ **Note**: Models `tiny`, `base`, and `small` are already downloaded in the Docker image for faster and offline-ready processing.

- **source_url**: *(string)* — Direct URL to the video/audio file (e.g., an MP4 file hosted online).
  ⚠️ *YouTube links are not supported directly. You must download the video first.*
### Example Input

```json
{
  "model": "tiny",
  "source_url": "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4"
}
```

---
## 📤 Output

The output is a JSON array with one object, which includes multiple transcription formats:

* `json`: Full Whisper output with segments, tokens, and metadata.
* `srt`: SubRip subtitle format.
* `tsv`: Tab-separated values (start, end, text).
* `txt`: Plain text transcription.
* `vtt`: WebVTT subtitle format.
### Example Output (excerpt)

```json
[
  {
    "json": "{ ... Whisper segment data ... }",
    "srt": "1\n00:00:00,000 --> 00:00:01,120\nWhat's your favorite drink?\n...",
    "tsv": "start\tend\ttext\n0\t1120\tWhat's your favorite drink?\n...",
    "txt": "What's your favorite drink?\nMy favorite drink is apple juice...\n",
    "vtt": "WEBVTT\n\n00:00.000 --> 00:01.120\nWhat's your favorite drink?\n..."
  }
]
```

---
## 🛠️ How to Use

1. Go to your **Apify** dashboard and create a new actor or task.
2. Paste this script into the actor’s source.
3. Provide the input in the required JSON format (see above).
4. Run the actor. It will download the media file, process it using Whisper, and return the transcription in multiple formats.

---
## ⚠️ Disclaimer

This script is provided **"as is"**, without warranties of any kind. Use it at your own risk.
Ensure compliance with:

* YouTube’s Terms of Service (if downloading/transcribing from YouTube).
* Local and international copyright laws.

---
Let me know if you’d like the actual Apify actor code or instructions on downloading YouTube videos as `.mp4` files to use with this.
.actor/actor.json
{ "actorSpecification": 1, "name": "video-to-text", "title": "Video to Text", "description": "Transcribes video or audio files into text and subtitle formats using Whisper (tiny, base, small models pre-installed).", "version": "0.1.0", "meta": { "templateId": "ts-start-bun" }, "readme": "./README.md", "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "json", "srt", "tsv", "txt", "vtt" ] }, "display": { "component": "table", "properties": { "json": { "label": "JSON", "format": "text" }, "srt": { "label": "SRT", "format": "text" }, "tsv": { "label": "TSV", "format": "text" }, "txt": { "label": "Plain Text", "format": "text" }, "vtt": { "label": "VTT", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Video Transcription", "type": "object", "schemaVersion": 1, "properties": { "source_url": { "title": "Source URL", "type": "string", "description": "Direct URL to the video or audio file (e.g., .mp4, .wav). YouTube links are not supported.", "editor": "textfield", "prefill": "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4" }, "model": { "title": "Whisper Model", "type": "string", "description": "Whisper model to use: 'tiny', 'base', or 'small' (pre-installed). Other models require download.", "editor": "textfield", "prefill": "tiny" } }, "required": ["source_url", "model"]}
store/dataset.go
package store
import ( "bytes" "encoding/json" "fmt" "github.com/davesavic/clink" "io")
// GetKVDatasetEndpoint returns the items URL of the dataset identified by id,
// produced by substituting id into the KVDatasetURL format string.
func GetKVDatasetEndpoint(id string) string {
	endpoint := fmt.Sprintf(KVDatasetURL, id)
	return endpoint
}
func KVDatasetNew(id string) KVDataset { client := clink.NewClient() client.Headers["Content-Type"] = "application/json" return KVDataset{ID: id, Client: client}}
// KVDatasetDefault returns a KVDataset for the dataset whose ID is currently
// held in KVDatabaseDefaultID.
func KVDatasetDefault() KVDataset {
	defaultID := KVDatabaseDefaultID
	return KVDatasetNew(defaultID)
}
func (kv KVDataset) Put(payload any) error { url := GetKVDatasetEndpoint(kv.ID) kv.Client.Headers["Authorization"] = fmt.Sprintf("Bearer %s", Token) _, err := kv.Client.Post(url, KVDatasetRequestFrom(payload)) if err != nil { xlog.Error("failed to set dataset") } return err}
// KVDatasetRequestFrom JSON-encodes v and returns a reader over the encoded
// bytes, suitable as a request body. The marshal error is discarded, so a
// value that cannot be encoded yields an empty reader.
func KVDatasetRequestFrom(v any) io.Reader {
	encoded, _ := json.Marshal(v)
	var body io.Reader = bytes.NewReader(encoded)
	return body
}
store/file.go
package store
import ( "bytes" "encoding/json" "fmt" "io" "log/slog" "net/http" "os" "strings"
"github.com/davesavic/clink")
var ( KVStoreDefaultID string KVDatabaseDefaultID string KVStoreURL = "https://api.apify.com/v2/key-value-stores/%s/records/%s?token=%s" KVDatasetURL = "https://api.apify.com/v2/datasets/%s/items" Token string xlog = slog.New(slog.NewTextHandler(os.Stdout, nil))
Dataset KVDataset Store KVStore)
type ( KVStoreValue map[string]any KVStore struct { ID string Client *clink.Client } KVDataset struct { ID string Client *clink.Client })
func init() { Token = os.Getenv("APIFY_TOKEN") KVStoreDefaultID = os.Getenv("APIFY_DEFAULT_KEY_VALUE_STORE_ID") KVDatabaseDefaultID = os.Getenv("APIFY_DEFAULT_DATASET_ID") if strings.EqualFold(Token, "") || strings.EqualFold(KVStoreDefaultID, "") { xlog.Error("token or default kv store missing") }
Dataset = KVDatasetDefault() Store = KVStoreDefault()}
// GetKVStoreEndpoint returns the record URL for key in the key-value store
// id, produced by substituting id, key, and the package Token into the
// KVStoreURL format string. Neither id nor key is escaped.
func GetKVStoreEndpoint(id string, key string) string {
	endpoint := fmt.Sprintf(KVStoreURL, id, key, Token)
	return endpoint
}
func KVStoreNew(id string) KVStore { client := clink.NewClient() client.Headers["Content-Type"] = "application/json" return KVStore{ID: id, Client: client}}
// KVStoreDefault returns a KVStore for the key-value store whose ID is
// currently held in KVStoreDefaultID.
func KVStoreDefault() KVStore {
	defaultID := KVStoreDefaultID
	return KVStoreNew(defaultID)
}
func (kv KVStore) Get(key string) (KVStoreValue, error) { url := GetKVStoreEndpoint(kv.ID, key) resp, err := kv.Client.Get(url) if err != nil { xlog.Error("failed to get value", "key", key, "error", err) return nil, err } return KVStoreValueFromResponse(resp), nil}
func (kv KVStore) Put(key string, payload any) error { url := GetKVStoreEndpoint(kv.ID, key) _, err := kv.Client.Put(url, KVStoreRequestFrom(payload)) if err != nil { xlog.Error("failed to set value", "key", key, "error", err) } return err}
func (kv KVStore) Delete(key string) error { url := GetKVStoreEndpoint(kv.ID, key) _, err := kv.Client.Delete(url) if err != nil { xlog.Error("failed to delete key", "key", key, "error", err) } return err}
// KVStoreRequestFrom JSON-encodes v and returns a reader over the encoded
// bytes, suitable as a request body. The marshal error is discarded, so a
// value that cannot be encoded yields an empty reader.
func KVStoreRequestFrom(v any) io.Reader {
	encoded, _ := json.Marshal(v)
	var body io.Reader = bytes.NewReader(encoded)
	return body
}
func KVStoreValueFromResponse(resp *http.Response) KVStoreValue { var value KVStoreValue if err := clink.ResponseToJson(resp, &value); err != nil { xlog.Error("failed to unmarshal response", "error", err) } return value}
Dockerfile
FROM theartist497/whisper-go-app:latest

# Upgrade pip and install whisper-ctranslate2
RUN pip install --upgrade pip \
    && pip install -U whisper-ctranslate2

# Set working directory inside container
WORKDIR /app

# Copy your app files
COPY . .

# Build Go binary
RUN go build -o main .

# Run the app
CMD ["./main"]
go.mod
module audio_to_text

go 1.23.0

// clink is imported directly by the store package, so it is a direct
// requirement.
require github.com/davesavic/clink v1.0.2

require golang.org/x/time v0.5.0 // indirect
go.sum
github.com/davesavic/clink v1.0.2 h1:zpbyoWtx9ztI6nO4wHS72Du6aS3xvtFvdhwn8KOeyh8=github.com/davesavic/clink v1.0.2/go.mod h1:t/riPX5tWlWKYcP7lvUpfsxFqC6WNnOOO+xvgei+uhg=golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
main.go
package main
import ( "audio_to_text/store" "bytes" "fmt" "io" "log" "log/slog" "net/http" "os" "os/exec" "strings" "sync")
// xlog is the structured logger of the main package; it writes text records
// to stdout using the default handler options.
var xlog = slog.New(slog.NewTextHandler(os.Stdout, nil))
func main() { input, err := store.Store.Get("INPUT") if err != nil { xlog.Error("failed to get input from kv store", "error", err) return }
source := input["source_url"].(string) if strings.EqualFold(source, "") { xlog.Error("not found 'source_url'", source) }
model := input["model"].(string) if strings.EqualFold(source, "") { xlog.Error("not found 'source_url'", source) }
transcription(source, model)}
func Test() { url := "https://raw.githubusercontent.com/donjuanMime/audio_to_text/main/video.mp4" model := "tiny" transcription(url, model)}
func transcription(sourceUrl, model string) { downloadFile(sourceUrl) filename := "video.mp4"
command := fmt.Sprintf(`whisper-ctranslate2 %s --batched true --batch_size 4 --max_words_per_line 8 --word_timestamps true --vad_filter true --model %s`, filename, model)
runCmdWithProgress(command)}
func runCmdWithProgress(command string) bool { cmd := exec.Command("bash", "-c", command)
var stdoutBuf, stderrBuf bytes.Buffer stdoutIn, _ := cmd.StdoutPipe() stderrIn, _ := cmd.StderrPipe()
var errStdout, errStderr error stdout := io.MultiWriter(os.Stdout, &stdoutBuf) stderr := io.MultiWriter(os.Stderr, &stderrBuf) if err := cmd.Start(); err != nil { xlog.Error("cmd.Start() failed with ", err) return false }
var wg sync.WaitGroup wg.Add(1)
go func() { _, errStdout = io.Copy(stdout, stdoutIn)
log.Println(stdoutIn) wg.Done() }()
_, errStderr = io.Copy(stderr, stderrIn) wg.Wait()
if err := cmd.Wait(); err != nil { xlog.Error("cmd.Run() failed with ", err) } if errStdout != nil || errStderr != nil { xlog.Error("failed to capture stdout or stderr", errStderr, errStdout) return false }
sendResult()
return true}
func sendResult() { if err := store.Dataset.Put(map[string]interface{}{ "json": string(mustReadFile("video.json")), "srt": string(mustReadFile("video.srt")), "tsv": string(mustReadFile("video.tsv")), "txt": string(mustReadFile("video.txt")), "vtt": string(mustReadFile("video.vtt")), }); err != nil { log.Fatal(err) }}
// mustReadFile returns the contents of filename, or nil when the file cannot
// be read. A read failure is logged so a missing output file is visible in
// the run log instead of silently becoming an empty field.
func mustReadFile(filename string) []byte {
	data, err := os.ReadFile(filename)
	if err != nil {
		log.Printf("failed to read %q: %v", filename, err)
		return nil
	}
	return data
}
// downloadFile fetches fileURL with an HTTP GET and saves the response body
// as video.mp4 in the working directory.
//
// Any failure terminates the process via log.Fatalf. The output file is only
// created after the server has answered 200 OK, and a partially written file
// is removed if the copy fails, so a failed download leaves no empty or
// truncated video.mp4 behind.
func downloadFile(fileURL string) {
	outputPath := "video.mp4"

	// Get the data
	resp, err := http.Get(fileURL)
	if err != nil {
		log.Fatalf("Failed to download file: %v", err)
	}
	defer resp.Body.Close()

	// Check server response
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("Bad status: %s", resp.Status)
	}

	// Create the file
	out, err := os.Create(outputPath)
	if err != nil {
		log.Fatalf("Failed to create file: %v", err)
	}

	// Write the body to file. Cleanup is explicit because log.Fatalf exits
	// without running deferred calls.
	if _, err := io.Copy(out, resp.Body); err != nil {
		out.Close()
		os.Remove(outputPath)
		log.Fatalf("Failed to save file: %v", err)
	}
	// A failed Close can mean buffered data never reached the disk.
	if err := out.Close(); err != nil {
		os.Remove(outputPath)
		log.Fatalf("Failed to save file: %v", err)
	}

	log.Println("File downloaded successfully:", outputPath)
}