Video Download Link Crawler

Developed by Rodrigo Franco
Maintained by Community

Automatically discover and extract video download links from any website. Crawl through multiple pages, follow custom link patterns, and export results in JSON, CSV, HTML, or XML formats. Perfect for content creators, researchers, and media professionals.

Rating: 0.0 (0 reviews)
Pricing: Pay per event
Total users: 2
Monthly users: 2
Runs succeeded: >99%
Last modified: 2 days ago

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "video-download-crawler",
    "title": "Video Download Link Crawler",
    "description": "Crawls websites with Crawlee and Cheerio to discover and extract video download links.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "../Dockerfile"
}

.actor/input_schema.json

{
    "title": "Video Download Link Crawler Input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrl": {
            "title": "Start URL",
            "type": "string",
            "description": "The URL where crawling will begin",
            "editor": "textfield",
            "pattern": "^https?://.*"
        },
        "linkRegex": {
            "title": "Link Following Regex",
            "type": "string",
            "description": "Regular expression to match URLs to follow",
            "editor": "textfield",
            "default": ".*"
        },
        "videoRegex": {
            "title": "Video Detection Regex",
            "type": "string",
            "description": "Regular expression to identify video download links",
            "editor": "textfield",
            "default": "\\.(mp4|avi|mov|mkv|webm|m4v)$"
        },
        "maxCrawlDepth": {
            "title": "Maximum Crawl Depth",
            "type": "integer",
            "description": "Maximum depth of crawling",
            "default": 3,
            "minimum": 1,
            "maximum": 10
        },
        "maxPages": {
            "title": "Maximum Pages",
            "type": "integer",
            "description": "Maximum number of pages to crawl",
            "default": 100,
            "minimum": 1
        },
        "outputFormat": {
            "title": "Output Format",
            "type": "string",
            "description": "Export format for results",
            "enum": ["JSON", "CSV", "HTML", "XML"],
            "default": "JSON"
        }
    },
    "required": ["startUrl"]
}
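For illustration, an input that conforms to this schema could look like the following; the URL and regular expressions are hypothetical examples, not values shipped with the Actor.

{
    "startUrl": "https://example.com/media",
    "linkRegex": "^https://example\\.com/media/.*",
    "videoRegex": "\\.(mp4|webm)$",
    "maxCrawlDepth": 2,
    "maxPages": 50,
    "outputFormat": "CSV"
}

When running the Actor locally with npm start or the Apify CLI, input like this is usually read from the default key-value store, typically storage/key_value_stores/default/INPUT.json.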

src/main.js

// Import required modules using ES6 syntax
import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';

// Initialize the actor
await Actor.main(async () => {
    // Get input from user
    const input = await Actor.getInput();

    // Validate input
    if (!input || !input.startUrl) {
        throw new Error('Start URL is required');
    }

    // Set up default values
    const {
        startUrl,
        linkRegex = '.*',
        videoRegex = '\\.(mp4|avi|mov|mkv|webm|m4v)$',
        maxCrawlDepth = 3,
        maxPages = 100,
        outputFormat = 'JSON'
    } = input;

    console.log('Starting Video Download Link Crawler...');
    console.log(`Start URL: ${startUrl}`);
    console.log(`Video Regex: ${videoRegex}`);
    console.log(`Max Depth: ${maxCrawlDepth}`);
    console.log(`Max Pages: ${maxPages}`);

    // Create request queue
    const requestQueue = await Actor.openRequestQueue();

    // Add start URL to queue
    await requestQueue.addRequest({
        url: startUrl,
        userData: { depth: 0 }
    });

    // Set up dataset for results
    const dataset = await Actor.openDataset();

    // Configure crawler
    const crawler = new CheerioCrawler({
        requestQueue,
        maxRequestsPerCrawl: maxPages,
        async requestHandler({ request, $ }) {
            const { url } = request;
            const { depth } = request.userData;

            console.log(`Processing: ${url} (depth: ${depth})`);

            // Extract video links from current page
            const videoLinks = await extractVideoLinks($, url, videoRegex);

            console.log(`Found ${videoLinks.length} video links on ${url}`);

            // Save video links to dataset
            for (const videoLink of videoLinks) {
                await dataset.pushData({
                    sourceUrl: url,
                    videoUrl: videoLink.url,
                    title: videoLink.title,
                    fileSize: videoLink.fileSize,
                    format: videoLink.format,
                    foundAt: new Date().toISOString(),
                    depth: depth
                });
            }

            // Find and enqueue new links if within depth limit
            if (depth < maxCrawlDepth) {
                const links = await extractLinks($, url, linkRegex, videoRegex);

                console.log(`Found ${links.length} links to follow from ${url}`);

                for (const link of links) {
                    await requestQueue.addRequest({
                        url: link,
                        userData: { depth: depth + 1 }
                    });
                }
            }
        },
        async failedRequestHandler({ request }) {
            console.error(`Request failed: ${request.url}`);

            // Check if the failed request is actually a video file we should record
            const videoRegexPattern = new RegExp(videoRegex, 'i');
            if (videoRegexPattern.test(request.url)) {
                console.log(`Recording failed request as video link: ${request.url}`);

                // Extract info from the URL
                const urlParts = request.url.split('/');
                const filename = urlParts[urlParts.length - 1];
                const format = getVideoFormat(request.url);

                await dataset.pushData({
                    sourceUrl: request.userData.sourceUrl || 'Unknown',
                    videoUrl: request.url,
                    title: filename.replace(/\.[^/.]+$/, ""), // Remove extension
                    fileSize: null,
                    format: format,
                    foundAt: new Date().toISOString(),
                    depth: request.userData.depth || 0,
                    note: 'Found as direct video link'
                });
            }
        }
    });

    // Run the crawler
    await crawler.run();

    // Export results based on format
    const results = await dataset.getData();
    await exportResults(results.items, outputFormat);

    console.log(`Crawling completed! Found ${results.items.length} video links.`);
});

// Helper function to extract video links
async function extractVideoLinks($, baseUrl, videoRegex) {
    const videoLinks = [];
    const regex = new RegExp(videoRegex, 'i');

    try {
        // Check all links on the page
        $('a[href]').each((index, element) => {
            const href = $(element).attr('href');
            if (!href) return;

            try {
                const absoluteUrl = new URL(href, baseUrl).href;

                if (regex.test(absoluteUrl)) {
                    videoLinks.push({
                        url: absoluteUrl,
                        title: $(element).text().trim() || $(element).attr('title') || 'Unknown',
                        fileSize: null,
                        format: getVideoFormat(absoluteUrl)
                    });
                }
            } catch (urlError) {
                console.warn(`Invalid URL: ${href}`);
            }
        });

        // Check for video elements
        $('video source[src], video[src]').each((index, element) => {
            const src = $(element).attr('src');
            if (!src) return;

            try {
                const absoluteUrl = new URL(src, baseUrl).href;

                if (regex.test(absoluteUrl)) {
                    videoLinks.push({
                        url: absoluteUrl,
                        title: $('video').attr('title') || 'Video',
                        fileSize: null,
                        format: getVideoFormat(absoluteUrl)
                    });
                }
            } catch (urlError) {
                console.warn(`Invalid video URL: ${src}`);
            }
        });

        // Check for direct video links in download buttons or specific patterns
        $('a[href*="download"], a[href*="sample"], a[href*="video"]').each((index, element) => {
            const href = $(element).attr('href');
            if (!href) return;

            try {
                const absoluteUrl = new URL(href, baseUrl).href;

                if (regex.test(absoluteUrl)) {
                    const linkText = $(element).text().trim();
                    const title = linkText || $(element).attr('title') || $(element).attr('alt') || 'Video File';

                    videoLinks.push({
                        url: absoluteUrl,
                        title: title,
                        fileSize: null,
                        format: getVideoFormat(absoluteUrl)
                    });
                }
            } catch (urlError) {
                console.warn(`Invalid video link URL: ${href}`);
            }
        });

        // Look for embedded videos or iframes
        $('iframe[src*="video"], embed[src*="video"]').each((index, element) => {
            const src = $(element).attr('src');
            if (!src) return;

            try {
                const absoluteUrl = new URL(src, baseUrl).href;

                if (regex.test(absoluteUrl)) {
                    videoLinks.push({
                        url: absoluteUrl,
                        title: $(element).attr('title') || 'Embedded Video',
                        fileSize: null,
                        format: getVideoFormat(absoluteUrl)
                    });
                }
            } catch (urlError) {
                console.warn(`Invalid embedded video URL: ${src}`);
            }
        });

    } catch (error) {
        console.error('Error extracting video links:', error);
    }

    // Remove duplicates based on URL
    const uniqueVideos = [];
    const seenUrls = new Set();

    for (const video of videoLinks) {
        if (!seenUrls.has(video.url)) {
            seenUrls.add(video.url);
            uniqueVideos.push(video);
        }
    }

    return uniqueVideos;
}

// Helper function to extract links to follow
async function extractLinks($, baseUrl, linkRegex, videoRegex) {
    const links = [];
    const regex = new RegExp(linkRegex, 'i');
    const videoRegexPattern = new RegExp(videoRegex, 'i');

    try {
        $('a[href]').each((index, element) => {
            const href = $(element).attr('href');
            if (!href) return;

            try {
                const absoluteUrl = new URL(href, baseUrl).href;

                // If it's a video file, don't try to crawl it, but save the reference
                if (videoRegexPattern.test(absoluteUrl)) {
                    console.log(`Found direct video link (not crawling): ${absoluteUrl}`);
                    // We'll handle this in the video extraction function
                    return;
                }

                // Only follow HTML pages and directories
                if (regex.test(absoluteUrl) && absoluteUrl.startsWith('http')) {
                    // Avoid crawling direct file downloads
                    const urlPath = new URL(absoluteUrl).pathname;
                    if (!urlPath.match(/\.(pdf|zip|exe|dmg|pkg|deb|rpm)$/i)) {
                        links.push(absoluteUrl);
                    }
                }
            } catch (urlError) {
                console.warn(`Invalid link URL: ${href}`);
            }
        });
    } catch (error) {
        console.error('Error extracting links:', error);
    }

    return [...new Set(links)]; // Remove duplicates
}

// Helper function to get video format
function getVideoFormat(url) {
    const match = url.match(/\.([^.?]+)(?:\?|$)/);
    return match ? match[1].toLowerCase() : 'unknown';
}

// Helper function to export results
async function exportResults(results, format) {
    try {
        // Pass an explicit contentType so the key-value store keeps the raw
        // string instead of JSON-serializing it.
        switch (format) {
            case 'CSV':
                await Actor.setValue('OUTPUT.csv', convertToCSV(results), { contentType: 'text/csv' });
                break;
            case 'HTML':
                await Actor.setValue('OUTPUT.html', convertToHTML(results), { contentType: 'text/html' });
                break;
            case 'XML':
                await Actor.setValue('OUTPUT.xml', convertToXML(results), { contentType: 'application/xml' });
                break;
            default:
                await Actor.setValue('OUTPUT.json', results);
        }
        console.log(`Results exported in ${format} format`);
    } catch (error) {
        console.error('Error exporting results:', error);
    }
}

// CSV conversion function
function convertToCSV(data) {
    if (!data.length) return '';

    const headers = Object.keys(data[0]);
    const csvContent = [
        headers.join(','),
        ...data.map(row =>
            headers.map(header => `"${(row[header] || '').toString().replace(/"/g, '""')}"`).join(',')
        )
    ].join('\n');

    return csvContent;
}

// HTML conversion function
function convertToHTML(data) {
    const htmlContent = `
<!DOCTYPE html>
<html>
<head>
    <title>Video Download Links</title>
    <style>
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        a { color: #0066cc; text-decoration: none; }
        a:hover { text-decoration: underline; }
    </style>
</head>
<body>
    <h1>Video Download Links</h1>
    <p>Total videos found: ${data.length}</p>
    <table>
        <thead>
            <tr>
                <th>Title</th>
                <th>Video URL</th>
                <th>Source URL</th>
                <th>Format</th>
                <th>Found At</th>
                <th>Depth</th>
            </tr>
        </thead>
        <tbody>
            ${data.map(item => `
            <tr>
                <td>${item.title || 'Unknown'}</td>
                <td><a href="${item.videoUrl}" target="_blank">${item.videoUrl}</a></td>
                <td><a href="${item.sourceUrl}" target="_blank">${item.sourceUrl}</a></td>
                <td>${item.format}</td>
                <td>${item.foundAt}</td>
                <td>${item.depth}</td>
            </tr>
            `).join('')}
        </tbody>
    </table>
</body>
</html>
    `;

    return htmlContent;
}

// XML conversion function
function convertToXML(data) {
    const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<videos count="${data.length}">
    ${data.map(item => `
    <video>
        <title><![CDATA[${item.title || 'Unknown'}]]></title>
        <videoUrl><![CDATA[${item.videoUrl}]]></videoUrl>
        <sourceUrl><![CDATA[${item.sourceUrl}]]></sourceUrl>
        <format>${item.format}</format>
        <foundAt>${item.foundAt}</foundAt>
        <depth>${item.depth}</depth>
    </video>
    `).join('')}
</videos>`;

    return xmlContent;
}
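Given the fields pushed to the dataset in requestHandler above, a single result record would look roughly like this; the URLs and title below are made up for illustration.

{
    "sourceUrl": "https://example.com/media/clips",
    "videoUrl": "https://example.com/media/clips/intro.mp4",
    "title": "Intro clip",
    "fileSize": null,
    "format": "mp4",
    "foundAt": "2024-01-01T12:00:00.000Z",
    "depth": 1
}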

.dockerignore

# configurations
.idea
.vscode
.zed
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

.prettierrc

{
    "printWidth": 120,
    "tabWidth": 4,
    "singleQuote": true
}

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
chown -R apify:apify ./
USER apify
# Run the image.
CMD npm start --silent

eslint.config.mjs

import prettier from 'eslint-config-prettier';

import apify from '@apify/eslint-config/js.js';

// eslint-disable-next-line import/no-default-export
export default [{ ignores: ['**/dist'] }, ...apify, prettier];

package.json

{
    "name": "video-download-crawler",
    "version": "0.0.1",
    "type": "module",
    "description": "Apify Actor that crawls websites and extracts video download links.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.4.2",
        "crawlee": "^3.13.8"
    },
    "devDependencies": {
        "@apify/eslint-config": "^1.0.0",
        "eslint": "^9.29.0",
        "eslint-config-prettier": "^10.1.5",
        "prettier": "^3.5.3"
    },
    "scripts": {
        "start": "node src/main.js",
        "format": "prettier --write .",
        "format:check": "prettier --check .",
        "lint": "eslint",
        "lint:fix": "eslint --fix",
        "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}