1
2import { Actor } from 'apify';
3import { CheerioCrawler } from 'crawlee';
4
5
// Actor entry point: reads the input, crawls from `startUrl` with a
// CheerioCrawler, stores every discovered video link in the default dataset
// and finally exports the collected items in the requested output format.
await Actor.main(async () => {
    const input = await Actor.getInput();

    if (!input || !input.startUrl) {
        throw new Error('Start URL is required');
    }

    const {
        startUrl,
        linkRegex = '.*',
        videoRegex = '\\.(mp4|avi|mov|mkv|webm|m4v)$',
        maxCrawlDepth = 3,
        maxPages = 100,
        outputFormat = 'JSON',
    } = input;

    // Compile the user-supplied patterns once, before any request is made:
    // an invalid pattern then aborts the run with a single clear error
    // instead of throwing inside every request handler call.
    const videoRegexPattern = new RegExp(videoRegex, 'i');
    // Result intentionally unused: this line only validates `linkRegex`.
    new RegExp(linkRegex, 'i');

    console.log('Starting Video Download Link Crawler...');
    console.log(`Start URL: ${startUrl}`);
    console.log(`Video Regex: ${videoRegex}`);
    console.log(`Max Depth: ${maxCrawlDepth}`);
    console.log(`Max Pages: ${maxPages}`);

    const requestQueue = await Actor.openRequestQueue();

    // The start page is depth 0; every followed link is one level deeper.
    await requestQueue.addRequest({
        url: startUrl,
        userData: { depth: 0 },
    });

    const dataset = await Actor.openDataset();

    const crawler = new CheerioCrawler({
        requestQueue,
        maxRequestsPerCrawl: maxPages,

        // Records the video links of one page and enqueues its outgoing links.
        async requestHandler({ request, $ }) {
            const { url } = request;
            const { depth = 0 } = request.userData;

            console.log(`Processing: ${url} (depth: ${depth})`);

            const videoLinks = await extractVideoLinks($, url, videoRegex);

            console.log(`Found ${videoLinks.length} video links on ${url}`);

            for (const videoLink of videoLinks) {
                await dataset.pushData({
                    sourceUrl: url,
                    videoUrl: videoLink.url,
                    title: videoLink.title,
                    fileSize: videoLink.fileSize,
                    format: videoLink.format,
                    foundAt: new Date().toISOString(),
                    depth,
                });
            }

            if (depth < maxCrawlDepth) {
                const links = await extractLinks($, url, linkRegex, videoRegex);

                console.log(`Found ${links.length} links to follow from ${url}`);

                for (const link of links) {
                    await requestQueue.addRequest({
                        url: link,
                        // `sourceUrl` is read back by failedRequestHandler so a
                        // failed request can be attributed to the linking page.
                        userData: { depth: depth + 1, sourceUrl: url },
                    });
                }
            }
        },

        // A failed request whose URL matches the video pattern is still
        // recorded as a video link.
        async failedRequestHandler({ request }) {
            console.error(`Request failed: ${request.url}`);

            if (!videoRegexPattern.test(request.url)) {
                return;
            }

            console.log(`Recording failed request as video link: ${request.url}`);

            // The title is the last URL segment with its extension stripped.
            const filename = request.url.split('/').at(-1);

            await dataset.pushData({
                sourceUrl: request.userData.sourceUrl || 'Unknown',
                videoUrl: request.url,
                title: filename.replace(/\.[^/.]+$/, ''),
                fileSize: null,
                format: getVideoFormat(request.url),
                foundAt: new Date().toISOString(),
                depth: request.userData.depth ?? 0,
                note: 'Found as direct video link',
            });
        },
    });

    await crawler.run();

    const results = await dataset.getData();
    await exportResults(results.items, outputFormat);

    console.log(`Crawling completed! Found ${results.items.length} video links.`);
});
121
122
/**
 * Finds the video URLs referenced by a page.
 *
 * Three sources are scanned, in this order: anchor `href`s, `<video>` /
 * `<source>` `src` attributes, and `<iframe>` / `<embed>` `src` attributes
 * containing "video". Each candidate is resolved against `baseUrl` and kept
 * only when the absolute URL matches `videoRegex` (case-insensitive). When the
 * same URL is found more than once, the first occurrence wins.
 *
 * Anchors whose href contains "download", "sample" or "video" need no pass of
 * their own: every one of them is already matched by `a[href]`.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @param {string} videoRegex - Regular-expression source identifying video URLs.
 * @returns {Promise<Array<{url: string, title: string, fileSize: null, format: string}>>}
 *   Unique video records in discovery order.
 * @throws {SyntaxError} If `videoRegex` is not a valid regular expression.
 */
async function extractVideoLinks($, baseUrl, videoRegex) {
    const pattern = new RegExp(videoRegex, 'i');
    // Keyed by absolute URL; a Map keeps insertion order and makes
    // "first occurrence wins" explicit.
    const found = new Map();

    // Scans one selector/attribute pair and records every matching URL.
    const scan = (selector, attribute, warnLabel, titleOf) => {
        $(selector).each((_index, element) => {
            const rawValue = $(element).attr(attribute);
            if (!rawValue) return;

            let absoluteUrl;
            try {
                absoluteUrl = new URL(rawValue, baseUrl).href;
            } catch (urlError) {
                console.warn(`${warnLabel}: ${rawValue}`);
                return;
            }

            if (!pattern.test(absoluteUrl) || found.has(absoluteUrl)) return;

            found.set(absoluteUrl, {
                url: absoluteUrl,
                title: titleOf($(element)),
                fileSize: null,
                format: getVideoFormat(absoluteUrl),
            });
        });
    };

    try {
        scan('a[href]', 'href', 'Invalid URL',
            ($el) => $el.text().trim() || $el.attr('title') || 'Unknown');

        // Use the title of the enclosing <video>, not of the first <video> on
        // the page; closest() also matches the element itself for video[src].
        scan('video source[src], video[src]', 'src', 'Invalid video URL',
            ($el) => $el.closest('video').attr('title') || 'Video');

        scan('iframe[src*="video"], embed[src*="video"]', 'src', 'Invalid embedded video URL',
            ($el) => $el.attr('title') || 'Embedded Video');
    } catch (error) {
        // Best effort: report the problem and return whatever was collected.
        console.error('Error extracting video links:', error);
    }

    return [...found.values()];
}
232
233
/**
 * Collects the links of a page that the crawler should follow.
 *
 * Every anchor `href` is resolved against `baseUrl`. A link is returned when
 * it does not match `videoRegex`, matches `linkRegex`, starts with "http" and
 * its path does not end in a known download extension. Both patterns are
 * applied case-insensitively; duplicates are dropped, keeping first-seen order.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @param {string} linkRegex - Regular-expression source a link must match.
 * @param {string} videoRegex - Regular-expression source identifying video URLs.
 * @returns {Promise<string[]>} Unique absolute URLs to enqueue.
 */
async function extractLinks($, baseUrl, linkRegex, videoRegex) {
    const followPattern = new RegExp(linkRegex, 'i');
    const videoPattern = new RegExp(videoRegex, 'i');
    const downloadExtension = /\.(pdf|zip|exe|dmg|pkg|deb|rpm)$/i;
    // A Set keeps insertion order, so de-duplication happens while collecting.
    const collected = new Set();

    try {
        $('a[href]').each((_position, anchor) => {
            const rawHref = $(anchor).attr('href');
            if (!rawHref) return;

            try {
                const target = new URL(rawHref, baseUrl);
                const absoluteUrl = target.href;

                // Direct video files are reported, never crawled.
                if (videoPattern.test(absoluteUrl)) {
                    console.log(`Found direct video link (not crawling): ${absoluteUrl}`);
                    return;
                }

                const isWanted = followPattern.test(absoluteUrl) && absoluteUrl.startsWith('http');
                if (!isWanted) return;

                if (downloadExtension.test(target.pathname)) return;

                collected.add(absoluteUrl);
            } catch (urlError) {
                console.warn(`Invalid link URL: ${rawHref}`);
            }
        });
    } catch (error) {
        console.error('Error extracting links:', error);
    }

    return [...collected];
}
272
273
/**
 * Derives a video format from the file extension of a URL.
 *
 * Only the last path segment is inspected, so dots in the host name, in
 * directory names, in the query string or in the fragment never leak into the
 * result.
 *
 * @param {string} url - Absolute or relative URL of the video.
 * @returns {string} Lower-cased extension without the dot, or 'unknown' when
 *   the last path segment has no extension.
 */
function getVideoFormat(url) {
    let path;
    try {
        path = new URL(url).pathname;
    } catch {
        // Not an absolute URL: strip query and fragment by hand.
        path = String(url).split(/[?#]/, 1)[0];
    }

    const lastSegment = path.slice(path.lastIndexOf('/') + 1);
    const dotIndex = lastSegment.lastIndexOf('.');

    // No dot, or a trailing dot, means there is no extension to report.
    if (dotIndex === -1 || dotIndex === lastSegment.length - 1) {
        return 'unknown';
    }
    return lastSegment.slice(dotIndex + 1).toLowerCase();
}
278
279
/**
 * Stores the crawl results in the default key-value store in the requested
 * format.
 *
 * 'CSV', 'HTML' and 'XML' are written as text under `OUTPUT.csv`,
 * `OUTPUT.html` and `OUTPUT.xml`; any other value stores the raw items as
 * JSON under `OUTPUT.json`. Failures are logged, not rethrown, so a failed
 * export does not fail the run.
 *
 * @param {Array<object>} results - Dataset items to export.
 * @param {string} format - Requested output format.
 * @returns {Promise<void>}
 */
async function exportResults(results, format) {
    try {
        // Text formats need an explicit content type; without one the store
        // serializes the value as JSON, i.e. a quoted and escaped string.
        switch (format) {
            case 'CSV':
                await Actor.setValue('OUTPUT.csv', convertToCSV(results), {
                    contentType: 'text/csv; charset=utf-8',
                });
                break;
            case 'HTML':
                await Actor.setValue('OUTPUT.html', convertToHTML(results), {
                    contentType: 'text/html; charset=utf-8',
                });
                break;
            case 'XML':
                await Actor.setValue('OUTPUT.xml', convertToXML(results), {
                    contentType: 'application/xml; charset=utf-8',
                });
                break;
            default:
                await Actor.setValue('OUTPUT.json', results);
        }
        console.log(`Results exported in ${format} format`);
    } catch (error) {
        console.error('Error exporting results:', error);
    }
}
300
301
/**
 * Serializes records as CSV text.
 *
 * The header row is the union of the keys of all records, in first-seen
 * order, so records carrying extra fields do not lose them. Every cell is
 * double-quoted with embedded quotes doubled; `null` and `undefined` become an
 * empty cell, while falsy values such as `0` are written as-is.
 *
 * @param {Array<object>} data - Records to serialize.
 * @returns {string} CSV text with `\n` line separators, or '' for no records.
 */
function convertToCSV(data) {
    if (!data || data.length === 0) return '';

    const headers = [...new Set(data.flatMap((row) => Object.keys(row)))];

    // `??` rather than `||`: only missing values may turn into an empty cell.
    const toCell = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;

    const lines = data.map((row) => headers.map((header) => toCell(row[header])).join(','));

    return [headers.join(','), ...lines].join('\n');
}
315
316
/**
 * Renders records as a standalone HTML report containing one table.
 *
 * All record values come from crawled pages, so each one is HTML-escaped
 * before being placed into the markup.
 *
 * @param {Array<object>} data - Records with `title`, `videoUrl`, `sourceUrl`,
 *   `format`, `foundAt` and `depth` fields.
 * @returns {string} The complete HTML document.
 */
function convertToHTML(data) {
    const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    // Escapes a value for use in both element content and quoted attributes.
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => entities[ch]);

    // rel="noopener noreferrer" keeps the opened page from reaching the report
    // through window.opener.
    const link = (url) => {
        const safe = escapeHtml(url);
        return `<a href="${safe}" target="_blank" rel="noopener noreferrer">${safe}</a>`;
    };

    const rows = data.map((item) => `
            <tr>
                <td>${escapeHtml(item.title || 'Unknown')}</td>
                <td>${link(item.videoUrl)}</td>
                <td>${link(item.sourceUrl)}</td>
                <td>${escapeHtml(item.format)}</td>
                <td>${escapeHtml(item.foundAt)}</td>
                <td>${escapeHtml(item.depth)}</td>
            </tr>`).join('');

    return `
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Video Download Links</title>
    <style>
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        a { color: #0066cc; text-decoration: none; }
        a:hover { text-decoration: underline; }
    </style>
</head>
<body>
    <h1>Video Download Links</h1>
    <p>Total videos found: ${data.length}</p>
    <table>
        <thead>
            <tr>
                <th>Title</th>
                <th>Video URL</th>
                <th>Source URL</th>
                <th>Format</th>
                <th>Found At</th>
                <th>Depth</th>
            </tr>
        </thead>
        <tbody>${rows}
        </tbody>
    </table>
</body>
</html>
`;
}
364
365
/**
 * Renders records as an XML document with one `<video>` element per record.
 *
 * `title`, `videoUrl` and `sourceUrl` are written as CDATA sections; `format`,
 * `foundAt` and `depth` are written as escaped text.
 *
 * @param {Array<object>} data - Records with `title`, `videoUrl`, `sourceUrl`,
 *   `format`, `foundAt` and `depth` fields.
 * @returns {string} The XML document.
 */
function convertToXML(data) {
    const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;' };
    // Escapes the characters that are not allowed verbatim in element text.
    const escapeText = (value) => String(value).replace(/[&<>]/g, (ch) => entities[ch]);

    // A literal "]]>" would end the CDATA section early, so it is split across
    // two adjacent sections.
    const cdata = (value) => `<![CDATA[${String(value).replaceAll(']]>', ']]]]><![CDATA[>')}]]>`;

    const videos = data.map((item) => [
        '    <video>',
        `        <title>${cdata(item.title || 'Unknown')}</title>`,
        `        <videoUrl>${cdata(item.videoUrl)}</videoUrl>`,
        `        <sourceUrl>${cdata(item.sourceUrl)}</sourceUrl>`,
        `        <format>${escapeText(item.format)}</format>`,
        `        <foundAt>${escapeText(item.foundAt)}</foundAt>`,
        `        <depth>${escapeText(item.depth)}</depth>`,
        '    </video>',
    ].join('\n'));

    return [
        '<?xml version="1.0" encoding="UTF-8"?>',
        `<videos count="${data.length}">`,
        ...videos,
        '</videos>',
    ].join('\n');
}