1import { Actor } from 'apify';
2import { PlaywrightCrawler } from 'crawlee';
3
4
5const main = async () => {
// Announce start-up before any actor API is touched.
console.log('🚀 iG Profile Scraper - Starting...');

// Input configured for this run; a falsy value means nothing was configured.
const input = await Actor.getInput();
if (!input) {
    throw new Error('No input provided. Please configure the actor input.');
}

// Options are unpacked in groups by concern. Every default below only applies
// when the corresponding input property is undefined.

// Which scraping mode to run.
const { mode = 'MODE1' } = input;

// Mode 1: a single target account and its followers/following.
const {
    targetUsername,
    scrapeFollowers = true,
    scrapeFollowing = false,
    maxProfilesMode1 = 100,
} = input;

// Mode 2: a batch of target accounts.
const {
    targetUsernames = [],
    scrapeFollowersMode2 = true,
    scrapeFollowingMode2 = false,
    maxProfilesPerAccount = 100,
} = input;

// Mode 3: an explicit list of profiles to analyze.
const { usernamesToAnalyze = [], maxProfilesMode3 = 100 } = input;

// Mode 4: network expansion starting from seed profiles.
const { seedUsernames = [], expansionDepth = 1, maxProfilesMode4 = 200 } = input;

// Data-extraction toggles.
// NOTE(review): several of these (e.g. extractBusinessCategory, extractAddress,
// extractPostCaptions, deepContactSearch) are not read anywhere else in the
// visible file — confirm whether they are still needed.
const {
    extractEmail = true,
    extractPhone = true,
    extractWebsite = true,
    extractBusinessCategory = true,
    extractAddress = false,
    calculateEngagementRate = true,
    extractPostCaptions = false,
    deepContactSearch = false,
} = input;

// Text, location and audience-size filters.
const {
    keywords = [],
    keywordSearchIn = 'bio',
    locationKeywords = [],
    language = 'any',
    minFollowers = 0,
    maxFollowers = 0,
} = input;

// Activity and engagement filters.
const {
    lastPostAfter,
    minPostsInPeriod = 0,
    postingPeriodDays = 30,
    reelsFilter = 'disabled',
    minMedianViews = 0,
    highEngagementOnly = false,
} = input;

// Contact, account-type and verification filters.
const {
    contactInfoFilter = 'any',
    requireWebsite = false,
    accountTypeFilter = 'any',
    influencersOnly = false,
    businessCategories = [],
    verificationFilter = 'any',
} = input;

// Connection settings: proxy input object and optional Instagram session cookie value.
const { proxy, sessionCookie } = input;
58
59
// Modes this actor knows how to run. Anything else is rejected up front:
// previously an unknown mode passed validation, produced no start URLs and
// the run "succeeded" without scraping anything.
const SUPPORTED_MODES = ['MODE1', 'MODE2', 'MODE3', 'MODE4'];
if (!SUPPORTED_MODES.includes(mode)) {
    throw new Error(`Unsupported mode "${mode}". Expected one of: ${SUPPORTED_MODES.join(', ')}.`);
}

// Coerces a user-supplied limit to a positive integer, falling back when the
// value is missing, zero, negative or not numeric.
const toPositiveInt = (value, fallback) => {
    const parsed = Math.floor(Number(value));
    return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
};

// True only for a non-empty array, so a stray string or null does not pass
// the "at least one username" checks below.
const hasEntries = (list) => Array.isArray(list) && list.length > 0;

// Overall cap on saved profiles for this run, derived from the active mode.
let maxProfiles = 100;
switch (mode) {
    case 'MODE1':
        maxProfiles = toPositiveInt(maxProfilesMode1, 100);
        break;
    case 'MODE2':
        // The per-account limit scales with the number of target accounts.
        maxProfiles = toPositiveInt(maxProfilesPerAccount, 100)
            * Math.max(1, hasEntries(targetUsernames) ? targetUsernames.length : 0);
        break;
    case 'MODE3':
        maxProfiles = toPositiveInt(maxProfilesMode3, 100);
        break;
    case 'MODE4':
        maxProfiles = toPositiveInt(maxProfilesMode4, 200);
        break;
}

console.log(`📊 Mode: ${mode}`);
console.log(`🎯 Max profiles to scrape: ${maxProfiles}`);

// Each mode needs its own username input; fail early with a message that
// names the input field to fill in.
if (mode === 'MODE1' && !targetUsername) {
    throw new Error('Target username is required for Mode 1. Please provide a username in the "Target Account" field.');
}
if (mode === 'MODE2' && !hasEntries(targetUsernames)) {
    throw new Error('At least one username is required for Mode 2. Please provide usernames in the "List of Target Profiles" field.');
}
if (mode === 'MODE3' && !hasEntries(usernamesToAnalyze)) {
    throw new Error('At least one username is required for Mode 3. Please provide usernames in the "List of Profiles to Analyze" field.');
}
if (mode === 'MODE4' && !hasEntries(seedUsernames)) {
    throw new Error('At least one seed username is required for Mode 4. Please provide usernames in the "Start Profiles" field.');
}
97
98
// Proxy configuration handed to the crawler, built from the "proxy" input object.
let proxyConfiguration;
try {
    proxyConfiguration = await Actor.createProxyConfiguration(proxy);
    console.log('✅ Proxy configuration created');
} catch (e) {
    console.error(`❌ Error creating proxy configuration: ${e.message}`);
    // Attach the original failure as `cause` so the root error and its stack
    // are still available to whoever handles the user-facing error.
    throw new Error('Failed to create proxy configuration. Please check your proxy settings.', { cause: e });
}
107
108
/**
 * Returns the first e-mail address found in free-form text.
 *
 * @param {string} text - Text to search (the profile bio at the call sites in this file).
 * @returns {string|null} The first address found, or null when extraction is
 *   disabled via `extractEmail`, the text is empty or not a string, or nothing matches.
 */
const extractEmailFromText = (text) => {
    if (!extractEmail || typeof text !== 'string' || text === '') return null;

    // The local part accepts "+" and "%" so tagged addresses such as
    // "john+work@example.com" are captured whole instead of being truncated
    // to "work@example.com".
    const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+/;
    const found = text.match(emailPattern);
    return found ? found[0] : null;
};
119
/**
 * Returns the first phone-number-like string found in free-form text.
 *
 * A candidate must be at least 10 characters long (separators included) to
 * be accepted; shorter digit runs such as years are skipped.
 *
 * @param {string} text - Text to search (the profile bio at the call sites in this file).
 * @returns {string|null} The first accepted candidate, or null when extraction
 *   is disabled via `extractPhone`, the text is empty or not a string, or no
 *   candidate is long enough.
 */
const extractPhoneFromText = (text) => {
    if (!extractPhone || typeof text !== 'string' || text === '') return null;

    const phoneRegex = /[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,4}[-\s\.]?[0-9]{1,9}/g;
    const candidates = text.match(phoneRegex) ?? [];

    // `find` plus `?? null` keeps the return type consistent: the previous
    // `filter(...)[0]` yielded undefined when matches existed but all were too short.
    return candidates.find((candidate) => candidate.length >= 10) ?? null;
};
130
/**
 * Case-insensitive check that `text` contains at least one of the keywords.
 *
 * @param {string} text - Text to search.
 * @param {string[]|string} keywordList - Keywords to look for; a single string
 *   is treated as a one-element list.
 * @returns {boolean} true when there are no usable keywords (nothing to filter
 *   on) or at least one keyword occurs in the text; false otherwise, including
 *   when the text is empty or not a string.
 */
const containsKeywords = (text, keywordList) => {
    if (!keywordList || keywordList.length === 0) return true;

    // Drop entries that are not strings or are blank. Previously a single
    // null entry made the whole check throw (and so return false), and an
    // empty-string entry matched every text.
    const candidates = Array.isArray(keywordList) ? keywordList : [keywordList];
    const needles = candidates
        .filter((keyword) => typeof keyword === 'string' && keyword.trim() !== '')
        .map((keyword) => keyword.toLowerCase());
    if (needles.length === 0) return true;

    if (typeof text !== 'string' || text === '') return false;

    const haystack = text.toLowerCase();
    return needles.some((needle) => haystack.includes(needle));
};
141
/**
 * Decides whether a scraped profile passes the user-configured filters.
 *
 * Checks, in order: keywords (in bio and/or full name per `keywordSearchIn`),
 * location keywords (in bio), follower range, contact-info requirement,
 * website requirement, account type and verification status.
 *
 * @param {object} profile - Profile record; the fields read here are bio,
 *   fullName, followers, email, phone, website, accountType and isVerified.
 * @returns {boolean} true when the profile should be kept. An unexpected
 *   error while filtering is logged and the profile is kept (fail-open).
 */
const applyFilters = (profile) => {
    try {
        // Guards against null/non-array inputs: `keywords: null` used to throw
        // here, and the fail-open catch below then skipped every filter.
        const hasItems = (list) => Array.isArray(list) && list.length > 0;

        if (hasItems(keywords)) {
            let textToSearch = '';
            if (keywordSearchIn === 'bio' || keywordSearchIn === 'both') {
                textToSearch += `${profile.bio || ''} `;
            }
            if (keywordSearchIn === 'fullName' || keywordSearchIn === 'both') {
                textToSearch += profile.fullName || '';
            }
            if (!containsKeywords(textToSearch, keywords)) return false;
        }

        if (hasItems(locationKeywords) && !containsKeywords(profile.bio || '', locationKeywords)) {
            return false;
        }

        // A bound of 0 means "no bound". When a bound is set, a profile whose
        // follower count is missing or NaN is rejected: comparisons against
        // undefined/NaN are always false, so such profiles used to slip through.
        if (minFollowers > 0 || maxFollowers > 0) {
            const followerCount = profile.followers;
            if (!Number.isFinite(followerCount)) return false;
            if (minFollowers > 0 && followerCount < minFollowers) return false;
            if (maxFollowers > 0 && followerCount > maxFollowers) return false;
        }

        if (contactInfoFilter !== 'any') {
            const hasEmail = Boolean(profile.email);
            const hasPhone = Boolean(profile.phone);
            const hasWebsite = Boolean(profile.website);

            if (contactInfoFilter === 'hasEmail' && !hasEmail) return false;
            if (contactInfoFilter === 'hasPhone' && !hasPhone) return false;
            if (contactInfoFilter === 'hasWebsite' && !hasWebsite) return false;
            if (contactInfoFilter === 'hasAny' && !hasEmail && !hasPhone && !hasWebsite) return false;
            if (contactInfoFilter === 'hasEmailOrPhone' && !hasEmail && !hasPhone) return false;
        }

        if (requireWebsite && !profile.website) return false;
        if (accountTypeFilter !== 'any' && profile.accountType !== accountTypeFilter) return false;
        if (verificationFilter === 'verified' && !profile.isVerified) return false;
        if (verificationFilter === 'notVerified' && profile.isVerified) return false;

        return true;
    } catch (e) {
        // Deliberate best effort: a filtering failure must not drop the profile.
        console.error(`Error applying filters: ${e.message}`);
        return true;
    }
};
189
190
/**
 * Builds the initial crawler requests for the active mode.
 *
 * Usernames are normalized first: surrounding whitespace and leading "@" are
 * removed, blank/non-string entries are dropped and duplicates collapse to
 * one request. Previously "@name" or " name " went into the URL verbatim and
 * Mode 3 applied its limit before duplicates were removed.
 *
 * @returns {Array<{url: string, userData: object}>} One request per profile:
 *   label PROFILE for Modes 1 and 2, PROFILE_DIRECT for Mode 3 (capped at
 *   `maxProfiles`), NETWORK_SEED with depth 0 for Mode 4. Empty for any other mode.
 */
const generateStartUrls = () => {
    const cleanUsername = (raw) => (typeof raw === 'string' ? raw.trim().replace(/^@+/, '') : '');

    const cleanUsernames = (list) => {
        const source = Array.isArray(list) ? list : [];
        return [...new Set(source.map(cleanUsername).filter((name) => name !== ''))];
    };

    // Encoding leaves ordinary usernames untouched while keeping stray
    // characters (spaces, slashes) from altering the request path.
    const toRequest = (username, label, extra = {}) => ({
        url: `https://www.instagram.com/${encodeURIComponent(username)}/`,
        userData: { label, username, ...extra }
    });

    switch (mode) {
        case 'MODE1': {
            const [username] = cleanUsernames([targetUsername]);
            if (username === undefined) {
                console.warn('⚠️ Mode 1: target username is blank after normalization; nothing to crawl');
                return [];
            }
            console.log(`🎯 Mode 1: Analyzing @${username}`);
            return [toRequest(username, 'PROFILE', { mode: 'MODE1' })];
        }

        case 'MODE2': {
            const usernames = cleanUsernames(targetUsernames);
            console.log(`🎯 Mode 2: Batch analyzing ${usernames.length} accounts`);
            return usernames.map((username) => toRequest(username, 'PROFILE', { mode: 'MODE2' }));
        }

        case 'MODE3': {
            const usernames = cleanUsernames(usernamesToAnalyze);
            console.log(`🎯 Mode 3: Analyzing ${usernames.length} specific profiles`);
            return usernames
                .slice(0, maxProfiles)
                .map((username) => toRequest(username, 'PROFILE_DIRECT'));
        }

        case 'MODE4': {
            const usernames = cleanUsernames(seedUsernames);
            console.log(`🎯 Mode 4: Network expansion from ${usernames.length} seed profiles`);
            return usernames.map((username) => toRequest(username, 'NETWORK_SEED', { depth: 0 }));
        }

        default:
            return [];
    }
};
241
// Usernames already queued as PROFILE_DIRECT requests, so each profile is enqueued at most once.
const scrapedUsernames = new Set();
// Number of profiles saved to the dataset so far; compared against maxProfiles.
let profileCount = 0;

console.log('🔧 Creating Playwright crawler...');

// Desktop Chrome user agent the browser should present.
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Single-segment routes that match the "/name/" link pattern but are not user profiles.
// NOTE(review): list is based on well-known Instagram routes — extend as needed.
const NON_PROFILE_PATHS = new Set(['explore', 'reels', 'direct', 'accounts', 'p', 'stories']);

// Canonical profile URL for a username.
const profileUrl = (name) => `https://www.instagram.com/${name}/`;

/**
 * Runs INSIDE the browser page (passed to page.evaluate, so it must not
 * reference anything from the surrounding Node scope). Reads username, name,
 * bio, post/follower/following counts, website and verified badge from the
 * profile header.
 *
 * @returns {object} Extracted fields; only `username` when extraction throws.
 */
const extractProfileInPage = () => {
    const usernameFromPath = () => window.location.pathname.split('/')[1];
    try {
        const data = { username: usernameFromPath() };

        const nameEl = document.querySelector('header section h2, header h1, header section span');
        data.fullName = nameEl ? nameEl.innerText.trim() : '';

        const bioEl = document.querySelector('header section div.-vDIg span, header section ._aa_c span, header section span');
        data.bio = bioEl ? bioEl.innerText.trim() : '';

        const multipliers = { k: 1e3, m: 1e6, b: 1e9 };
        document.querySelectorAll('header section ul li, header section a span').forEach((el) => {
            const text = el.innerText.toLowerCase();
            // The count must start with a digit, so a stray "k", "m" or "b"
            // letter in the label can no longer be parsed into NaN.
            const countMatch = text.match(/(\d[\d,.]*)([kmb])?/);
            if (!countMatch) return;

            const base = parseFloat(countMatch[1].replace(/,/g, ''));
            if (!Number.isFinite(base)) return;
            const count = Math.floor(base * (multipliers[countMatch[2]] ?? 1));

            if (text.includes('post')) data.posts = count;
            if (text.includes('follower')) data.followers = count;
            if (text.includes('following')) data.following = count;
        });

        const websiteEl = document.querySelector('header section a[href*="http"]');
        data.website = websiteEl ? websiteEl.href : null;

        data.isVerified = !!document.querySelector('header svg[aria-label*="Verified"], header svg[aria-label*="verified"]');

        return data;
    } catch (e) {
        return { username: usernameFromPath() };
    }
};

/**
 * Runs INSIDE the browser page. Scrolls the open dialog and collects the
 * usernames linked inside it until `maxCount` names are found or five
 * consecutive scroll rounds add nothing new.
 *
 * @param {number} maxCount - Maximum number of usernames to return.
 * @returns {Promise<string[]>} At most `maxCount` usernames; empty when no dialog is open.
 */
const collectDialogUsernamesInPage = async (maxCount) => {
    const usernames = new Set();
    const dialog = document.querySelector('div[role="dialog"]');
    if (!dialog) return [];

    let lastCount = 0;
    let stableCount = 0;

    while (usernames.size < maxCount && stableCount < 5) {
        dialog.querySelectorAll('a[href^="/"]').forEach((link) => {
            const match = link.getAttribute('href').match(/^\/([^\/]+)\/?$/);
            if (match && match[1]) usernames.add(match[1]);
        });

        dialog.scrollTop = dialog.scrollHeight;
        await new Promise((resolve) => setTimeout(resolve, 1500));

        stableCount = usernames.size === lastCount ? stableCount + 1 : 0;
        lastCount = usernames.size;
    }

    // One scroll round can add many names at once; enforce the limit exactly.
    return Array.from(usernames).slice(0, maxCount);
};

// Queues a PROFILE_DIRECT request for every username not queued before,
// stopping once the overall profile quota has been reached.
const enqueueProfiles = async (crawlerInstance, usernames) => {
    const requests = [];
    for (const name of usernames) {
        if (profileCount >= maxProfiles) break;
        if (scrapedUsernames.has(name)) continue;

        scrapedUsernames.add(name);
        requests.push({
            url: profileUrl(name),
            userData: { label: 'PROFILE_DIRECT', username: name }
        });
    }
    if (requests.length > 0) await crawlerInstance.addRequests(requests);
};

// Extracts the profile from the loaded page and adds the fields derived on the Node side.
const readProfile = async (page, username) => {
    const profileData = await page.evaluate(extractProfileInPage);
    profileData.url = profileUrl(username);
    profileData.scrapedAt = new Date().toISOString();
    profileData.email = extractEmailFromText(profileData.bio);
    profileData.phone = extractPhoneFromText(profileData.bio);
    profileData.accountType = 'personal';
    return profileData;
};

// Opens the followers or following dialog of the loaded profile, collects up
// to `maxCount` usernames and queues them. Failures are logged, not thrown,
// so one list failing does not prevent the other from being scraped.
const scrapeConnectionList = async ({ page, crawler, listName, icon, maxCount }) => {
    console.log(` ${icon} Getting ${listName}...`);
    try {
        await page.click(`a[href*="/${listName}/"]`, { timeout: 5000 });
        await page.waitForTimeout(3000);

        const found = await page.evaluate(collectDialogUsernamesInPage, maxCount);
        console.log(` ✅ Found ${found.length} ${listName}`);

        await enqueueProfiles(crawler, found);

        // Close the dialog so a subsequent list can be opened from the profile header.
        await page.keyboard.press('Escape');
        await page.waitForTimeout(1000);
    } catch (e) {
        console.log(` ⚠️ Could not scrape ${listName}: ${e.message}`);
    }
};

// PROFILE: a target account (Modes 1 and 2). Its followers/following are
// queued as PROFILE_DIRECT requests; the target itself is not saved.
const handleTargetProfile = async ({ page, request, crawler }) => {
    const { username, mode: reqMode } = request.userData;
    console.log(`📋 Analyzing profile: @${username}`);

    try {
        await page.waitForSelector('header section', { timeout: 15000 });
    } catch (e) {
        console.log(` ⚠️ Could not load profile page: ${e.message}`);
        return;
    }

    const profileData = await readProfile(page, username);
    console.log(` Profile: @${profileData.username} - ${profileData.followers || 0} followers`);

    if (reqMode !== 'MODE1' && reqMode !== 'MODE2') return;

    const isMode1 = reqMode === 'MODE1';
    // Same 100 fallback the overall limit uses, so a 0/blank setting does not
    // turn into "collect nothing".
    const maxCount = (isMode1 ? maxProfilesMode1 : maxProfilesPerAccount) || 100;

    if (isMode1 ? scrapeFollowers : scrapeFollowersMode2) {
        await scrapeConnectionList({ page, crawler, listName: 'followers', icon: '📥', maxCount });
    }
    if (isMode1 ? scrapeFollowing : scrapeFollowingMode2) {
        await scrapeConnectionList({ page, crawler, listName: 'following', icon: '📤', maxCount });
    }
};

// PROFILE_DIRECT: scrape one profile, apply the filters and save it to the dataset.
const handleDirectProfile = async ({ page, request }) => {
    const { username } = request.userData;

    if (profileCount >= maxProfiles) {
        console.log(`⏸️ Reached maximum profile limit (${maxProfiles})`);
        return;
    }

    console.log(`👤 Scraping: @${username} (${profileCount + 1}/${maxProfiles})`);

    try {
        await page.waitForSelector('header section', { timeout: 15000 });
    } catch (e) {
        console.log(` ⚠️ Could not load profile: ${e.message}`);
        return;
    }

    const profileData = await readProfile(page, username);

    // Heuristic derived from the follower count alone (clamped to 0.5–15 %);
    // it is not computed from actual post interactions.
    if (calculateEngagementRate && profileData.followers > 0) {
        const estimatedRate = Math.max(0.5, Math.min(15, 100 / Math.log10(profileData.followers + 10)));
        profileData.engagementRate = `${estimatedRate.toFixed(2)}%`;
    }

    if (!applyFilters(profileData)) {
        console.log(` ⛔ Filtered out`);
        return;
    }

    // Pages are processed concurrently, so the quota may have filled up while
    // this page was loading. Re-check and reserve the slot synchronously
    // before awaiting the dataset write.
    if (profileCount >= maxProfiles) {
        console.log(`⏸️ Reached maximum profile limit (${maxProfiles})`);
        return;
    }
    profileCount += 1;
    const savedPosition = profileCount;
    try {
        await Actor.pushData(profileData);
    } catch (e) {
        profileCount -= 1; // release the reserved slot when the write fails
        throw e;
    }
    console.log(` ✅ Saved (${savedPosition}/${maxProfiles})`);
};

// NETWORK_SEED: collect profile links from a seed page, queue them for
// scraping and, while depth allows, as seeds for the next expansion level.
const handleNetworkSeed = async ({ page, request, crawler }) => {
    const { username, depth } = request.userData;
    console.log(`🌐 Network expansion from: @${username} (depth: ${depth}/${expansionDepth})`);

    try {
        await page.waitForTimeout(3000);
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);

        const linkedNames = await page.evaluate(() => {
            const names = new Set();
            document.querySelectorAll('a[href^="/"]').forEach((link) => {
                const match = link.getAttribute('href').match(/^\/([^\/]+)\/?$/);
                if (match && match[1]) names.add(match[1]);
            });
            return [...names];
        });

        // Skip the seed's own link and known non-profile routes so the
        // 20-per-page budget below is spent on actual profiles.
        const suggestedUsernames = linkedNames.filter(
            (name) => name !== username && !NON_PROFILE_PATHS.has(name)
        );
        console.log(` Found ${suggestedUsernames.length} suggested profiles`);

        const requests = [];
        for (const suggestedUsername of suggestedUsernames.slice(0, 20)) {
            if (profileCount >= maxProfiles) break;
            if (scrapedUsernames.has(suggestedUsername)) continue;

            scrapedUsernames.add(suggestedUsername);
            requests.push({
                url: profileUrl(suggestedUsername),
                userData: { label: 'PROFILE_DIRECT', username: suggestedUsername }
            });

            if (depth < expansionDepth - 1) {
                // Same URL as the PROFILE_DIRECT request above: without its own
                // uniqueKey the queue treats it as a duplicate and drops it,
                // so expansion never went deeper than the seeds.
                requests.push({
                    url: profileUrl(suggestedUsername),
                    uniqueKey: `NETWORK_SEED:${suggestedUsername}`,
                    userData: { label: 'NETWORK_SEED', username: suggestedUsername, depth: depth + 1 }
                });
            }
        }
        if (requests.length > 0) await crawler.addRequests(requests);
    } catch (e) {
        console.log(` ⚠️ Network expansion error: ${e.message}`);
    }
};

// Request label -> handler. A Map avoids accidental hits on Object.prototype keys.
const labelHandlers = new Map([
    ['PROFILE', handleTargetProfile],
    ['PROFILE_DIRECT', handleDirectProfile],
    ['NETWORK_SEED', handleNetworkSeed],
]);

const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl: maxProfiles * 10,
    headless: true,

    launchContext: {
        // The user agent is configured at launch. The previous per-page
        // `page.setUserAgent(...)` call is a Puppeteer API that Playwright
        // pages do not have; it threw inside the hook below and, sharing its
        // try block, prevented the session cookie from ever being added.
        userAgent: DESKTOP_USER_AGENT,
        launchOptions: {
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu'
            ]
        }
    },

    // Installs the Instagram session cookie (when provided) before each navigation.
    preNavigationHooks: [async ({ page }) => {
        if (!sessionCookie) return;
        try {
            await page.context().addCookies([
                {
                    name: 'sessionid',
                    value: sessionCookie,
                    domain: '.instagram.com',
                    path: '/',
                    httpOnly: true,
                    secure: true,
                    sameSite: 'None'
                }
            ]);
        } catch (e) {
            console.error(`Error in preNavigationHook: ${e.message}`);
        }
    }],

    // Dispatches each request to the handler registered for its label.
    // Errors are logged rather than rethrown (best effort, as before).
    requestHandler: async (context) => {
        try {
            const { label } = context.request.userData;

            // Random 2–4 s pause before touching the page.
            await context.page.waitForTimeout(2000 + Math.random() * 2000);

            const handler = labelHandlers.get(label);
            if (handler) await handler(context);
        } catch (error) {
            console.error(`❌ Error in request handler: ${error.message}`);
            console.error(error.stack);
        }
    },

    // Crawlee passes the error as the SECOND argument; destructuring it from
    // the context left it undefined, so reading `error.message` threw here.
    // `context.error` is kept as a fallback for older call conventions.
    failedRequestHandler: async (context, error) => {
        const failure = error ?? context.error;
        console.log(`❌ Request failed: ${context.request.url}`);
        console.log(` Error: ${failure?.message ?? 'unknown error'}`);
    },

    maxRequestRetries: 2,
    requestHandlerTimeoutSecs: 120,
});
632
633
// Build the initial requests, log the run configuration, run the crawl and
// report the outcome. Any failure is logged with its stack and rethrown.
try {
    const initialRequests = generateStartUrls();

    console.log(`\n🎯 Starting with ${initialRequests.length} initial URLs`);
    console.log(`⚙️ Data extraction: Email=${extractEmail}, Phone=${extractPhone}, Website=${extractWebsite}`);

    // The keyword filter is only mentioned when it is actually active.
    const keywordFilterActive = keywords.length > 0;
    if (keywordFilterActive) {
        const keywordSummary = keywords.join(', ');
        console.log(`🔍 Filtering by keywords: ${keywordSummary}`);
    }

    console.log(`\n▶️ Starting crawl...\n`);

    await crawler.run(initialRequests);

    // Final summary, printed in order once the crawl has finished.
    const summaryLines = [
        `\n✅ Scraping completed!`,
        `📊 Total profiles scraped: ${profileCount}`,
        `💾 Results saved to dataset`,
    ];
    for (const line of summaryLines) {
        console.log(line);
    }
} catch (error) {
    console.error(`\n❌ Crawl error: ${error.message}`);
    console.error(error.stack);
    throw error;
}
653};
654
655
// Actor entry point: initialize the SDK, run the scraper, then exit cleanly.
// Any failure is reported and the actor exits with status code 1, using the
// error message as the run's status message.
try {
    await Actor.init();
    await main();
    await Actor.exit();
} catch (error) {
    const { message, stack } = error;
    console.error(`\n💥 Critical error: ${message}`);
    console.error(stack);

    const failureOptions = { statusMessage: message, exitCode: 1 };
    await Actor.exit(failureOptions);
}