
iG Profile Scraper
Pricing
$17.99/month + usage
Go to Apify Store

iG Profile Scraper
Under maintenance
5.0 (1)
Pricing
$17.99/month + usage
0
5
1
Last modified
11 days ago

Pricing
$17.99/month + usage

5.0 (1)
Pricing
$17.99/month + usage
0
5
1
Last modified
11 days ago
{    "actorSpecification": 1,    "name": "ig-profile-scraper",    "title": "iG Profile Scraper",    "description": "Analyze Instagram followers or following to find valuable profiles. Filter large accounts by engagement, contact info, and business category to discover influencers, leads, or ideal customers. Perfect for lead generation, market research, and influencer marketing.",    "version": "1.0.0",    "meta": {        "templateId": "js-crawlee"    },    "input": "./input_schema.json",    "dockerfile": "./Dockerfile",    "readme": "./README.md",    "categories": ["SOCIAL_MEDIA"],    "storages": {        "dataset": {            "actorSpecification": 1,            "title": "Instagram Profiles",            "views": {                "overview": {                    "title": "Overview",                    "transformation": {                        "fields": [                            "username",                            "fullName",                            "bio",                            "email",                            "phone",                            "website",                            "followers",                            "following",                            "posts",                            "businessCategory",                            "engagementRate",                            "accountType",                            "isVerified",                            "url",                            "scrapedAt"                        ]                    },                    "display": {                        "component": "table",                        "properties": {                            "username": {                                "label": "Username",                                "format": "text"                            },                            "fullName": {                                "label": "Full Name",                                "format": "text"                            },                            "bio": {   
                             "label": "Bio",                                "format": "text"                            },                            "email": {                                "label": "Email",                                "format": "text"                            },                            "phone": {                                "label": "Phone",                                "format": "text"                            },                            "website": {                                "label": "Website",                                "format": "link"                            },                            "followers": {                                "label": "Followers",                                "format": "number"                            },                            "following": {                                "label": "Following",                                "format": "number"                            },                            "posts": {                                "label": "Posts",                                "format": "number"                            },                            "businessCategory": {                                "label": "Business Category",                                "format": "text"                            },                            "engagementRate": {                                "label": "Engagement Rate",                                "format": "text"                            },                            "accountType": {                                "label": "Account Type",                                "format": "text"                            },                            "isVerified": {                                "label": "Verified",                                "format": "boolean"                            },                            "url": {                                "label": "Profile URL",                                "format": 
"link"                            },                            "scrapedAt": {                                "label": "Scraped At",                                "format": "date"                            }                        }                    }                }            }        }    }}{    "title": "iG Profile Scraper Input",    "type": "object",    "schemaVersion": 1,    "properties": {        "mode": {            "title": "1. Choose Operation Mode",            "type": "string",            "description": "Select how you want to scrape Instagram profiles",            "enum": ["MODE1", "MODE2", "MODE3", "MODE4"],            "enumTitles": [                "🎯 Mode 1: Analyze a Single Account's Followers/Following",                "🎯 Mode 2: Batch Analyze Followers/Following",                 "🎯 Mode 3: Analyze a Specific List of Accounts",                "🎯 Mode 4: Network Expansion"            ],            "default": "MODE1",            "editor": "select"        },        "targetUsername": {            "title": "Target Account",            "type": "string",            "description": "Instagram handle to scrape (without @). Example: instagram",            "editor": "textfield",            "example": "instagram",            "sectionCaption": "🎯 Mode 1: Analyze a Single Account's Followers/Following",            "sectionDescription": "Scrapes followers or following from a single target account."        
},        "scrapeFollowers": {            "title": "Scrape Followers",            "type": "boolean",            "description": "Extract followers of the target account",            "default": true        },        "scrapeFollowing": {            "title": "Scrape Following",            "type": "boolean",            "description": "Extract following list of the target account",            "default": false        },        "maxProfilesMode1": {            "title": "Max profiles to process (cost control)",            "type": "integer",            "description": "Maximum number of profiles to scrape",            "minimum": 1,            "maximum": 100000,            "default": 100        },        "targetUsernames": {            "title": "List of Target Profiles",            "type": "array",            "description": "Instagram handles for batch processing (one per line, without @)",            "editor": "stringList",            "example": ["openai", "claudeai", "instagram"],            "sectionCaption": "🎯 Mode 2: Batch Analyze Followers/Following",            "sectionDescription": "Scrapes followers/following from a list of multiple target accounts."        
},        "scrapeFollowersMode2": {            "title": "Scrape Followers",            "type": "boolean",            "description": "Extract followers from each account",            "default": true        },        "scrapeFollowingMode2": {            "title": "Scrape Following",             "type": "boolean",            "description": "Extract following from each account",            "default": false        },        "maxProfilesPerAccount": {            "title": "Max profiles to process per account (cost control)",            "type": "integer",            "description": "Maximum profiles to extract per account",            "minimum": 1,            "maximum": 10000,            "default": 100        },        "usernamesToAnalyze": {            "title": "List of Profiles to Analyze",            "type": "array",            "description": "Specific Instagram handles to enrich with data (one per line, without @)",            "editor": "stringList",            "example": ["openai", "claudeai", "instagram"],            "sectionCaption": "🎯 Mode 3: Analyze a Specific List of Accounts",            "sectionDescription": "Enriches a provided list of profiles with data (does not scrape their followers)."        },        "maxProfilesMode3": {            "title": "Max profiles to process (cost control)",            "type": "integer",            "description": "Maximum number of profiles to analyze",            "minimum": 1,            "maximum": 100000,            "default": 100        },        "seedUsernames": {            "title": "Start Profiles",            "type": "array",            "description": "Seed profiles to start network expansion (one per line, without @)",            "editor": "stringList",            "example": ["influencer1", "influencer2"],            "sectionCaption": "🎯 Mode 4: Network Expansion",            "sectionDescription": "Discovers similar profiles using Instagram's own «Suggested for You» algorithm."        
},        "expansionDepth": {            "title": "Search Depth",            "type": "integer",            "description": "How many levels deep to expand (1 recommended for cost efficiency)",            "minimum": 1,            "maximum": 3,            "default": 1        },        "maxProfilesMode4": {            "title": "Max profiles to process (cost control)",            "type": "integer",            "description": "Maximum number of profiles to discover",            "minimum": 1,            "maximum": 100000,            "default": 200        },        "extractEmail": {            "title": "Extract Email",            "type": "boolean",            "description": "Try to extract email addresses from bio and business contact info",            "default": true,            "sectionCaption": "⚙️ Data Extraction Options",            "sectionDescription": "Select what data to collect and which enrichments to perform. These options apply to all modes."        },        "extractPhone": {            "title": "Extract Phone Number",            "type": "boolean",            "description": "Try to extract phone numbers from bio and business contact info",            "default": true        },        "extractWebsite": {            "title": "Extract Website URL",            "type": "boolean",            "description": "Extract website/external links from profile",            "default": true        },        "extractBusinessCategory": {            "title": "Extract Business Category",            "type": "boolean",            "description": "Extract business category/niche (if available)",            "default": true        },        "extractAddress": {            "title": "Extract Physical Address",            "type": "boolean",            "description": "Extract business address from location info",            "default": false        },        "calculateEngagementRate": {            "title": "Calculate Engagement Rate (ER)",            "type": "boolean",            "description": 
"Calculate engagement rate based on recent posts (may increase processing time)",            "default": true        },        "extractPostCaptions": {            "title": "Extract Latest Post Captions",            "type": "boolean",            "description": "Get text content from recent posts",            "default": false        },        "deepContactSearch": {            "title": "Deep Search for Contacts in Posts",            "type": "boolean",            "description": "Search for additional contact info in post captions and comments (slower)",            "default": false        },        "keywords": {            "title": "Filter by Keywords",            "type": "array",            "description": "Only include profiles containing these keywords (leave empty to disable)",            "editor": "stringList",            "example": ["marketing", "entrepreneur", "coach"],            "sectionCaption": "🔬 Advanced Filtering (Optional)",            "sectionDescription": "Use these filters to get a highly targeted list and save on processing costs."        
},        "keywordSearchIn": {            "title": "Search Keywords In",            "type": "string",            "description": "Where to search for keywords",            "enum": ["bio", "fullName", "both"],            "enumTitles": ["Biography only", "Full Name only", "Biography or Full Name"],            "default": "bio",            "editor": "select"        },        "locationKeywords": {            "title": "Filter by Location Keywords",            "type": "array",            "description": "Filter by location mentioned in bio (e.g., New York, London, Dubai)",            "editor": "stringList",            "example": ["New York", "London", "Dubai"]        },        "language": {            "title": "Filter by Profile Language",            "type": "string",            "description": "Filter profiles by detected language in bio",            "enum": ["any", "en", "es", "fr", "de", "it", "pt", "ar", "hi", "ja", "ko"],            "enumTitles": ["any", "English", "Spanish", "French", "German", "Italian", "Portuguese", "Arabic", "Hindi", "Japanese", "Korean"],            "default": "any",            "editor": "select"        },        "minFollowers": {            "title": "Follower Count Range: Min",            "type": "integer",            "description": "Minimum number of followers (0 = no minimum)",            "minimum": 0,            "default": 0        },        "maxFollowers": {            "title": "and Max",            "type": "integer",            "description": "Maximum number of followers (0 = unlimited)",            "minimum": 0,            "default": 0        },        "lastPostAfter": {            "title": "Filter by Last Post Date",            "type": "string",            "description": "Only include profiles that posted after this date (YYYY-MM-DD)",            "editor": "textfield",            "example": "2024-01-01"        },        "minPostsInPeriod": {            "title": "Filter by Posting Frequency: Min Posts",            "type": "integer",         
   "description": "Minimum number of posts required in the specified period",            "minimum": 0,            "default": 0        },        "postingPeriodDays": {            "title": "in the last (days)",            "type": "integer",            "description": "Time period in days to check posting frequency",            "minimum": 1,            "maximum": 365,            "default": 30        },        "reelsFilter": {            "title": "Filter by Recent Reels",            "type": "string",            "description": "Filter by presence of recent Reels/video content",            "enum": ["disabled", "hasReels", "noReels"],            "enumTitles": ["Disabled", "Has Recent Reels", "No Recent Reels"],            "default": "disabled",            "editor": "select"        },        "minMedianViews": {            "title": "Filter by Median Views",            "type": "integer",            "description": "Minimum median views on recent posts (0 = disabled)",            "minimum": 0,            "default": 0        },        "highEngagementOnly": {            "title": "Filter by Views/Followers Ratio >= 30%",            "type": "boolean",            "description": "Only include accounts with high engagement (views >= 30% of followers)",            "default": false        },        "contactInfoFilter": {            "title": "Filter by Contact Info Presence",            "type": "string",            "description": "Filter profiles by availability of contact information",            "enum": ["any", "hasEmail", "hasPhone", "hasWebsite", "hasAny", "hasEmailOrPhone"],            "enumTitles": ["Any", "Has Email", "Has Phone Number", "Has Website", "Has Any Contact", "Has Email or Phone"],            "default": "any",            "editor": "select"        },        "requireWebsite": {            "title": "Filter by Website Presence",            "type": "boolean",            "description": "Only include profiles with website links",            "default": false        },        
"accountTypeFilter": {            "title": "Filter by Account Type",            "type": "string",            "description": "Filter by Instagram account type",            "enum": ["any", "business", "creator", "personal"],            "enumTitles": ["Any", "Business accounts only", "Creator accounts only", "Personal accounts only"],            "default": "any",            "editor": "select"        },        "influencersOnly": {            "title": "Filter for Influencers Only (by Category)",            "type": "boolean",            "description": "Only include accounts identified as influencers",            "default": false        },        "businessCategories": {            "title": "Filter by Specific Business Category",            "type": "array",            "description": "Specific business categories/niches to filter (e.g., Fashion, Fitness, Food)",            "editor": "stringList",            "example": ["Fashion", "Fitness", "Food", "Travel", "Beauty"]        },        "verificationFilter": {            "title": "Filter by Verification",            "type": "string",            "description": "Filter by Instagram verification status (blue checkmark)",            "enum": ["any", "verified", "notVerified"],            "enumTitles": ["Any", "Verified only", "Not verified only"],            "default": "any",            "editor": "select"        },        "proxy": {            "title": "Proxy Configuration",            "type": "object",            "description": "Proxy settings - Residential proxies strongly recommended for Instagram",            "editor": "proxy",            "default": {                "useApifyProxy": true,                "apifyProxyGroups": ["RESIDENTIAL"]            },            "sectionCaption": "▶️ Run & Storage Options"        },        "sessionCookie": {            "title": "Instagram Session Cookie (Recommended)",            "type": "string",            "description": "Your Instagram sessionid cookie for better access and data quality. 
See documentation for how to get this.",            "editor": "textfield",            "isSecret": true        }    },    "required": ["mode"]}1import { Actor } from 'apify';2import { PlaywrightCrawler } from 'crawlee';3
4// Main execution wrapped in try-catch5const main = async () => {6    console.log('🚀 iG Profile Scraper - Starting...');7
8    const input = await Actor.getInput();9    10    if (!input) {11        throw new Error('No input provided. Please configure the actor input.');12    }13
14    const {15        mode = 'MODE1',16        targetUsername,17        scrapeFollowers = true,18        scrapeFollowing = false,19        maxProfilesMode1 = 100,20        targetUsernames = [],21        scrapeFollowersMode2 = true,22        scrapeFollowingMode2 = false,23        maxProfilesPerAccount = 100,24        usernamesToAnalyze = [],25        maxProfilesMode3 = 100,26        seedUsernames = [],27        expansionDepth = 1,28        maxProfilesMode4 = 200,29        extractEmail = true,30        extractPhone = true,31        extractWebsite = true,32        extractBusinessCategory = true,33        extractAddress = false,34        calculateEngagementRate = true,35        extractPostCaptions = false,36        deepContactSearch = false,37        keywords = [],38        keywordSearchIn = 'bio',39        locationKeywords = [],40        language = 'any',41        minFollowers = 0,42        maxFollowers = 0,43        lastPostAfter,44        minPostsInPeriod = 0,45        postingPeriodDays = 30,46        reelsFilter = 'disabled',47        minMedianViews = 0,48        highEngagementOnly = false,49        contactInfoFilter = 'any',50        requireWebsite = false,51        accountTypeFilter = 'any',52        influencersOnly = false,53        businessCategories = [],54        verificationFilter = 'any',55        proxy,56        sessionCookie57    } = input;58
59    // Calculate max profiles based on mode60    let maxProfiles = 100;61    try {62        switch(mode) {63            case 'MODE1': 64                maxProfiles = maxProfilesMode1 || 100; 65                break;66            case 'MODE2': 67                maxProfiles = (maxProfilesPerAccount || 100) * Math.max(1, (targetUsernames || []).length); 68                break;69            case 'MODE3': 70                maxProfiles = maxProfilesMode3 || 100; 71                break;72            case 'MODE4': 73                maxProfiles = maxProfilesMode4 || 200; 74                break;75        }76    } catch (e) {77        console.log(`⚠️  Error calculating max profiles: ${e.message}`);78        maxProfiles = 100;79    }80
81    console.log(`📊 Mode: ${mode}`);82    console.log(`🎯 Max profiles to scrape: ${maxProfiles}`);83
84    // Validate mode-specific requirements85    if (mode === 'MODE1' && !targetUsername) {86        throw new Error('Target username is required for Mode 1. Please provide a username in the "Target Account" field.');87    }88    if (mode === 'MODE2' && (!targetUsernames || targetUsernames.length === 0)) {89        throw new Error('At least one username is required for Mode 2. Please provide usernames in the "List of Target Profiles" field.');90    }91    if (mode === 'MODE3' && (!usernamesToAnalyze || usernamesToAnalyze.length === 0)) {92        throw new Error('At least one username is required for Mode 3. Please provide usernames in the "List of Profiles to Analyze" field.');93    }94    if (mode === 'MODE4' && (!seedUsernames || seedUsernames.length === 0)) {95        throw new Error('At least one seed username is required for Mode 4. Please provide usernames in the "Start Profiles" field.');96    }97
98    // Setup proxy99    let proxyConfiguration;100    try {101        proxyConfiguration = await Actor.createProxyConfiguration(proxy);102        console.log('✅ Proxy configuration created');103    } catch (e) {104        console.error(`❌ Error creating proxy configuration: ${e.message}`);105        throw new Error('Failed to create proxy configuration. Please check your proxy settings.');106    }107
// Helper functions

/**
 * Returns the first e-mail address found in `text`, or null.
 *
 * Reads the `extractEmail` flag from the enclosing scope: when that flag is
 * falsy the helper is disabled and always returns null.
 *
 * @param {string} text - Free text to scan.
 * @returns {string|null} The first matching address, or null when `text` is
 *   empty, is not a string, contains no address, or extraction is disabled.
 */
const extractEmailFromText = (text) => {
    if (!extractEmail || typeof text !== 'string' || text.length === 0) return null;

    // The local part also accepts "+" and "%", so a plus-addressed mailbox such
    // as "name+tag@example.com" is returned whole instead of "tag@example.com".
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+/;
    const found = emailRegex.exec(text);
    return found ? found[0] : null;
};
/**
 * Returns the first phone-number-like substring of `text`, or null.
 *
 * Reads the `extractPhone` flag from the enclosing scope: when that flag is
 * falsy the helper is disabled and always returns null.
 *
 * @param {string} text - Free text to scan.
 * @returns {string|null} The first regex match that is at least 10 characters
 *   long (separators included), or null when there is none, when `text` is
 *   empty or not a string, or when extraction is disabled.
 */
const extractPhoneFromText = (text) => {
    if (!extractPhone || typeof text !== 'string' || text.length === 0) return null;

    const phoneRegex = /[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,4}[-\s\.]?[0-9]{1,9}/g;
    const candidates = text.match(phoneRegex) || [];

    // The pattern also matches short digit runs, so only candidates of 10+
    // characters are accepted. When none qualifies, return null explicitly:
    // indexing an empty filtered array would yield undefined instead.
    const phone = candidates.find((candidate) => candidate.length >= 10);
    return phone === undefined ? null : phone;
};
/**
 * Case-insensitive "contains any of these keywords" check.
 *
 * @param {string} text - Text to search.
 * @param {string[]} keywordList - Keywords to look for. Entries that are not
 *   strings, or that are empty/whitespace-only, are ignored.
 * @returns {boolean} true when the list is missing, empty, or has no usable
 *   entries (nothing to filter on), or when `text` contains at least one
 *   usable keyword; false when `text` is empty or not a string, or when no
 *   keyword matches.
 */
const containsKeywords = (text, keywordList) => {
    if (!Array.isArray(keywordList) || keywordList.length === 0) return true;

    // Drop unusable entries up front: a non-string would throw on
    // toLowerCase(), and a blank keyword would match every non-empty text.
    const needles = keywordList
        .filter((keyword) => typeof keyword === 'string' && keyword.trim() !== '')
        .map((keyword) => keyword.toLowerCase());
    if (needles.length === 0) return true;

    if (typeof text !== 'string' || text.length === 0) return false;

    const haystack = text.toLowerCase();
    return needles.some((needle) => haystack.includes(needle));
};
/**
 * Decides whether a scraped profile passes the user-configured filters.
 *
 * Reads these settings from the enclosing scope: `keywords`,
 * `keywordSearchIn`, `locationKeywords`, `minFollowers`, `maxFollowers`,
 * `contactInfoFilter`, `requireWebsite`, `accountTypeFilter` and
 * `verificationFilter`, and delegates text matching to `containsKeywords`.
 *
 * @param {object} profile - Profile record; the fields read are `bio`,
 *   `fullName`, `followers`, `email`, `phone`, `website`, `accountType` and
 *   `isVerified`.
 * @returns {boolean} true to keep the profile, false to drop it. If a filter
 *   throws, the error is logged and the profile is kept.
 */
const applyFilters = (profile) => {
    try {
        // Keyword filter: 'bio' searches the bio only, 'fullName' the name
        // only, 'both' the concatenation of the two.
        if (keywords.length > 0) {
            const searchBio = keywordSearchIn === 'bio' || keywordSearchIn === 'both';
            const searchName = keywordSearchIn === 'fullName' || keywordSearchIn === 'both';

            let textToSearch = '';
            if (searchBio) textToSearch += (profile.bio || '') + ' ';
            if (searchName) textToSearch += (profile.fullName || '');

            if (!containsKeywords(textToSearch, keywords)) return false;
        }

        // Location filter (matched against the bio text)
        if (locationKeywords.length > 0 && !containsKeywords(profile.bio || '', locationKeywords)) {
            return false;
        }

        // Follower count filter. A bound of 0 means "not set". When a bound is
        // set, a missing or NaN follower count cannot satisfy it, so the
        // profile is rejected rather than slipping through the comparison.
        if (minFollowers > 0 || maxFollowers > 0) {
            const followerCount = profile.followers;
            if (typeof followerCount !== 'number' || Number.isNaN(followerCount)) return false;
            if (minFollowers > 0 && followerCount < minFollowers) return false;
            if (maxFollowers > 0 && followerCount > maxFollowers) return false;
        }

        // Contact info filter
        if (contactInfoFilter !== 'any') {
            const hasEmail = !!profile.email;
            const hasPhone = !!profile.phone;
            const hasWebsite = !!profile.website;

            if (contactInfoFilter === 'hasEmail' && !hasEmail) return false;
            if (contactInfoFilter === 'hasPhone' && !hasPhone) return false;
            if (contactInfoFilter === 'hasWebsite' && !hasWebsite) return false;
            if (contactInfoFilter === 'hasAny' && !(hasEmail || hasPhone || hasWebsite)) return false;
            if (contactInfoFilter === 'hasEmailOrPhone' && !(hasEmail || hasPhone)) return false;
        }

        if (requireWebsite && !profile.website) return false;
        if (accountTypeFilter !== 'any' && profile.accountType !== accountTypeFilter) return false;
        if (verificationFilter === 'verified' && !profile.isVerified) return false;
        if (verificationFilter === 'notVerified' && profile.isVerified) return false;

        return true;
    } catch (e) {
        console.error(`Error applying filters: ${e.message}`);
        return true; // Don't filter out on error
    }
};
// Generate start URLs

/**
 * Builds the initial crawler requests for the selected operation mode.
 *
 * Reads `mode`, `targetUsername`, `targetUsernames`, `usernamesToAnalyze`,
 * `seedUsernames` and `maxProfiles` from the enclosing scope. Every handle is
 * normalized first (surrounding whitespace and leading "@" removed) and
 * entries that end up empty are skipped, so user input such as " @openai "
 * still yields a valid profile URL.
 *
 * @returns {Array<{url: string, userData: object}>} One request per usable
 *   handle; an empty array for an unrecognized mode.
 * @throws Re-throws any error raised while building the list, after logging it.
 */
const generateStartUrls = () => {
    const urls = [];

    // Returns the bare handle, or '' when the entry is unusable.
    const toHandle = (raw) => (typeof raw === 'string' ? raw.trim().replace(/^@+/, '') : '');

    // Normalizes every entry of a list and drops the unusable ones.
    const toHandles = (list) => list.map(toHandle).filter((handle) => handle !== '');

    // Appends one profile request; `buildUserData` receives the clean handle.
    const addProfile = (handle, buildUserData) => {
        urls.push({
            url: `https://www.instagram.com/${encodeURIComponent(handle)}/`,
            userData: buildUserData(handle)
        });
    };

    try {
        switch (mode) {
            case 'MODE1': {
                const handle = toHandle(targetUsername);
                console.log(`🎯 Mode 1: Analyzing @${handle}`);
                if (handle) {
                    addProfile(handle, (username) => ({ label: 'PROFILE', username, mode: 'MODE1' }));
                }
                break;
            }

            case 'MODE2':
                console.log(`🎯 Mode 2: Batch analyzing ${targetUsernames.length} accounts`);
                toHandles(targetUsernames).forEach((handle) => {
                    addProfile(handle, (username) => ({ label: 'PROFILE', username, mode: 'MODE2' }));
                });
                break;

            case 'MODE3':
                console.log(`🎯 Mode 3: Analyzing ${usernamesToAnalyze.length} specific profiles`);
                // The cap is applied after normalization so blank entries do
                // not consume part of the profile budget.
                toHandles(usernamesToAnalyze).slice(0, maxProfiles).forEach((handle) => {
                    addProfile(handle, (username) => ({ label: 'PROFILE_DIRECT', username }));
                });
                break;

            case 'MODE4':
                console.log(`🎯 Mode 4: Network expansion from ${seedUsernames.length} seed profiles`);
                toHandles(seedUsernames).forEach((handle) => {
                    addProfile(handle, (username) => ({ label: 'NETWORK_SEED', username, depth: 0 }));
                });
                break;
        }
    } catch (e) {
        console.error(`Error generating start URLs: ${e.message}`);
        throw e;
    }

    return urls;
};
242    const scrapedUsernames = new Set();243    let profileCount = 0;244
245    // Create Playwright crawler246    console.log('🔧 Creating Playwright crawler...');247    248    const crawler = new PlaywrightCrawler({249        proxyConfiguration,250        maxRequestsPerCrawl: maxProfiles * 10,251        headless: true,252        253        launchContext: {254            launchOptions: {255                headless: true,256                args: [257                    '--no-sandbox',258                    '--disable-setuid-sandbox',259                    '--disable-dev-shm-usage',260                    '--disable-accelerated-2d-canvas',261                    '--disable-gpu'262                ]263            }264        },265
266        preNavigationHooks: [async ({ page, request }) => {267            try {268                // Set user agent269                await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');270                271                // Set session cookie if provided272                if (sessionCookie) {273                    await page.context().addCookies([274                        {275                            name: 'sessionid',276                            value: sessionCookie,277                            domain: '.instagram.com',278                            path: '/',279                            httpOnly: true,280                            secure: true,281                            sameSite: 'None'282                        }283                    ]);284                }285            } catch (e) {286                console.error(`Error in preNavigationHook: ${e.message}`);287            }288        }],289
290        requestHandler: async ({ page, request, crawler }) => {291            try {292                const { label, username, mode: reqMode, depth } = request.userData;293
294                await page.waitForTimeout(2000 + Math.random() * 2000);295
296                // Handle profile page for Mode 1 and Mode 2297                if (label === 'PROFILE') {298                    console.log(`📋 Analyzing profile: @${username}`);299                    300                    try {301                        await page.waitForSelector('header section', { timeout: 15000 });302                    } catch (e) {303                        console.log(`   ⚠️  Could not load profile page: ${e.message}`);304                        return;305                    }306                    307                    // Extract profile data308                    const profileData = await page.evaluate(() => {309                        try {310                            const data = {};311                            312                            data.username = window.location.pathname.split('/')[1];313                            314                            const nameEl = document.querySelector('header section h2, header h1, header section span');315                            data.fullName = nameEl ? nameEl.innerText.trim() : '';316                            317                            const bioEl = document.querySelector('header section div.-vDIg span, header section ._aa_c span, header section span');318                            data.bio = bioEl ? 
bioEl.innerText.trim() : '';319                            320                            const statsEls = document.querySelectorAll('header section ul li, header section a span');321                            statsEls.forEach(el => {322                                const text = el.innerText.toLowerCase();323                                const countMatch = text.match(/[\d,kmb.]+/i);324                                if (!countMatch) return;325                                326                                let countStr = countMatch[0].replace(/,/g, '');327                                let multiplier = 1;328                                if (countStr.toLowerCase().includes('k')) {329                                    multiplier = 1000;330                                    countStr = countStr.replace(/k/i, '');331                                } else if (countStr.toLowerCase().includes('m')) {332                                    multiplier = 1000000;333                                    countStr = countStr.replace(/m/i, '');334                                } else if (countStr.toLowerCase().includes('b')) {335                                    multiplier = 1000000000;336                                    countStr = countStr.replace(/b/i, '');337                                }338                                const count = Math.floor(parseFloat(countStr) * multiplier);339                                340                                if (text.includes('post')) data.posts = count;341                                if (text.includes('follower')) data.followers = count;342                                if (text.includes('following')) data.following = count;343                            });344                            345                            const websiteEl = document.querySelector('header section a[href*="http"]');346                            data.website = websiteEl ? 
websiteEl.href : null;347                            348                            data.isVerified = !!document.querySelector('header svg[aria-label*="Verified"], header svg[aria-label*="verified"]');349                            350                            return data;351                        } catch (e) {352                            console.error('Error extracting profile data:', e.message);353                            return { username: window.location.pathname.split('/')[1] };354                        }355                    });356
357                    profileData.url = `https://www.instagram.com/${username}/`;358                    profileData.scrapedAt = new Date().toISOString();359                    profileData.email = extractEmailFromText(profileData.bio);360                    profileData.phone = extractPhoneFromText(profileData.bio);361                    profileData.accountType = 'personal';362                    363                    console.log(`   Profile: @${profileData.username} - ${profileData.followers || 0} followers`);364
365                    // For Mode 1 and Mode 2, get followers/following366                    if (reqMode === 'MODE1' || reqMode === 'MODE2') {367                        const shouldScrapeFollowers = reqMode === 'MODE1' ? scrapeFollowers : scrapeFollowersMode2;368                        const shouldScrapeFollowing = reqMode === 'MODE1' ? scrapeFollowing : scrapeFollowingMode2;369                        const maxPerAccount = reqMode === 'MODE1' ? maxProfilesMode1 : maxProfilesPerAccount;370                        371                        if (shouldScrapeFollowers) {372                            console.log(`   📥 Getting followers...`);373                            374                            try {375                                await page.click('a[href*="/followers/"]', { timeout: 5000 });376                                await page.waitForTimeout(3000);377                                378                                const followerUsernames = await page.evaluate(async (maxCount) => {379                                    const usernames = new Set();380                                    const dialog = document.querySelector('div[role="dialog"]');381                                    if (!dialog) return [];382                                    383                                    let lastCount = 0;384                                    let stableCount = 0;385                                    386                                    while (usernames.size < maxCount && stableCount < 5) {387                                        const links = dialog.querySelectorAll('a[href^="/"]');388                                        links.forEach(link => {389                                            const href = link.getAttribute('href');390                                            const match = href.match(/^\/([^\/]+)\/?$/);391                                            if (match && match[1]) {392                                                
usernames.add(match[1]);393                                            }394                                        });395                                        396                                        dialog.scrollTop = dialog.scrollHeight;397                                        await new Promise(r => setTimeout(r, 1500));398                                        399                                        if (usernames.size === lastCount) {400                                            stableCount++;401                                        } else {402                                            stableCount = 0;403                                        }404                                        lastCount = usernames.size;405                                    }406                                    407                                    return Array.from(usernames);408                                }, maxPerAccount);409                                410                                console.log(`   ✅ Found ${followerUsernames.length} followers`);411                                412                                for (const followerUsername of followerUsernames) {413                                    if (profileCount >= maxProfiles) break;414                                    if (scrapedUsernames.has(followerUsername)) continue;415                                    416                                    scrapedUsernames.add(followerUsername);417                                    await crawler.addRequests([{418                                        url: `https://www.instagram.com/${followerUsername}/`,419                                        userData: { label: 'PROFILE_DIRECT', username: followerUsername }420                                    }]);421                                }422                                423                                await page.keyboard.press('Escape');424                                await 
page.waitForTimeout(1000);425                            } catch (e) {426                                console.log(`   ⚠️  Could not scrape followers: ${e.message}`);427                            }428                        }429                        430                        if (shouldScrapeFollowing) {431                            console.log(`   📤 Getting following...`);432                            433                            try {434                                await page.click('a[href*="/following/"]', { timeout: 5000 });435                                await page.waitForTimeout(3000);436                                437                                const followingUsernames = await page.evaluate(async (maxCount) => {438                                    const usernames = new Set();439                                    const dialog = document.querySelector('div[role="dialog"]');440                                    if (!dialog) return [];441                                    442                                    let lastCount = 0;443                                    let stableCount = 0;444                                    445                                    while (usernames.size < maxCount && stableCount < 5) {446                                        const links = dialog.querySelectorAll('a[href^="/"]');447                                        links.forEach(link => {448                                            const href = link.getAttribute('href');449                                            const match = href.match(/^\/([^\/]+)\/?$/);450                                            if (match && match[1]) {451                                                usernames.add(match[1]);452                                            }453                                        });454                                        455                                        dialog.scrollTop = dialog.scrollHeight;456                          
              await new Promise(r => setTimeout(r, 1500));457                                        458                                        if (usernames.size === lastCount) {459                                            stableCount++;460                                        } else {461                                            stableCount = 0;462                                        }463                                        lastCount = usernames.size;464                                    }465                                    466                                    return Array.from(usernames);467                                }, maxPerAccount);468                                469                                console.log(`   ✅ Found ${followingUsernames.length} following`);470                                471                                for (const followingUsername of followingUsernames) {472                                    if (profileCount >= maxProfiles) break;473                                    if (scrapedUsernames.has(followingUsername)) continue;474                                    475                                    scrapedUsernames.add(followingUsername);476                                    await crawler.addRequests([{477                                        url: `https://www.instagram.com/${followingUsername}/`,478                                        userData: { label: 'PROFILE_DIRECT', username: followingUsername }479                                    }]);480                                }481                            } catch (e) {482                                console.log(`   ⚠️  Could not scrape following: ${e.message}`);483                            }484                        }485                    }486                }487
488                // Handle direct profile scraping489                if (label === 'PROFILE_DIRECT') {490                    if (profileCount >= maxProfiles) {491                        console.log(`⏸️  Reached maximum profile limit (${maxProfiles})`);492                        return;493                    }494
495                    console.log(`👤 Scraping: @${username} (${profileCount + 1}/${maxProfiles})`);496                    497                    try {498                        await page.waitForSelector('header section', { timeout: 15000 });499                    } catch (e) {500                        console.log(`   ⚠️  Could not load profile: ${e.message}`);501                        return;502                    }503                    504                    const profileData = await page.evaluate(() => {505                        try {506                            const data = {};507                            data.username = window.location.pathname.split('/')[1];508                            509                            const nameEl = document.querySelector('header section h2, header h1, header section span');510                            data.fullName = nameEl ? nameEl.innerText.trim() : '';511                            512                            const bioEl = document.querySelector('header section div.-vDIg span, header section ._aa_c span, header section span');513                            data.bio = bioEl ? 
bioEl.innerText.trim() : '';514                            515                            const statsEls = document.querySelectorAll('header section ul li, header section a span');516                            statsEls.forEach(el => {517                                const text = el.innerText.toLowerCase();518                                const countMatch = text.match(/[\d,kmb.]+/i);519                                if (!countMatch) return;520                                521                                let countStr = countMatch[0].replace(/,/g, '');522                                let multiplier = 1;523                                if (countStr.toLowerCase().includes('k')) {524                                    multiplier = 1000;525                                    countStr = countStr.replace(/k/i, '');526                                } else if (countStr.toLowerCase().includes('m')) {527                                    multiplier = 1000000;528                                    countStr = countStr.replace(/m/i, '');529                                } else if (countStr.toLowerCase().includes('b')) {530                                    multiplier = 1000000000;531                                    countStr = countStr.replace(/b/i, '');532                                }533                                const count = Math.floor(parseFloat(countStr) * multiplier);534                                535                                if (text.includes('post')) data.posts = count;536                                if (text.includes('follower')) data.followers = count;537                                if (text.includes('following')) data.following = count;538                            });539                            540                            const websiteEl = document.querySelector('header section a[href*="http"]');541                            data.website = websiteEl ? 
websiteEl.href : null;542                            543                            data.isVerified = !!document.querySelector('header svg[aria-label*="Verified"], header svg[aria-label*="verified"]');544                            545                            return data;546                        } catch (e) {547                            return { username: window.location.pathname.split('/')[1] };548                        }549                    });550
551                    profileData.url = `https://www.instagram.com/${username}/`;552                    profileData.scrapedAt = new Date().toISOString();553                    profileData.email = extractEmailFromText(profileData.bio);554                    profileData.phone = extractPhoneFromText(profileData.bio);555                    profileData.accountType = 'personal';556                    557                    if (calculateEngagementRate && profileData.followers > 0) {558                        const estimatedRate = Math.max(0.5, Math.min(15, 100 / Math.log10(profileData.followers + 10)));559                        profileData.engagementRate = estimatedRate.toFixed(2) + '%';560                    }561
562                    if (applyFilters(profileData)) {563                        await Actor.pushData(profileData);564                        profileCount++;565                        console.log(`   ✅ Saved (${profileCount}/${maxProfiles})`);566                    } else {567                        console.log(`   ⛔ Filtered out`);568                    }569                }570
571                // Handle network expansion572                if (label === 'NETWORK_SEED') {573                    console.log(`🌐 Network expansion from: @${username} (depth: ${depth}/${expansionDepth})`);574                    575                    try {576                        await page.waitForTimeout(3000);577                        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));578                        await page.waitForTimeout(2000);579                        580                        const suggestedUsernames = await page.evaluate(() => {581                            const usernames = [];582                            const links = document.querySelectorAll('a[href^="/"]');583                            links.forEach(link => {584                                const href = link.getAttribute('href');585                                const match = href.match(/^\/([^\/]+)\/?$/);586                                if (match && match[1]) {587                                    usernames.push(match[1]);588                                }589                            });590                            return [...new Set(usernames)];591                        });592
593                        console.log(`   Found ${suggestedUsernames.length} suggested profiles`);594
595                        const toProcess = suggestedUsernames.slice(0, 20);596                        for (const suggestedUsername of toProcess) {597                            if (profileCount >= maxProfiles) break;598                            if (scrapedUsernames.has(suggestedUsername)) continue;599                            600                            scrapedUsernames.add(suggestedUsername);601                            602                            await crawler.addRequests([{603                                url: `https://www.instagram.com/${suggestedUsername}/`,604                                userData: { label: 'PROFILE_DIRECT', username: suggestedUsername }605                            }]);606
607                            if (depth < expansionDepth - 1) {608                                await crawler.addRequests([{609                                    url: `https://www.instagram.com/${suggestedUsername}/`,610                                    userData: { label: 'NETWORK_SEED', username: suggestedUsername, depth: depth + 1 }611                                }]);612                            }613                        }614                    } catch (e) {615                        console.log(`   ⚠️  Network expansion error: ${e.message}`);616                    }617                }618            } catch (error) {619                console.error(`❌ Error in request handler: ${error.message}`);620                console.error(error.stack);621            }622        },623
624        failedRequestHandler: async ({ request, error }) => {625            console.log(`❌ Request failed: ${request.url}`);626            console.log(`   Error: ${error.message}`);627        },628
629        maxRequestRetries: 2,630        requestHandlerTimeoutSecs: 120,631    });632
633    // Start crawling634    try {635        const startUrls = generateStartUrls();636        console.log(`\n🎯 Starting with ${startUrls.length} initial URLs`);637        console.log(`⚙️  Data extraction: Email=${extractEmail}, Phone=${extractPhone}, Website=${extractWebsite}`);638        if (keywords.length > 0) {639            console.log(`🔍 Filtering by keywords: ${keywords.join(', ')}`);640        }641        console.log(`\n▶️  Starting crawl...\n`);642        643        await crawler.run(startUrls);644        645        console.log(`\n✅ Scraping completed!`);646        console.log(`📊 Total profiles scraped: ${profileCount}`);647        console.log(`💾 Results saved to dataset`);648    } catch (error) {649        console.error(`\n❌ Crawl error: ${error.message}`);650        console.error(error.stack);651        throw error;652    }653};654
655// Initialize Actor and run main function656try {657    await Actor.init();658    await main();659    await Actor.exit();660} catch (error) {661    console.error(`\n💥 Critical error: ${error.message}`);662    console.error(error.stack);663    await Actor.exit({ statusMessage: error.message, exitCode: 1 });664}# configurations.idea.vscode.zed
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
FROM apify/actor-node:18
COPY package*.json ./
# Install production dependencies only, skipping optional packages, then print
# the installed tree and tool versions to the build log for debugging.
# `--omit=dev` / `--omit=optional` replace the deprecated `--only=prod` /
# `--no-optional` flags. `npm list` is allowed to fail so a non-fatal tree
# warning does not break the image build.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --omit=optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version
COPY . ./
CMD npm start
{
  "name": "instagram-lead-scraper",
  "version": "1.0.0",
  "type": "module",
  "description": "Scrape Instagram leads from hashtags",
  "main": "src/main.js",
  "scripts": {
    "start": "node src/main.js",
    "test": "echo \"No tests specified\""
  },
  "dependencies": {
    "apify": "^3.1.0",
    "crawlee": "^3.5.0"
  },
  "author": "lucid_garden",
  "license": "ISC"
}