LinkedIn Profile URL Scrapper (from Google)

glowing_treat/linkedin-scrapper-from-google
Scrapes LinkedIn profile URLs from Google search results, using a designation (job title) and a location as search keywords.
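For example, an input like the following (matching the input schema below) searches the first two result pages. Note that src/main.js uses at most the first two keywords:

{
    "keywords": ["chief product officer", "united states"],
    "numPages": 2
}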

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Project Cheerio Crawler JavaScript",
    "description": "Crawlee and Cheerio project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "storages": {
        "dataset": "./dataset_schema.json"
    },
    "dockerfile": "./Dockerfile"
}

.actor/dataset_schema.json

{
    "actorSpecification": 1,
    "fields": {
        "LinkedIn URL": {
            "type": "string",
            "description": "The URL of the LinkedIn profile."
        }
    },
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {
                "fields": ["LinkedIn URL"]
            },
            "display": {
                "component": "table",
                "properties": {
                    "LinkedIn URL": {
                        "label": "LinkedIn Profile URL",
                        "format": "link"
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Scrape LinkedIn profiles based on keywords",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "keywords": {
            "title": "Search Keywords",
            "type": "array",
            "description": "Enter the keywords to search for LinkedIn profiles, e.g., job titles and locations.",
            "editor": "stringList",
            "items": {
                "type": "string"
            },
            "prefill": ["chief product officer", "united states"]
        },
        "numPages": {
            "title": "Number of Pages",
            "type": "integer",
            "description": "The number of pages to scrape (each page corresponds to a set of search results).",
            "editor": "number",
            "minimum": 1,
            "default": 1
        }
    },
    "required": ["keywords", "numPages"]
}

.actor/output_schema.json

{
    "title": "LinkedIn Profile URLs",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "LinkedIn URL": {
                "type": "string",
                "description": "The URL of the LinkedIn profile."
            }
        },
        "required": ["LinkedIn URL"]
    }
}
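Each dataset item therefore carries a single profile link. An illustrative output item (the URL is a placeholder, not real data):

[
    { "LinkedIn URL": "https://www.linkedin.com/in/example-profile" }
]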

src/main.js

import { Actor, log } from 'apify';
import { CheerioCrawler } from 'crawlee';

await Actor.init();

try {
    // Read the Actor input; fall back to defaults when a field is missing.
    const input = (await Actor.getInput()) ?? {};
    const keywords = input.keywords ? input.keywords.slice(0, 2) : ['chief product officer', 'united states'];
    const numPages = input.numPages || 1;

    // Build a Google query restricted to LinkedIn profile pages,
    // with each keyword wrapped in quotes for an exact-phrase match.
    const baseUrl = 'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+';
    const formattedKeywords = keywords.map((keyword) => `%22${keyword.replace(/ /g, '+')}%22`).join('+');
    const headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    };

    const linkedinUrlsSet = new Set();

    // Generate one search URL per result page (Google paginates in steps of 10).
    const urls = [];
    for (let page = 0; page < numPages; page++) {
        const start = page * 10;
        urls.push(`${baseUrl}${formattedKeywords}&start=${start}`);
    }

    const requestQueue = await Actor.openRequestQueue();
    for (const url of urls) {
        await requestQueue.addRequest({ url, headers });
    }

    const crawler = new CheerioCrawler({
        requestQueue,
        requestHandler: async ({ request, $ }) => {
            log.info(`Processing ${request.url}...`);

            // Collect every link on the results page that points to a LinkedIn profile.
            $('a').each((index, element) => {
                const href = $(element).attr('href');
                const match = href && href.match(/(https?:\/\/www\.linkedin\.com\/in\/[^&]+)/);
                if (match) {
                    linkedinUrlsSet.add(match[1]); // Add to set to ensure uniqueness
                }
            });
        },
    });

    await crawler.run();

    // Convert the set to an array and push the unique results to the dataset.
    const results = Array.from(linkedinUrlsSet).map((url) => ({ 'LinkedIn URL': url }));
    await Actor.pushData(results);

    log.info(`Found and saved ${linkedinUrlsSet.size} unique LinkedIn URLs based on the keywords across ${numPages} pages.`);
} catch (error) {
    log.exception(error, 'Error during Actor run');
    await Actor.fail();
}

await Actor.exit();
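To make the query construction concrete: with the prefill keywords and numPages set to 2, the loop above would enqueue search URLs like these (Google paginates in steps of 10 via the start parameter):

// Illustrative result of the URL-building loop for
// keywords = ['chief product officer', 'united states'] and numPages = 2.
const exampleUrls = [
    'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+%22chief+product+officer%22+%22united+states%22&start=0',
    'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+%22chief+product+officer%22+%22united+states%22&start=10',
];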

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-cheerio-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
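To run the Actor locally, a minimal sketch (assuming the Apify SDK's default local storage layout, where Actor.getInput() reads storage/key_value_stores/default/INPUT.json):

mkdir -p storage/key_value_stores/default
echo '{ "keywords": ["chief product officer", "united states"], "numPages": 1 }' \
    > storage/key_value_stores/default/INPUT.json
npm start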
Maintained by Community
Actor metrics
  • 23 monthly users
  • 3 stars
  • 100.0% runs succeeded
  • Created in Jul 2024
  • Modified 3 months ago