LinkedIn Profile URL Scrapper (from Google)
Try for free
No credit card required
View all Actors
LinkedIn Profile URL Scrapper (from Google)
glowing_treat/linkedin-scrapper-from-google
Try for free
No credit card required
Scrape LinkedIn profile URLs from Google Search results using a designation (job title) and a location.
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor",
4 "title": "Project Cheerio Crawler Javascript",
5 "description": "Crawlee and Cheerio project in javascript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-cheerio"
9 },
10 "input": "./input_schema.json",
11 "storages": {
12 "dataset": "./dataset_schema.json"
13 },
14 "dockerfile": "./Dockerfile"
15}
.actor/dataset_schema.json
1{
2 "actorSpecification": 1,
3 "fields": {
4 "LinkedIn URL": {
5 "type": "string",
6 "description": "The URL of the LinkedIn profile."
7 }
8 },
9 "views": {
10 "overview": {
11 "title": "Overview",
12 "transformation": {
13 "fields": ["LinkedIn URL"]
14 },
15 "display": {
16 "component": "table",
17 "properties": {
18 "LinkedIn URL": {
19 "label": "LinkedIn Profile URL",
20 "format": "link"
21 }
22 }
23 }
24 }
25 }
26}
.actor/input_schema.json
1{
2 "title": "Scrape LinkedIn profiles based on keywords",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "keywords": {
7 "title": "Search Keywords",
8 "type": "array",
9 "description": "Enter the keywords used to search for LinkedIn profiles, e.g., a job title and a location. Only the first two keywords are used.",
10 "editor": "stringList",
11 "items": {
12 "type": "string"
13 },
14 "prefill": ["chief product officer", "united states"]
15 },
16 "numPages": {
17 "title": "Number of Pages",
18 "type": "integer",
19 "description": "The number of pages to scrape (each page corresponds to a set of search results).",
20 "editor": "number",
21 "minimum": 1,
22 "default": 1
23 }
24 },
25 "required": ["keywords", "numPages"]
26}
.actor/output_schema.json
1{
2 "title": "LinkedIn Profile URLs",
3 "type": "array",
4 "items": {
5 "type": "object",
6 "properties": {
7 "LinkedIn URL": {
8 "type": "string",
9 "description": "The URL of the LinkedIn profile."
10 }
11 },
12 "required": ["LinkedIn URL"]
13 }
14}
src/main.js
1import { Actor, log } from 'apify';
2import { CheerioCrawler } from 'crawlee';
3
// Actor entry point: searches Google for LinkedIn profile pages matching the
// given keywords, collects the unique profile URLs found in the result pages
// and stores them in the default dataset as `{ "LinkedIn URL": <url> }` items.

/** Keywords used when the input does not provide a usable `keywords` array. */
const DEFAULT_KEYWORDS = ['chief product officer', 'united states'];

/** Offset added to the `start` query parameter for each additional result page. */
const RESULTS_PER_PAGE = 10;

/** Matches a LinkedIn profile URL inside an href, stopping at the first `&`. */
const LINKEDIN_PROFILE_PATTERN = /(https?:\/\/www\.linkedin\.com\/in\/[^&]+)/;

/**
 * Builds one Google search URL per requested result page.
 *
 * The query is built with `URL`/`searchParams` so that every keyword is
 * percent-encoded correctly (characters such as `&`, `#`, `+` or `"` inside a
 * keyword no longer corrupt the query string).
 *
 * @param {string[]} keywords - Keywords; each one is wrapped in double quotes.
 * @param {number} numPages - Number of result pages to generate URLs for.
 * @returns {string[]} Search URLs with `start` set to 0, 10, 20, ...
 */
function buildSearchUrls(keywords, numPages) {
    const quotedKeywords = keywords.map((keyword) => `"${String(keyword)}"`).join(' ');
    const query = `site:linkedin.com/in/ ${quotedKeywords}`;

    const urls = [];
    for (let page = 0; page < numPages; page++) {
        const url = new URL('https://www.google.com/search');
        url.searchParams.set('q', query);
        url.searchParams.set('start', String(page * RESULTS_PER_PAGE));
        urls.push(url.href);
    }
    return urls;
}

/**
 * Extracts a LinkedIn profile URL from an anchor's `href` attribute.
 *
 * @param {string | undefined} href - Raw `href` value; may be missing.
 * @returns {string | null} The matched profile URL, or `null` when there is none.
 */
function extractLinkedinUrl(href) {
    if (typeof href !== 'string') return null;
    const match = href.match(LINKEDIN_PROFILE_PATTERN);
    return match ? match[1] : null;
}

await Actor.init();

try {
    // Actor.getInput() resolves to null when the run has no input at all.
    const input = (await Actor.getInput()) ?? {};

    // Only the first two keywords are used; anything that is not an array
    // falls back to the defaults instead of crashing later in `.map()`.
    const keywords = Array.isArray(input.keywords)
        ? input.keywords.slice(0, 2)
        : DEFAULT_KEYWORDS;

    // `||` is intentional: 0 is not a valid page count, so it also falls back to 1.
    const numPages = input.numPages || 1;

    const headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    };

    // A Set keeps the collected profile URLs unique across all result pages.
    const linkedinUrls = new Set();

    const requestQueue = await Actor.openRequestQueue();
    for (const url of buildSearchUrls(keywords, numPages)) {
        await requestQueue.addRequest({ url, headers });
    }

    const crawler = new CheerioCrawler({
        requestQueue,
        requestHandler: async ({ request, $ }) => {
            log.info(`Processing ${request.url}...`);

            $('a').each((_index, element) => {
                const linkedinUrl = extractLinkedinUrl($(element).attr('href'));
                if (linkedinUrl) {
                    linkedinUrls.add(linkedinUrl);
                }
            });
        },
    });

    await crawler.run();

    // Push the unique results to the dataset in a single call.
    const results = Array.from(linkedinUrls, (url) => ({ 'LinkedIn URL': url }));
    await Actor.pushData(results);

    log.info(`Found and saved ${linkedinUrls.size} unique LinkedIn URLs based on the keywords across ${numPages} pages.`);
} catch (error) {
    // Report the failure through the SDK so the run ends with a failed status
    // and a status message instead of dying on an unhandled top-level rejection.
    log.exception(error, 'Error during actor run');
    await Actor.fail(`Actor run failed: ${error?.message ?? error}`);
}

await Actor.exit();
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-cheerio-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is a boilerplate of an Apify actor.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.1.10",
11 "crawlee": "^3.5.4",
12 "googleapis": "^118.0.0"
13 },
14 "devDependencies": {
15 "@apify/eslint-config": "^0.4.0",
16 "eslint": "^8.50.0"
17 },
18 "scripts": {
19 "start": "node src/main.js",
20 "lint": "eslint ./src --ext .js,.jsx",
21 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
22 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
23 },
24 "author": "It's not you it's me",
25 "license": "ISC"
26}
Developer
Maintained by Community
Actor metrics
- 23 monthly users
- 3 stars
- 100.0% runs succeeded
- Created in Jul 2024
- Modified 3 months ago
Categories