Linkedin Url Scrape

mikepowers/linkedin-url-scrape

Scrape Unlimited LinkedIn Profile URLs

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-15",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape LinkedIn profiles based on keywords",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "keywords": {
            "title": "Search Keywords",
            "type": "array",
            "description": "Enter the keywords to search for LinkedIn profiles, e.g., job titles, industries, locations.",
            "editor": "stringList",
            "items": {
                "type": "string"
            },
            "prefill": ["chief product officer", "united states", "insurance"]
        },
        "numPages": {
            "title": "Number of Pages",
            "type": "integer",
            "description": "The number of pages to scrape (each page corresponds to a set of search results).",
            "editor": "number",
            "minimum": 1,
            "default": 1
        }
    },
    "required": ["keywords", "numPages"]
}
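
For reference, the object this schema validates is just a keywords list plus a page count. Below is a minimal sketch of the prefilled input as `Actor.get_input()` would return it, and of how `numPages` maps to the Google `start` offsets used in src/main.py further down (the 10-results-per-page step is an assumption mirroring that loop):

# Hypothetical run input matching .actor/input_schema.json (prefill values shown)
example_input = {
    "keywords": ["chief product officer", "united states", "insurance"],
    "numPages": 2,
}

# numPages maps to Google result offsets in steps of 10, as in src/main.py
offsets = list(range(0, example_input["numPages"] * 10, 10))
print(offsets)  # [0, 10]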

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

import asyncio
from bs4 import BeautifulSoup
import requests
from apify import Actor
import re

async def main() -> None:
    async with Actor() as actor:
        actor_input = await actor.get_input() or {}
        keywords = actor_input.get('keywords', ["chief product officer", "united states", "insurance"])
        num_pages = actor_input.get('numPages', 1)  # Get the number of pages from input

        base_url = 'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+'
        formatted_keywords = '+'.join(f'(%22{keyword.replace(" ", "+")}%22)' for keyword in keywords)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        linkedin_urls = []

        # Loop through pages based on `num_pages`, incrementing `start` by 10 for each page
        for page_start in range(0, num_pages * 10, 10):
            url = f"{base_url}{formatted_keywords}&start={page_start}"
            print(url)
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)

            # Extract LinkedIn URLs
            for link in links:
                match = re.search(r'(https?://www\.linkedin\.com/in/[^&]+)', link['href'])
                if match:
                    linkedin_url = match.group(1)
                    if linkedin_url not in linkedin_urls:  # Avoid duplicates
                        linkedin_urls.append(linkedin_url)

        # Output the LinkedIn URLs
        for url in linkedin_urls:
            await actor.push_data({"LinkedIn URL": url})

        actor.log.info(f"Found and saved {len(linkedin_urls)} LinkedIn URLs based on the keywords across {num_pages} pages.")

if __name__ == '__main__':
    asyncio.run(main())
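
To see what the Actor actually requests and extracts, the query building and URL extraction above can be exercised outside the Apify runtime. A minimal sketch, with the keyword formatting and regex copied from main.py and a made-up sample href for illustration:

# Standalone sketch of the query building and URL extraction from src/main.py
import re

keywords = ["chief product officer", "united states", "insurance"]
base_url = 'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+'
formatted_keywords = '+'.join(f'(%22{keyword.replace(" ", "+")}%22)' for keyword in keywords)

# First results page (start=0)
print(f"{base_url}{formatted_keywords}&start=0")

# The same regex main.py applies to every <a href> in the results page;
# the href below is invented purely for illustration.
sample_href = "/url?q=https://www.linkedin.com/in/jane-doe-12345&sa=U"
match = re.search(r'(https?://www\.linkedin\.com/in/[^&]+)', sample_href)
if match:
    print(match.group(1))  # https://www.linkedin.com/in/jane-doe-12345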

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.6.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
requests ~= 2.28.1
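
Note that httpx is pinned here even though src/main.py fetches with the blocking requests library inside an async coroutine. If you wanted the fetch to be non-blocking, a hedged sketch of an httpx-based replacement follows (not part of this Actor's code; it assumes the same url and headers values as main.py):

# Optional sketch: async fetch with httpx instead of the blocking requests.get
import asyncio
import httpx

async def fetch(url: str, headers: dict) -> str:
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        response = await client.get(url)
        return response.text

# Example usage with a placeholder URL:
# html = asyncio.run(fetch("https://www.google.com/search?q=example", {"User-Agent": "Mozilla/5.0"}))
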
Maintained by Community
Actor metrics
  • 116 monthly users
  • 8 stars
  • 100.0% runs succeeded
  • 77 days response time
  • Created in Mar 2024
  • Modified 4 months ago