LinkedIn URL Scrape
mikepowers/linkedin-url-scrape
Scrape Unlimited LinkedIn Profile URLs
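This Actor searches Google for public LinkedIn profile pages (site:linkedin.com/in/) matching your keywords and saves each unique profile URL to the dataset. Besides the Apify Console, it can be run programmatically. Below is a minimal sketch using the official apify-client Python package, assuming a valid Apify API token (the placeholder token is hypothetical); the run_input fields mirror the input schema shown further down.

from apify_client import ApifyClient

# Hypothetical token; replace with your own Apify API token.
client = ApifyClient('<YOUR_APIFY_TOKEN>')

# Start the Actor and wait for it to finish. The input fields match
# the Actor's input schema: `keywords` and `numPages`.
run = client.actor('mikepowers/linkedin-url-scrape').call(run_input={
    'keywords': ['chief product officer', 'united states', 'insurance'],
    'numPages': 1,
})

# Each dataset item holds one extracted profile URL.
for item in client.dataset(run['defaultDatasetId']).iterate_items():
    print(item['LinkedIn URL'])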
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor-15",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape LinkedIn profiles based on keywords",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "keywords": {
            "title": "Search Keywords",
            "type": "array",
            "description": "Enter the keywords to search for LinkedIn profiles, e.g., job titles, industries, locations.",
            "editor": "stringList",
            "items": {
                "type": "string"
            },
            "prefill": ["chief product officer", "united states", "insurance"]
        },
        "numPages": {
            "title": "Number of Pages",
            "type": "integer",
            "description": "The number of pages to scrape (each page corresponds to a set of search results).",
            "editor": "number",
            "minimum": 1,
            "default": 1
        }
    },
    "required": ["keywords", "numPages"]
}
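For reference, a concrete input that validates against this schema (built from the prefill and default values above) looks like this:

{
    "keywords": ["chief product officer", "united states", "insurance"],
    "numPages": 1
}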
src/__main__.py
1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())
src/main.py
import asyncio
import re

import requests
from apify import Actor
from bs4 import BeautifulSoup


async def main() -> None:
    async with Actor() as actor:
        actor_input = await actor.get_input() or {}
        keywords = actor_input.get('keywords', ["chief product officer", "united states", "insurance"])
        num_pages = actor_input.get('numPages', 1)  # Number of Google result pages to scrape

        # Restrict the Google search to linkedin.com/in/ profile pages.
        base_url = 'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+'
        formatted_keywords = '+'.join(f'(%22{keyword.replace(" ", "+")}%22)' for keyword in keywords)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        }

        linkedin_urls = []

        # Loop through pages based on `num_pages`, incrementing `start` by 10 for each page
        for page_start in range(0, num_pages * 10, 10):
            url = f"{base_url}{formatted_keywords}&start={page_start}"
            actor.log.info(f"Fetching {url}")
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)

            # Extract LinkedIn profile URLs from the result links
            for link in links:
                match = re.search(r'(https?://www\.linkedin\.com/in/[^&]+)', link['href'])
                if match:
                    linkedin_url = match.group(1)
                    if linkedin_url not in linkedin_urls:  # Avoid duplicates
                        linkedin_urls.append(linkedin_url)

        # Push each LinkedIn URL to the default dataset
        for linkedin_url in linkedin_urls:
            await actor.push_data({"LinkedIn URL": linkedin_url})

        actor.log.info(f"Found and saved {len(linkedin_urls)} LinkedIn URLs based on the keywords across {num_pages} pages.")


if __name__ == '__main__':
    asyncio.run(main())
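The extraction above relies on Google embedding the target address in each result link's href, typically in a /url?q=... redirect. Here is a standalone sketch of the same regex applied to a hypothetical href of that shape:

import re

# Hypothetical href, shaped like a redirect link on a Google results page.
href = '/url?q=https://www.linkedin.com/in/jane-doe-12345&sa=U&ved=0ahUKEw'

# Same pattern as in src/main.py: `[^&]+` stops at the next '&', so the
# tracking parameters Google appends are excluded from the captured URL.
match = re.search(r'(https?://www\.linkedin\.com/in/[^&]+)', href)
if match:
    print(match.group(1))  # https://www.linkedin.com/in/jane-doe-12345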
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.6.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
requests ~= 2.28.1