Job Openings Metadata
Pricing: Pay per usage
Rating: 0.0 (0 reviews)
Last modified: 6 days ago
{ "actorSpecification": 1, "name": "my-actor-2", "title": "Getting started with Python and BeautifulSoup", "description": "Scrapes titles of websites using BeautifulSoup.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-beautifulsoup" }, "input": "./input_schema.json", "dockerfile": "../Dockerfile"}
{ "title": "Python BeautifulSoup Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [{ "url": "https://apify.com" }], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which to scrape to", "default": 1 } }, "required": ["start_urls"]}
import asyncio

from .main import main

# Execute the Actor entrypoint.
asyncio.run(main())
1"""Main entry point for the LinkedIn Job Count Metadata Scraper.2This Actor fetches job counts from LinkedIn job search URLs using HTTP requests and HTML parsing,3optimized for speed with 1-11 uniform random concurrent tasks to process 6,844 URLs in a single run,4spanning approximately 16 hours with random delays of 500-3,500 milliseconds between each query5and random delays of 3-5 seconds between retry attempts, with 5 retries.6Other queries continue processing during retry delays.7Key Features:8- Uses HTTP GET requests with a browser-like User-Agent to fetch raw HTML.9- Parses the initial HTML for job counts using BeautifulSoup, targeting '.results-context-header__context'10 for job numbers. Labels results as "No Jobs" if the title contains "0 jobs in" or the11 '.no-results__main-title-keywords' element is present, falling back to "Failed to Load" for failures.12- Pushes extracted data (URL, job count, timestamp) to the Apify dataset.13Usage:14- Input JSON should contain a 'start_urls' array with URLs like {'url': 'https://www.linkedin.com/jobs/search/?f_C=123&geoID=103644278', 'method': 'GET'}.15- Run the Actor via the Apify Starter Plan with 128 MB memory16- Monitor logs for success (e.g., "Extracted job count: 2 results") or failures (e.g., "No Jobs" or "Failed to Load").17- Relies on Apify's proxy rotation; a paid proxy plan18- Runtime may vary slightly based on randomization, retries, and network conditions.19Dependencies:20- apify>=2.7.321- requests>=2.28.022- beautifulsoup4>=4.12.023"""24
import asyncio
import random
from datetime import datetime
from typing import List

import requests
from apify import Actor
from bs4 import BeautifulSoup


async def fetch_url(url: str, proxy_configuration) -> tuple[str, str]:
    max_attempts = 5  # Retry each URL up to 5 times.
    for attempt in range(max_attempts):
        try:
            proxy_url = await proxy_configuration.new_url()
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
                )
            }
            response = await asyncio.to_thread(
                requests.get,
                url,
                headers=headers,
                proxies={"http": proxy_url, "https": proxy_url},
                timeout=30,
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                Actor.log.info(f"Successfully fetched HTML for {url}")
                # Guard against pages without a <title> (or with an empty one).
                title = soup.title.string.lower() if soup.title and soup.title.string else ""
                if title.startswith("0 jobs in"):  # Strict check for "0 jobs in" at the start.
                    return url, "No Jobs"
                no_jobs_element = soup.find(class_="no-results__main-title-keywords")
                if no_jobs_element:
                    return url, "No Jobs"
                job_element = soup.find(class_="results-context-header__context")
                if job_element:
                    job_count_text = job_element.get_text().strip()
                    if any(char.isdigit() for char in job_count_text):
                        return url, job_count_text.split()[0] + " results"  # e.g., "2 results"
                return url, "Failed to Load"
            else:
                Actor.log.error(f"Failed to fetch {url}, status code: {response.status_code}")
                return url, "Failed to Load"
        except Exception as e:
            Actor.log.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt < max_attempts - 1:
                retry_delay = random.uniform(3, 5)  # Random delay of 3-5 seconds before retrying.
                Actor.log.info(f"Pausing for {retry_delay:.1f} seconds before retry {attempt + 2}...")
                await asyncio.sleep(retry_delay)
                continue
    return url, "Failed to Load"


async def process_urls(urls: List[str], proxy_configuration):
    max_concurrent_tasks = 8  # Cap at 8 concurrent tasks to fit within 128 MB of memory.
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def bounded_fetch(url):
        async with semaphore:
            delay = random.uniform(0.5, 3.5)  # Random delay of 500-3,500 ms before each query.
            await asyncio.sleep(delay)
            result = await fetch_url(url, proxy_configuration)
            return url, result[1]  # Return the URL and its job count.

    # Process URLs in smaller chunks to manage memory.
    chunk_size = 100  # Process in chunks to avoid creating tasks for all URLs at once.
    for i in range(0, len(urls), chunk_size):
        chunk = urls[i:i + chunk_size]
        tasks = [bounded_fetch(url) for url in chunk]
        for task in asyncio.as_completed(tasks):  # Handle results as tasks complete, avoiding a double await.
            url, job_count = await task
            data = {
                "url": url,
                "job_count": job_count,
                "timestamp": datetime.utcnow().isoformat() + "Z",
            }
            await Actor.push_data(data)
            Actor.log.info(f"Extracted job count: {job_count} from {url}")
        # No additional gather is needed; as_completed awaits every task in the chunk.


async def main():
    async with Actor:
        # Retrieve the Actor input from the Apify platform, exiting if no URLs are provided.
        actor_input = await Actor.get_input() or {}
        Actor.log.info(f"Received input: {actor_input}")
        all_start_urls = [url.get("url") for url in actor_input.get("start_urls", [])]

        if not all_start_urls:
            Actor.log.error("No start URLs specified in Actor input, exiting...")
            await Actor.exit()

        # Use all provided URLs without randomization.
        start_urls = all_start_urls
        Actor.log.info(f"Starting crawl with {len(start_urls)} URLs: {start_urls}")

        # Configure the proxy for HTTP requests using Apify's residential proxy pool.
        proxy_configuration = await Actor.create_proxy_configuration(groups=["RESIDENTIAL"])

        # Process the URLs individually, with per-query delays and concurrent retries.
        await process_urls(start_urls, proxy_configuration)


if __name__ == "__main__":
    asyncio.run(main())
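To sanity-check the selector logic above without running the full Actor (no Apify runtime or proxies involved), a small standalone sketch like the following can be run against a locally saved LinkedIn search results page. It mirrors the checks in fetch_url(); the file name jobs_page.html is an assumed local fixture, not part of this repository.

# Standalone sketch: applies the same BeautifulSoup checks as fetch_url()
# to a saved HTML page instead of a live request.
from bs4 import BeautifulSoup


def extract_job_count(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string.lower() if soup.title and soup.title.string else ""
    if title.startswith("0 jobs in"):
        return "No Jobs"
    if soup.find(class_="no-results__main-title-keywords"):
        return "No Jobs"
    job_element = soup.find(class_="results-context-header__context")
    if job_element:
        text = job_element.get_text().strip()
        if any(char.isdigit() for char in text):
            return text.split()[0] + " results"
    return "Failed to Load"


if __name__ == "__main__":
    # jobs_page.html is an assumed fixture saved from a LinkedIn job search URL.
    with open("jobs_page.html", encoding="utf-8") as f:
        print(extract_job_count(f.read()))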
.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Visual Studio Code
#  Ignores the folder created by VS Code when changing workspace settings, doing debugger
#  configuration, etc. Can be commented out to share Workspace Settings within a team.
.vscode

# Zed editor
#  Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
#  to share Project Settings within a team.
.zed
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version and pip version,
# and list all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q src/

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
apify>=2.7.3
requests>=2.28.0
beautifulsoup4>=4.12.0
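As a usage sketch (not part of the Actor itself), a deployed copy of this Actor can be started and its dataset read with the separate apify-client package, which is not listed in requirements.txt. The Actor ID "username/job-openings-metadata", the token, and the start URL below are placeholders.

# Usage sketch with the apify-client package (pip install apify-client).
# The token, Actor ID, and URL are placeholders, not values from this repository.
from apify_client import ApifyClient

client = ApifyClient("MY-APIFY-TOKEN")

run_input = {
    "start_urls": [
        {"url": "https://www.linkedin.com/jobs/search/?f_C=123&geoID=103644278", "method": "GET"}
    ]
}

# Start the Actor run and wait for it to finish.
run = client.actor("username/job-openings-metadata").call(run_input=run_input)

# Iterate over the pushed items (url, job_count, timestamp).
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["url"], item["job_count"], item["timestamp"])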