1"""Main entry point for the LinkedIn Job Count Metadata Scraper.
2This Actor fetches job counts from LinkedIn job search URLs using HTTP requests and HTML parsing,
3optimized for speed with 1-11 uniform random concurrent tasks to process 6,844 URLs in a single run,
4spanning approximately 16 hours with random delays of 500-3,500 milliseconds between each query
5and random delays of 3-5 seconds between retry attempts, with 5 retries.
6Other queries continue processing during retry delays.
7Key Features:
8- Uses HTTP GET requests with a browser-like User-Agent to fetch raw HTML.
9- Parses the initial HTML for job counts using BeautifulSoup, targeting '.results-context-header__context'
10 for job numbers. Labels results as "No Jobs" if the title contains "0 jobs in" or the
11 '.no-results__main-title-keywords' element is present, falling back to "Failed to Load" for failures.
12- Pushes extracted data (URL, job count, timestamp) to the Apify dataset.
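
A successful URL produces a dataset record shaped like the following (illustrative values only;
the timestamp is generated at extraction time):

    {
        "url": "https://www.linkedin.com/jobs/search/?f_C=123&geoID=103644278",
        "job_count": "2 results",
        "timestamp": "2024-01-01T00:00:00.000000Z"
    }
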
Usage:
- Input JSON should contain a 'start_urls' array with entries like
  {'url': 'https://www.linkedin.com/jobs/search/?f_C=123&geoID=103644278', 'method': 'GET'}
  (see the example input after this list).
- Run the Actor on the Apify Starter Plan with 128 MB of memory.
- Monitor logs for success (e.g., "Extracted job count: 2 results") or failures (e.g., "No Jobs" or "Failed to Load").
- Relies on Apify's proxy rotation; a paid plan with access to the RESIDENTIAL proxy group is required.
- Runtime may vary slightly based on randomization, retries, and network conditions.
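
Example input (illustrative only; the f_C and geoID values are placeholders):

    {
        "start_urls": [
            {"url": "https://www.linkedin.com/jobs/search/?f_C=123&geoID=103644278", "method": "GET"},
            {"url": "https://www.linkedin.com/jobs/search/?f_C=456&geoID=103644278", "method": "GET"}
        ]
    }
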
Dependencies:
- apify>=2.7.3
- requests>=2.28.0
- beautifulsoup4>=4.12.0
"""

import asyncio
import random
from datetime import datetime
from typing import List

import requests
from apify import Actor
from bs4 import BeautifulSoup


async def fetch_url(url: str, proxy_configuration) -> tuple[str, str]:
    """Fetch a single job search URL and return (url, job_count_label)."""
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            # Rotate to a fresh proxy URL on every attempt.
            proxy_url = await proxy_configuration.new_url()
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
            # Run the blocking requests call in a worker thread so other queries keep processing.
            response = await asyncio.to_thread(
                requests.get, url, headers=headers, proxies={"http": proxy_url, "https": proxy_url}, timeout=30
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                Actor.log.info(f"Successfully fetched HTML for {url}")
                # A "0 Jobs in ..." title or the no-results element both mean the search returned no jobs.
                title = (soup.title.string or "").lower() if soup.title else ""
                if title.startswith("0 jobs in"):
                    return url, "No Jobs"
                no_jobs_element = soup.find(class_="no-results__main-title-keywords")
                if no_jobs_element:
                    return url, "No Jobs"
                job_element = soup.find(class_="results-context-header__context")
                if job_element:
                    job_count_text = job_element.get_text().strip()
                    if any(char.isdigit() for char in job_count_text):
                        return url, job_count_text.split()[0] + " results"
                return url, "Failed to Load"
            else:
                Actor.log.error(f"Failed to fetch {url}, status code: {response.status_code}")
                return url, "Failed to Load"
        except Exception as e:
            Actor.log.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt < max_attempts - 1:
                retry_delay = random.uniform(3, 5)
                Actor.log.info(f"Pausing for {retry_delay:.1f} seconds before retry {attempt + 2}...")
                await asyncio.sleep(retry_delay)
                continue
    # All attempts raised exceptions.
    return url, "Failed to Load"


async def process_urls(urls: List[str], proxy_configuration):
    """Process URLs with bounded concurrency, pushing one dataset record per URL."""
    max_concurrent_tasks = 8
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def bounded_fetch(url):
        async with semaphore:
            # Random 500-3,500 ms pause before each query to avoid a uniform request pattern.
            delay = random.uniform(0.5, 3.5)
            await asyncio.sleep(delay)
            result = await fetch_url(url, proxy_configuration)
            return url, result[1]

    # Work through the URLs in chunks, pushing each result as soon as it completes.
    chunk_size = 100
    for i in range(0, len(urls), chunk_size):
        chunk = urls[i:i + chunk_size]
        tasks = [bounded_fetch(url) for url in chunk]
        for task in asyncio.as_completed(tasks):
            url, job_count = await task
            data = {
                "url": url,
                "job_count": job_count,
                "timestamp": datetime.utcnow().isoformat() + "Z",
            }
            await Actor.push_data(data)
            Actor.log.info(f"Extracted job count: {job_count} from {url}")


async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        Actor.log.info(f"Received input: {actor_input}")
        all_start_urls = [url.get("url") for url in actor_input.get("start_urls", [])]

        if not all_start_urls:
            Actor.log.error("No start URLs specified in Actor input, exiting...")
            await Actor.exit()
            return

        start_urls = all_start_urls
        Actor.log.info(f"Starting crawl with {len(start_urls)} URLs: {start_urls}")

        proxy_configuration = await Actor.create_proxy_configuration(groups=["RESIDENTIAL"])

        await process_urls(start_urls, proxy_configuration)


if __name__ == "__main__":
    asyncio.run(main())