1"""
2LinkedIn job scraper for the Apify actor.
3
4Guest API (no cookie) covers the core fields.
5Voyager API (li_at cookie) adds deep enrichment: company profile,
6structured salary, job poster photo/title, workplace type, expiry, and more.
7"""
8
9import json
10import logging
11import re
12import time
13from datetime import datetime, timezone
14from typing import Iterator, Optional
15from urllib.parse import parse_qs, urlparse, unquote
16
17import requests
18from bs4 import BeautifulSoup
19
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# Values sent as the ``f_TPR`` search parameter, keyed by the
# ``published_within`` option. Each non-empty value is "r" followed by the
# number of seconds in the window (86400 == one day); "any" sends no filter.
_SECONDS_PER_DAY = 86_400
TIME_FILTERS = {
    "day": f"r{_SECONDS_PER_DAY}",
    "week": f"r{7 * _SECONDS_PER_DAY}",
    "month": f"r{30 * _SECONDS_PER_DAY}",
    "any": "",
}

# Trailing numeric code of a Voyager workplace-type URN -> display label.
WORKPLACE_TYPE_MAP = {
    str(code): label
    for code, label in enumerate(("On-site", "Remote", "Hybrid"), start=1)
}

# Substring of the Voyager ``applyMethod.$type`` -> label stored on the job.
# Insertion order matters: the table is scanned in order and the first
# matching key wins.
APPLY_METHOD_MAP = {
    "OffsiteApply": "OffsiteApply",
    "ComplexOnsiteApply": "ComplexOnsiteApply",
    "EasyApplyMethod": "EasyApply",
}
36
37
class LinkedInScraper:
    """Scrape LinkedIn job listings and enrich them with posting details.

    ``search`` pages through the guest search endpoint and yields one dict per
    result card. ``enrich`` then adds data from the guest detail page and,
    when an ``li_at`` cookie and a CSRF token are available, from the Voyager
    API. Jobs are plain ``dict`` objects that are mutated in place.
    """

    GUEST_SEARCH_URL = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    GUEST_DETAIL_URL = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    VOYAGER_JOB_URL = "https://www.linkedin.com/voyager/api/jobs/jobPostings/{job_id}"
    VOYAGER_DECORATION = "com.linkedin.voyager.deco.jobs.web.shared.WebFullJobPosting-65"
    VOYAGER_ACCEPT = "application/vnd.linkedin.normalized+json+2.1"

    # Amount the ``start`` offset is advanced by after each search page.
    PAGE_SIZE = 25

    BASE_HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(
        self,
        li_at_cookie: str = "",
        proxy_url: Optional[str] = None,
        request_delay: float = 2.0,
        max_retries: int = 3,
    ):
        """Create the HTTP session and, if a cookie is given, fetch a CSRF token.

        Args:
            li_at_cookie: Value of the ``li_at`` cookie; empty/``None`` keeps
                the scraper in guest-only mode.
            proxy_url: Proxy used for both http and https requests, if set.
            request_delay: Seconds slept after each search/detail request.
            max_retries: Attempts made per request by ``_get`` (at least one
                attempt is always made).
        """
        self.li_at_cookie = (li_at_cookie or "").strip()
        self.proxy_url = proxy_url
        self.request_delay = request_delay
        self.max_retries = max_retries
        self._csrf_token: Optional[str] = None

        self.session = requests.Session()
        self.session.headers.update(self.BASE_HEADERS)

        if proxy_url:
            self.session.proxies = {"http": proxy_url, "https": proxy_url}

        if self.li_at_cookie:
            self.session.cookies.set("li_at", self.li_at_cookie, domain=".linkedin.com")
            self._init_csrf()

    def _init_csrf(self) -> None:
        """Load the jobs page and reuse the JSESSIONID cookie as ``csrf-token``.

        Failures are logged and leave ``_csrf_token`` as ``None``, in which
        case ``enrich`` skips the Voyager step.
        """
        try:
            self.session.get("https://www.linkedin.com/jobs/", timeout=15)
            csrf = self.session.cookies.get("JSESSIONID", "").strip('"')
            if csrf:
                self._csrf_token = csrf
                self.session.headers["csrf-token"] = csrf
        except Exception as exc:
            logger.warning("Could not fetch CSRF token: %s", exc)

    def _get(
        self,
        url: str,
        params: Optional[dict] = None,
        accept: Optional[str] = None,
    ) -> "requests.Response":
        """GET ``url`` with retries and return the successful response.

        HTTP 429 responses wait ``30 * attempt`` seconds before the next try;
        other request failures wait ``5 * attempt`` seconds.

        Raises:
            requests.RequestException: The last attempt failed. This includes
                an ``HTTPError`` when the final response is still a 429
                (previously the method silently returned ``None``).
        """
        headers = {"Accept": accept} if accept else {}
        # Always try at least once, even if max_retries was configured as 0.
        attempts = max(1, self.max_retries)

        for attempt in range(1, attempts + 1):
            is_last = attempt == attempts
            try:
                resp = self.session.get(url, params=params, headers=headers, timeout=15)
                if resp.status_code != 429:
                    resp.raise_for_status()
                    return resp
            except requests.RequestException as exc:
                if is_last:
                    raise
                logger.warning("Request failed (%s) — retry %d/%d", exc, attempt, self.max_retries)
                time.sleep(5 * attempt)
                continue

            # Only 429 responses reach this point.
            if not is_last:
                wait = 30 * attempt
                logger.warning("Rate-limited — waiting %ds (attempt %d/%d)", wait, attempt, self.max_retries)
                time.sleep(wait)

        # The loop can only fall through when the final attempt was a 429.
        raise requests.HTTPError(
            f"429 Too Many Requests after {attempts} attempt(s): {url}",
            response=resp,
        )

    def search(
        self,
        keywords: str,
        location: str = "",
        published_within: str = "any",
        max_results: int = 25,
        input_url: str = "",
    ) -> Iterator[dict]:
        """Yield basic job dicts parsed from the guest search result pages.

        Args:
            keywords: Sent as the ``keywords`` query parameter.
            location: Sent as the ``location`` query parameter.
            published_within: Key of ``TIME_FILTERS``; unknown values (or
                ``None``) apply no time filter.
            max_results: Stop after this many jobs have been yielded.
            input_url: Copied into each job as ``inputUrl``.

        Yields:
            One dict per successfully parsed result card.

        Raises:
            requests.RequestException: A page could not be fetched.
        """
        time_filter = TIME_FILTERS.get((published_within or "any").lower(), "")
        collected = 0
        start = 0

        while collected < max_results:
            params = {"keywords": keywords, "location": location, "start": start}
            if time_filter:
                params["f_TPR"] = time_filter

            logger.info("Search page start=%d (collected %d/%d)", start, collected, max_results)
            resp = self._get(self.GUEST_SEARCH_URL, params=params)
            time.sleep(self.request_delay)

            soup = BeautifulSoup(resp.text, "html.parser")
            cards = soup.find_all("div", class_=re.compile(r"base-card"))
            if not cards:
                # An empty page ends the result set.
                break

            for card in cards:
                if collected >= max_results:
                    break
                job = self._parse_card(card, input_url=input_url)
                if job:
                    yield job
                    collected += 1

            start += self.PAGE_SIZE

    def _parse_card(self, card: "BeautifulSoup", input_url: str = "") -> Optional[dict]:
        """Convert one search-result card into a job dict.

        Returns ``None`` when the card has no job link; every other field is
        optional and only set when the matching element is present.
        """
        link_el = (
            card.find("a", class_=re.compile(r"base-card__full-link"))
            or card.find("a", href=re.compile(r"/jobs/view/"))
        )
        if not link_el:
            return None

        job: dict = {}
        raw_href = link_el.get("href", "")
        job["link"] = raw_href

        # jobUrl is the link without its query string; the tracking values
        # are lifted out of that query string.
        parsed = urlparse(raw_href)
        query = parse_qs(parsed.query)
        job["jobUrl"] = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        job["trackingId"] = unquote(query.get("trackingId", [""])[0])
        job["refId"] = unquote(query.get("refId", [""])[0])

        title_el = card.find(["h3", "span"], class_=re.compile(r"base-search-card__title"))
        if title_el:
            job["title"] = title_el.get_text(strip=True)

        company_el = card.find(["h4", "a"], class_=re.compile(r"base-search-card__subtitle"))
        if company_el:
            job["companyName"] = company_el.get_text(strip=True)
            if company_el.get("href"):
                job["companyLinkedinUrl"] = company_el["href"].split("?")[0]

        img = card.find("img", class_=re.compile(r"artdeco-entity-image"))
        if img:
            logo = img.get("data-delayed-url") or img.get("src", "")
            # URLs containing "ghost" are skipped (presumably placeholders).
            if logo and "ghost" not in logo:
                job["companyLogo"] = logo

        loc_el = card.find("span", class_=re.compile(r"job-search-card__location"))
        if loc_el:
            job["location"] = loc_el.get_text(strip=True)

        time_el = card.find("time")
        if time_el:
            dt_str = time_el.get("datetime", "")
            job["postedAt"] = dt_str
            job["postedAtTimestamp"] = self._iso_to_millis(dt_str)

        job["inputUrl"] = input_url
        job["scrapedAt"] = datetime.now(timezone.utc).isoformat()
        return job

    def enrich(self, job: dict) -> dict:
        """Add detail data to ``job`` in place and return it.

        The guest detail page is always tried; Voyager is tried only when a
        cookie and CSRF token are available. Failures of either step are
        logged and do not raise. A job whose ID cannot be extracted from
        ``jobUrl`` is returned unchanged.
        """
        job_id = self._extract_job_id(job.get("jobUrl"))
        if not job_id:
            return job

        job["id"] = job_id

        try:
            self._enrich_guest(job, job_id)
        except Exception as exc:
            logger.warning("Guest enrich failed for %s: %s", job_id, exc)

        if self.li_at_cookie and self._csrf_token:
            try:
                self._enrich_voyager(job, job_id)
            except Exception as exc:
                logger.warning("Voyager enrich failed for %s: %s", job_id, exc)

        return job

    @staticmethod
    def _extract_job_id(url: Optional[str]) -> Optional[str]:
        """Return the numeric job ID found in ``url``, or ``None``.

        Tries a trailing ``-<7+ digits>`` slug suffix first, then
        ``/view/<digits>``.
        """
        if not url:
            return None
        m = re.search(r"-(\d{7,})(?:[/?]|$)", url) or re.search(r"/view/(\d+)", url)
        return m.group(1) if m else None

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _set_if_empty(job: dict, key: str, value) -> None:
        """Store ``value`` under ``key`` unless a non-empty value is already there.

        Unlike ``dict.setdefault``, an existing ``None``/``""`` placeholder
        does not block a later, real value. A missing key is still created
        (even with an empty ``value``) so the output keeps the same keys.
        """
        if job.get(key):
            return
        if value or key not in job:
            job[key] = value

    @staticmethod
    def _iso_to_millis(text) -> Optional[int]:
        """Parse an ISO date/datetime string into epoch milliseconds.

        Accepts date-only values and a trailing ``Z``; values without a
        timezone are treated as UTC. Returns ``None`` if parsing fails.
        """
        try:
            dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except (ValueError, TypeError, AttributeError):
            return None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return int(dt.timestamp() * 1000)

    @staticmethod
    def _to_number(value):
        """Return ``value`` as a number, or ``None`` if it is not numeric.

        Ints and floats pass through unchanged; numeric strings (optionally
        with thousands separators) become floats.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.replace(",", "").strip())
            except ValueError:
                return None
        return None

    @classmethod
    def _build_salary(cls, raw_min, raw_max, period, currency, prefix_currency: bool):
        """Return ``(salary_text, salary_insights)`` or ``None`` without usable bounds.

        ``prefix_currency`` controls whether the currency code is prepended
        to the text (the Voyager format) or omitted (the JSON-LD format).
        NOTE(review): the "$" sign is hard-coded regardless of ``currency``,
        as in the original formatting.
        """
        low = cls._to_number(raw_min)
        high = cls._to_number(raw_max)
        if not (low or high):
            return None

        period = str(period or "YEAR")
        currency = currency or ""
        suffix = "/yr" if "YEAR" in period.upper() else f"/{period.lower()}"
        amounts = " - ".join(f"${v:,.2f}" for v in (low, high) if v)
        text = f"{amounts}{suffix}"
        if prefix_currency:
            text = f"{currency} {text}".strip()

        insights = {
            "compensationBreakdown": [{
                "minSalary": str(raw_min) if low else None,
                "maxSalary": str(raw_max) if high else None,
                "payPeriod": period.upper(),
                "currencyCode": currency,
                "compensationType": "BASE_SALARY",
            }],
            "compensationSource": "JOB_POSTER_PROVIDED",
        }
        return text, insights

    @staticmethod
    def _best_artifact_url(image) -> Optional[str]:
        """Return the URL of the widest artifact of a Voyager image object.

        Returns ``None`` when ``image`` has no artifacts.
        """
        image = image or {}
        artifacts = image.get("artifacts") or []
        if not artifacts:
            return None
        best = max(artifacts, key=lambda a: a.get("width", 0))
        return image.get("rootUrl", "") + best.get("fileIdentifyingUrlPathSegment", "")

    @staticmethod
    def _first_of_type(included: list, type_name: str) -> Optional[dict]:
        """Return the first entry of ``included`` whose ``$type`` equals ``type_name``."""
        return next(
            (item for item in included if isinstance(item, dict) and item.get("$type") == type_name),
            None,
        )

    # ------------------------------------------------------------------
    # Guest detail page
    # ------------------------------------------------------------------

    def _enrich_guest(self, job: dict, job_id: str) -> None:
        """Fill ``job`` from the guest detail page (JSON-LD first, HTML as fallback)."""
        resp = self._get(self.GUEST_DETAIL_URL.format(job_id=job_id))
        time.sleep(self.request_delay)
        soup = BeautifulSoup(resp.text, "html.parser")

        # Structured data first; malformed blocks are skipped so the HTML
        # fallbacks below still run.
        for script in soup.find_all("script", {"type": "application/ld+json"}):
            try:
                ld = json.loads(script.string or "")
            except json.JSONDecodeError:
                continue
            if not isinstance(ld, dict) or ld.get("@type") != "JobPosting":
                continue
            try:
                self._apply_jsonld(ld, job)
            except (AttributeError, TypeError, ValueError) as exc:
                logger.debug("Skipping malformed JSON-LD for %s: %s", job_id, exc)

        desc_el = soup.find("div", class_=re.compile(r"description__text|show-more-less-html"))
        if desc_el:
            if not job.get("descriptionHtml"):
                job["descriptionHtml"] = str(desc_el)
            if not job.get("descriptionText"):
                job["descriptionText"] = desc_el.get_text(separator="\n", strip=True)

        if not job.get("applicantsCount"):
            found = soup.find(string=re.compile(r"\d+\s+applicant", re.I))
            if found:
                raw = found.parent.get_text(strip=True)
                digits = re.search(r"[\d,]+", raw)
                job["applicantsCount"] = digits.group() if digits else raw

        # Criteria list: header text decides which field the value fills.
        for item in soup.find_all("li", class_=re.compile(r"description__job-criteria-item")):
            h3 = item.find("h3")
            span = item.find("span", class_=re.compile(r"description__job-criteria-text--criteria"))
            if not h3 or not span:
                continue
            header = h3.get_text(strip=True).lower()
            value = span.get_text(strip=True)
            if "seniority" in header:
                self._set_if_empty(job, "seniorityLevel", value)
            elif "employment" in header:
                self._set_if_empty(job, "employmentType", value)
            elif "function" in header:
                self._set_if_empty(job, "jobFunction", value)
            elif "industr" in header:
                self._set_if_empty(job, "industries", value)

        if not job.get("workplaceTypes"):
            # Heuristic: first text node on the page mentioning a workplace type.
            badge = soup.find(string=re.compile(r"\b(Remote|Hybrid|On-site)\b", re.I))
            if badge:
                text = badge.strip()
                if re.search(r"hybrid", text, re.I):
                    job["workplaceTypes"] = ["Hybrid"]
                elif re.search(r"remote", text, re.I):
                    job["workplaceTypes"] = ["Remote"]
                elif re.search(r"on.?site", text, re.I):
                    job["workplaceTypes"] = ["On-site"]

        if not job.get("companyLinkedinUrl"):
            link = soup.find("a", href=re.compile(r"linkedin\.com/company/"))
            if link:
                m = re.search(r"company/([^/?]+)", link["href"])
                if m:
                    self._set_if_empty(job, "companyId", m.group(1))
                job["companyLinkedinUrl"] = link["href"].split("?")[0]

        if not job.get("applyUrl"):
            btn = soup.find("a", class_=re.compile(r"apply-button"))
            if btn:
                job["applyUrl"] = btn.get("href") or job.get("jobUrl", "")
                job["applyMethod"] = "EasyApply" if "easy" in btn.get_text(strip=True).lower() else "OffsiteApply"
            else:
                job["applyUrl"] = job.get("jobUrl", "")
                job["applyMethod"] = "Unknown"

    def _apply_jsonld(self, data: dict, job: dict) -> None:
        """Copy fields from a JSON-LD ``JobPosting`` object into ``job``.

        Existing non-empty values in ``job`` are never overwritten.
        """
        self._set_if_empty(job, "title", data.get("title"))

        date_posted = data.get("datePosted")
        if date_posted and not job.get("postedAt"):
            job["postedAt"] = date_posted
        if date_posted and not job.get("postedAtTimestamp"):
            posted_ms = self._iso_to_millis(date_posted)
            if posted_ms is not None:
                job["postedAtTimestamp"] = posted_ms

        valid_through = data.get("validThrough")
        if valid_through and not job.get("expireAt"):
            expire_ms = self._iso_to_millis(valid_through)
            if expire_ms is not None:
                job["expireAt"] = expire_ms

        raw = data.get("description", "")
        if raw and not job.get("descriptionText"):
            job["descriptionHtml"] = raw
            job["descriptionText"] = BeautifulSoup(raw, "html.parser").get_text(separator="\n", strip=True)

        org = data.get("hiringOrganization")
        if not isinstance(org, dict):
            org = {}
        self._set_if_empty(job, "companyName", org.get("name"))
        self._set_if_empty(job, "companyLinkedinUrl", org.get("sameAs"))

        sal = data.get("baseSalary")
        if isinstance(sal, dict) and sal and not job.get("salary"):
            val = sal.get("value")
            if not isinstance(val, dict):
                val = {}
            built = self._build_salary(
                val.get("minValue"),
                val.get("maxValue"),
                val.get("unitText"),
                sal.get("currency"),
                prefix_currency=False,
            )
            if built:
                job["salary"], job["salaryInsights"] = built

        emp = data.get("employmentType")
        if emp and not job.get("employmentType"):
            job["employmentType"] = emp if isinstance(emp, str) else ", ".join(map(str, emp))

        self._set_if_empty(job, "industries", data.get("industry"))

        if not job.get("workplaceTypes"):
            if data.get("jobLocationType") == "TELECOMMUTE":
                job["workplaceTypes"] = ["Remote"]
                job["workRemoteAllowed"] = True

        job_loc = data.get("jobLocation") or {}
        if isinstance(job_loc, list):
            job_loc = job_loc[0] if job_loc else {}
        addr = job_loc.get("address") if isinstance(job_loc, dict) else None
        if not isinstance(addr, dict):
            addr = {}
        if addr and not job.get("location"):
            candidates = (addr.get("addressLocality"), addr.get("addressRegion"), addr.get("addressCountry"))
            # Only plain strings can be joined; nested objects are skipped.
            loc = ", ".join(p for p in candidates if isinstance(p, str) and p)
            if loc:
                job["location"] = loc

        if addr and not job.get("country"):
            job["country"] = addr.get("addressCountry", "")

        if not job.get("applyMethod") and data.get("directApply"):
            job["applyMethod"] = "EasyApply"

    # ------------------------------------------------------------------
    # Voyager API
    # ------------------------------------------------------------------

    def _enrich_voyager(self, job: dict, job_id: str) -> None:
        """Fill ``job`` from the Voyager job-posting payload.

        Description, posting/expiry times, applicant count, workplace types
        and apply method overwrite earlier values; the remaining fields only
        fill gaps.
        """
        resp = self._get(
            self.VOYAGER_JOB_URL.format(job_id=job_id),
            params={"decorationId": self.VOYAGER_DECORATION},
            accept=self.VOYAGER_ACCEPT,
        )
        time.sleep(self.request_delay)
        payload = resp.json()
        data = payload.get("data") or {}
        included = payload.get("included") or []

        desc = data.get("description", {})
        if isinstance(desc, dict):
            html = desc.get("text", "")
            if html:
                job["descriptionHtml"] = html
                job["descriptionText"] = BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)

        listed_at = data.get("listedAt")
        if listed_at:
            # listedAt is divided by 1000 before conversion, i.e. treated as ms.
            job["postedAtTimestamp"] = listed_at
            dt = datetime.fromtimestamp(listed_at / 1000, tz=timezone.utc)
            job["postedAt"] = dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")

        expire_at = data.get("expireAt")
        if expire_at:
            job["expireAt"] = expire_at

        applies = data.get("applies")
        if applies is not None:
            job["applicantsCount"] = str(applies)

        self._set_if_empty(job, "employmentType", data.get("formattedEmploymentStatus") or "")

        workplace_urns = data.get("workplaceTypes") or []
        if workplace_urns:
            matches = (re.search(r":(\d+)$", urn) for urn in workplace_urns)
            labels = [
                WORKPLACE_TYPE_MAP[m.group(1)]
                for m in matches
                if m and m.group(1) in WORKPLACE_TYPE_MAP
            ]
            if labels:
                job["workplaceTypes"] = labels
        remote = data.get("workRemoteAllowed")
        if remote is not None:
            job["workRemoteAllowed"] = remote
            if not job.get("workplaceTypes"):
                job["workplaceTypes"] = ["Remote"] if remote else ["On-site"]

        industries = data.get("formattedIndustries") or []
        if industries:
            self._set_if_empty(job, "industries", ", ".join(industries))

        sal = data.get("salary") or {}
        if sal and not job.get("salary"):
            built = self._build_salary(
                sal.get("min"),
                sal.get("max"),
                sal.get("payPeriod"),
                sal.get("currencyCode"),
                prefix_currency=True,
            )
            if built:
                job["salary"] = built[0]
                self._set_if_empty(job, "salaryInsights", built[1])

        # Guarantee the key exists even when no salary data was found.
        if not job.get("salaryInsights"):
            job["salaryInsights"] = {}

        apply_method = data.get("applyMethod") or {}
        raw_type = apply_method.get("$type", "") or ""
        for key, label in APPLY_METHOD_MAP.items():
            if key in raw_type:
                job["applyMethod"] = label
                break
        company_apply_url = apply_method.get("companyApplyUrl")
        if "EasyApply" in raw_type:
            job["applyUrl"] = job.get("jobUrl", "")
        elif company_apply_url:
            # The guest step always sets applyUrl, so a plain "fill if
            # missing" would never let this URL through.
            job["applyUrl"] = company_apply_url
        else:
            self._set_if_empty(job, "applyUrl", job.get("jobUrl", ""))

        if not job.get("country"):
            geo = data.get("jobGeoLocation", {}) or {}
            country_urn = geo.get("country", "")
            if country_urn:
                m = re.search(r":([A-Z]{2})$", country_urn)
                if m:
                    job["country"] = m.group(1)

        title_item = self._first_of_type(included, "com.linkedin.voyager.jobs.shared.Title")
        if title_item is not None:
            self._set_if_empty(job, "standardizedTitle", title_item.get("localizedName", ""))
            seniority = title_item.get("experienceLevel", {})
            if isinstance(seniority, dict):
                self._set_if_empty(job, "seniorityLevel", seniority.get("localizedName", ""))

        team_item = self._first_of_type(included, "com.linkedin.voyager.jobs.JobHiringTeam")
        if team_item is not None:
            self._apply_voyager_hiring_team(team_item, job)

        company_item = self._first_of_type(included, "com.linkedin.voyager.organization.Company")
        if company_item is not None:
            self._apply_voyager_company(company_item, job)

        self._fetch_benefits(job, job_id)

    def _apply_voyager_hiring_team(self, item: dict, job: dict) -> None:
        """Fill the job-poster fields from the first hiring-team member."""
        members = item.get("hiringTeamMembers") or []
        if not members:
            return
        first = members[0]
        name = f"{first.get('firstName', '')} {first.get('lastName', '')}".strip()
        slug = first.get("publicIdentifier", "")
        self._set_if_empty(job, "jobPosterName", name or None)
        self._set_if_empty(job, "jobPosterTitle", first.get("occupation") or None)
        self._set_if_empty(job, "jobPosterProfileUrl", f"https://www.linkedin.com/in/{slug}" if slug else None)

        photo = self._best_artifact_url(first.get("picture"))
        if photo is not None:
            self._set_if_empty(job, "jobPosterPhoto", photo)

    def _apply_voyager_company(self, item: dict, job: dict) -> None:
        """Fill company profile fields from a Voyager ``Company`` entry."""
        slug = item.get("universalName", "")
        if slug:
            self._set_if_empty(job, "companyId", slug)
            self._set_if_empty(job, "companyLinkedinUrl", f"https://www.linkedin.com/company/{slug}")

        self._set_if_empty(job, "companyWebsite", item.get("companyPageUrl") or item.get("websiteUrl") or "")
        self._set_if_empty(job, "companySlogan", item.get("tagline") or "")
        self._set_if_empty(job, "companyDescription", item.get("description") or "")
        staff_range = item.get("staffCountRange") or {}
        self._set_if_empty(job, "companyEmployeesCount", item.get("staffCount") or staff_range.get("start"))

        logo = self._best_artifact_url(item.get("logoV2") or item.get("logo"))
        if logo is not None:
            self._set_if_empty(job, "companyLogo", logo)

        hq = item.get("headquarter") or {}
        if hq:
            self._set_if_empty(job, "companyAddress", {
                "type": "PostalAddress",
                "streetAddress": " ".join(filter(None, [hq.get("street1"), hq.get("street2")])),
                "addressLocality": hq.get("city", ""),
                "addressRegion": hq.get("geographicArea", ""),
                "postalCode": hq.get("postalCode", ""),
                "addressCountry": hq.get("country", ""),
            })

    def _fetch_benefits(self, job: dict, job_id: str) -> None:
        """Best-effort fetch of the benefits list; never raises.

        On any failure or non-200 response ``job["benefits"]`` defaults to
        an empty list. The request is made once, without the ``_get`` retry
        logic.
        """
        try:
            ben = self.session.get(
                f"https://www.linkedin.com/voyager/api/jobs/jobPostings/{job_id}/benefits",
                headers={"Accept": self.VOYAGER_ACCEPT},
                timeout=10,
            )
            if ben.status_code == 200:
                ben_data = ben.json().get("data", {})
                items = ben_data.get("benefits") or ben_data.get("elements") or []
                job["benefits"] = [
                    i.get("localizedName") or i.get("name") or ""
                    for i in items if isinstance(i, dict)
                ]
            else:
                job.setdefault("benefits", [])
        except Exception as exc:
            # Benefits are optional; record why they are missing and move on.
            logger.debug("Benefits lookup failed for %s: %s", job_id, exc)
            job.setdefault("benefits", [])

    def search_and_enrich(
        self,
        keywords: str,
        location: str = "",
        published_within: str = "any",
        max_results: int = 25,
        input_url: str = "",
    ) -> Iterator[dict]:
        """Run ``search`` and yield each job after passing it through ``enrich``.

        Arguments are forwarded to ``search`` unchanged.
        """
        for job in self.search(keywords, location, published_within, max_results, input_url):
            yield self.enrich(job)