import asyncio
import re
import urllib.parse

import requests
from apify import Actor
from bs4 import BeautifulSoup
6
# Matches a LinkedIn profile URL embedded in a Google result link; compiled
# once at import time since it is applied to every anchor on every page.
_PROFILE_URL_RE = re.compile(r'(https?://www\.linkedin\.com/in/[^&]+)')

# Browser-like User-Agent so Google serves a regular HTML results page.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


def _format_keywords(keywords):
    """Build the quoted-keyword portion of the Google query string.

    Each keyword is wrapped in URL-encoded double quotes (%22) so Google
    treats it as an exact phrase. ``quote_plus`` encodes spaces as ``+``
    (matching the original hand-rolled replace) and additionally escapes
    characters like ``&`` or ``#`` that would otherwise corrupt the URL.
    """
    return '+'.join(
        f'(%22{urllib.parse.quote_plus(keyword)}%22)' for keyword in keywords
    )


def _extract_profile_urls(html):
    """Return LinkedIn profile URLs found in the anchors of a results page."""
    soup = BeautifulSoup(html, 'html.parser')
    found = []
    for link in soup.find_all('a', href=True):
        match = _PROFILE_URL_RE.search(link['href'])
        if match:
            found.append(match.group(1))
    return found


async def main() -> None:
    """Search Google for LinkedIn profiles matching the actor's input
    keywords and push each unique profile URL to the default dataset.

    Input fields:
        keywords (list[str]): exact phrases to AND together (defaults below).
        numPages (int): number of Google result pages to scan (default 1).
    """
    async with Actor() as actor:
        actor_input = await actor.get_input() or {}
        keywords = actor_input.get('keywords', ["chief product officer", "united states", "insurance"])
        num_pages = actor_input.get('numPages', 1)

        base_url = 'https://www.google.com/search?q=site%3Alinkedin.com%2Fin%2F+'
        formatted_keywords = _format_keywords(keywords)

        # dict keys preserve insertion order and give O(1) membership
        # checks, replacing the quadratic list-scan dedup.
        linkedin_urls = {}

        # Google paginates by an absolute result offset: 0, 10, 20, ...
        for page_start in range(0, num_pages * 10, 10):
            url = f"{base_url}{formatted_keywords}&start={page_start}"
            actor.log.info(f"Fetching {url}")

            # requests is blocking; run it in a worker thread so the asyncio
            # event loop is not stalled. The timeout ensures a hung
            # connection cannot block the actor indefinitely.
            response = await asyncio.to_thread(
                requests.get, url, headers=_HEADERS, timeout=30
            )
            if not response.ok:
                # Google commonly rate-limits scrapers (HTTP 429); skip the
                # page instead of mining an error document for links.
                actor.log.warning(
                    f"Got HTTP {response.status_code} for {url}; skipping page."
                )
                continue

            for linkedin_url in _extract_profile_urls(response.text):
                linkedin_urls[linkedin_url] = None

        for linkedin_url in linkedin_urls:
            await actor.push_data({"LinkedIn URL": linkedin_url})

        actor.log.info(f"Found and saved {len(linkedin_urls)} LinkedIn URLs based on the keywords across {num_pages} pages.")
42
if __name__ == '__main__':
    # Script entry point: drive the async actor main() to completion.
    asyncio.run(main())