1import { Actor, log } from 'apify';
2import { CheerioCrawler } from 'crawlee';
3
/** Keywords used when the Actor input does not provide any. */
const DEFAULT_KEYWORDS = ['chief product officer', 'united states'];

/** Only the first this-many keywords are included in the search query. */
const MAX_KEYWORDS = 2;

/** The `start` query parameter advances by this amount for each requested page. */
const RESULTS_PER_PAGE = 10;

const GOOGLE_SEARCH_URL = 'https://www.google.com/search';

/** Search operator restricting results to LinkedIn profile pages. */
const SITE_FILTER = 'site:linkedin.com/in/';

/** Captures a LinkedIn profile URL up to (not including) the next `&`. */
const LINKEDIN_PROFILE_PATTERN = /(https?:\/\/www\.linkedin\.com\/in\/[^&]+)/;

const REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
};

/**
 * Normalizes the `keywords` input into a list of at most MAX_KEYWORDS strings.
 *
 * @param {unknown} rawKeywords - Value of `input.keywords`; an array, a single value, or nothing.
 * @returns {string[]} The keywords to search for; DEFAULT_KEYWORDS when the input is falsy.
 */
function resolveKeywords(rawKeywords) {
    const provided = rawKeywords || DEFAULT_KEYWORDS;
    // A lone string is treated as one keyword instead of being sliced character-wise.
    const list = Array.isArray(provided) ? provided : [provided];
    return list.slice(0, MAX_KEYWORDS).map((keyword) => String(keyword));
}

/**
 * Normalizes the `numPages` input into a positive integer page count.
 *
 * @param {unknown} rawNumPages - Value of `input.numPages`.
 * @returns {number} The page count; 1 when the input is missing, non-numeric, or not positive.
 */
function resolveNumPages(rawNumPages) {
    const requested = Number(rawNumPages);
    // Round up so a fractional value yields as many pages as a `page < n` loop would.
    return Number.isFinite(requested) && requested > 0 ? Math.ceil(requested) : 1;
}

/**
 * Builds one Google search URL per result page for the given keywords.
 *
 * Each keyword is wrapped in double quotes and appended to the site filter.
 * The query goes through `URLSearchParams`, so characters such as `&`, `#`
 * or `+` inside a keyword are percent-encoded rather than corrupting the URL.
 *
 * @param {string[]} keywords - Keywords to include as quoted phrases.
 * @param {number} numPages - Number of result pages to generate URLs for.
 * @returns {string[]} Search URLs with `start` set to 0, 10, 20, ...
 */
function buildSearchUrls(keywords, numPages) {
    const quotedKeywords = keywords.map((keyword) => `"${keyword}"`);
    const query = [SITE_FILTER, ...quotedKeywords].join(' ');

    return Array.from({ length: numPages }, (_, page) => {
        const searchUrl = new URL(GOOGLE_SEARCH_URL);
        searchUrl.searchParams.set('q', query);
        searchUrl.searchParams.set('start', String(page * RESULTS_PER_PAGE));
        return searchUrl.href;
    });
}

/**
 * Extracts a LinkedIn profile URL from an anchor's `href`, if it contains one.
 *
 * @param {string | undefined} href - The anchor's `href` attribute value.
 * @returns {string | null} The matched profile URL, or `null` when there is none.
 */
function extractLinkedinUrl(href) {
    const match = href?.match(LINKEDIN_PROFILE_PATTERN);
    return match ? match[1] : null;
}

await Actor.init();

try {
    // Fall back to an empty object so a missing input uses the defaults below
    // instead of failing on a property access.
    const input = (await Actor.getInput()) ?? {};
    const keywords = resolveKeywords(input.keywords);
    const numPages = resolveNumPages(input.numPages);

    // Profile URLs found across all pages; the Set removes duplicates.
    const linkedinUrlsSet = new Set();

    const requestQueue = await Actor.openRequestQueue();
    for (const url of buildSearchUrls(keywords, numPages)) {
        await requestQueue.addRequest({ url, headers: REQUEST_HEADERS });
    }

    const crawler = new CheerioCrawler({
        requestQueue,
        requestHandler: async ({ request, $ }) => {
            log.info(`Processing ${request.url}...`);

            $('a').each((index, element) => {
                const linkedinUrl = extractLinkedinUrl($(element).attr('href'));
                if (linkedinUrl) {
                    linkedinUrlsSet.add(linkedinUrl);
                }
            });
        },
    });

    await crawler.run();

    // One dataset item per unique profile URL.
    const results = Array.from(linkedinUrlsSet, (url) => ({ 'LinkedIn URL': url }));
    await Actor.pushData(results);

    log.info(`Found and saved ${linkedinUrlsSet.size} unique LinkedIn URLs based on the keywords across ${numPages} pages.`);
} catch (error) {
    log.exception(error, 'Error during actor run:');
    // Rethrow so the failure still propagates out of the module.
    throw error;
}

await Actor.exit();