Upwork Job Scraper

Pricing: $19.00/month + usage


Developed by Runtime

Maintained by Community

Upwork Job Scraper is an Apify actor that extracts job listings from Upwork based on keywords. It outputs structured data (title, budget, client info) in JSON/CSV for easy analysis.
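
Each scraped job becomes one dataset record. As a rough sketch of the record shape assembled by the bundled crawler code (src/main.js below), with every field value invented for illustration:

// One dataset record as assembled in src/main.js (illustrative values only).
const exampleRecord = {
    title: 'Senior Product Designer',
    company: 'Example Co.',
    location: 'New York, NY',
    jobLink: 'https://www.indeed.com/viewjob?jk=example',
    description: 'Full job description text...',
    salary: '$90,000 - $120,000 a year',
    jobType: 'Full-time',
    additionalDetails: {},
};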

Rating: 5.0 (1 review)

Monthly users: 1

Runs succeeded: >99%

Last modified: 9 days ago

.actor/Dockerfile

# Dockerfile

FROM apify/actor-node-puppeteer-chrome:20

RUN npm ls crawlee apify puppeteer playwright

COPY --chown=myuser package*.json ./

RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && npm install puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-recaptcha random-useragent \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

COPY --chown=myuser . ./

CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/README.md


.actor/actor.json

{
    "actorSpecification": 1,
    "name": "upwork-job-scraper",
    "title": "Project Puppeteer Crawler JavaScript",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Upwork Job Scraper Input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "searchQuery": {
            "title": "Search Query",
            "type": "string",
            "description": "Keywords to search for",
            "editor": "textfield",
            "default": "designer"
        },
        "location": {
            "title": "Location",
            "type": "string",
            "description": "Location to filter the search by",
            "editor": "textfield",
            "default": "New York"
        },
        "paginationLimit": {
            "title": "Pagination Limit",
            "type": "integer",
            "description": "Maximum number of result pages to crawl",
            "editor": "number",
            "default": 1
        }
    }
}
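
For illustration, a complete input object for one run might look like the following sketch; src/main.js supplies defaults for any field that is omitted:

// Hypothetical run input; all values are examples.
const input = {
    searchQuery: 'designer',
    location: 'New York',
    paginationLimit: 1,
};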

src/main.js

/**************************************************************************
 * main.js - Single-file Apify Actor scraping Indeed using puppeteer-extra
 *           and puppeteer-extra-plugin-recaptcha for reCAPTCHA solving,
 *           with added logs for page titles.
 **************************************************************************/

import { Actor } from 'apify';
import { PuppeteerCrawler, Dataset } from 'crawlee';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
import randomUseragent from 'random-useragent';

// 1) Use stealth plugin to mask many typical bot signatures
puppeteer.use(StealthPlugin());

// 2) Configure the recaptcha plugin (requires a solver, e.g. 2captcha)
puppeteer.use(
    RecaptchaPlugin({
        provider: {
            // Example: Using 2captcha (https://2captcha.com/).
            // Store your 2captcha API key in an env var: process.env.CAPTCHA_API_KEY.
            // Never hardcode the key in source control.
            id: '2captcha',
            token: process.env.CAPTCHA_API_KEY,
        },
        visualFeedback: false, // set to true to see boxes around captchas in headful mode
    })
);

/**
 * Helper to sleep a random time between minMs and maxMs
 * to appear more human and reduce detection.
 */
function randomSleep(minMs, maxMs) {
    const delay = Math.floor(Math.random() * (maxMs - minMs + 1)) + minMs;
    return new Promise((resolve) => setTimeout(resolve, delay));
}

// The main request handler for each page (listings or detail)
async function handleRequest({ request, page, log, session, crawler }) {
    log.info(`Scraping: ${request.url}`);

    try {
        // ----- Random user agent -----
        const userAgent = randomUseragent.getRandom();
        if (userAgent) {
            await page.setUserAgent(userAgent);
        }

        // ----- Block images/fonts/CSS to reduce overhead -----
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const type = req.resourceType();
            if (['image', 'stylesheet', 'font'].includes(type)) req.abort();
            else req.continue();
        });

        // ----- Random sleep before navigation -----
        await randomSleep(3000, 6000);

        // ----- Navigate to the page -----
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning('Received 403 or missing response. Marking session bad.');
            session.markBad();
            throw new Error(`Request blocked or invalid response at ${request.url}`);
        }

        // Log the page title (for main listing pages).
        const listingTitle = await page.title();
        log.info(`Listing page title: ${listingTitle}`);

        // ----- Attempt to solve any reCAPTCHAs on the page -----
        const { solved, error } = await page.solveRecaptchas();
        if (error) {
            log.error(`Captcha solve error: ${error.message}`);
        } else if (solved?.length) {
            log.info(`Captcha solved: ${solved.length} reCAPTCHAs found and solved.`);
        }

        // ----- Wait for main container with job postings -----
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Indeed listings found. Extracting...');

        // ----- Extract job listings from this page -----
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent.trim() || null,
                    company: companyElement?.textContent.trim() || null,
                    location: locationElement?.textContent.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });

        log.info(`Found ${jobs.length} job postings on this page.`);

        // ----- Capture the next-page link now, while we are still on the
        // listing page: the detail loop below navigates `page` away, so the
        // pagination button would no longer be in the DOM afterwards. -----
        const nextHref = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });

        // ----- For each job, navigate to the detail page (optional) -----
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                // Random delay before detail page
                await randomSleep(2000, 5000);

                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page. Marking session bad. Job link: ${job.jobLink}`);
                    session.markBad();
                    continue;
                }

                // Log the page title (for job detail pages).
                const detailTitle = await page.title();
                log.info(`Detail page title: ${detailTitle}`);

                // Attempt to solve captchas on detail page, if any
                const { solved: detailSolved, error: detailError } = await page.solveRecaptchas();
                if (detailError) {
                    log.error(`Captcha solve error on detail: ${detailError.message}`);
                } else if (detailSolved?.length) {
                    log.info(`Captcha solved on detail page: ${detailSolved.length}`);
                }

                // Extract additional details from the job page
                const detailData = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75');
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys');
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Additional metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Attach detail info
                job.description = detailData.description || 'N/A';
                job.salary = detailData.salary || 'N/A';
                job.jobType = detailData.jobType || 'N/A';
                job.additionalDetails = detailData.additionalDetails;

                // Push to default Apify Dataset
                await Dataset.pushData(job);

            } catch (detailErr) {
                log.error(`Error scraping job detail for ${job.jobLink}: ${detailErr.message}`);
            }
        }

        // ----- Pagination (using the link captured before the detail loop) -----
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextHref) {
                log.info(`Enqueuing next page: ${nextHref}`);
                await crawler.addRequests([
                    {
                        url: nextHref,
                        userData: {
                            ...request.userData,
                            pageCount: request.userData.pageCount + 1,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Reached pagination limit of ${request.userData.paginationLimit} pages.`);
        }

    } catch (err) {
        log.error(`Error scraping ${request.url}: ${err.message}`);
        session.markBad();
    }
}

Actor.main(async () => {
    // Actor.main() calls Actor.init() before and Actor.exit() after this
    // function runs, so neither needs to be invoked manually here.

    // ----- Retrieve input or set defaults -----
    const input = await Actor.getInput() || {};
    const {
        searchQuery = 'developer',
        location = 'New York',
        indeedUrl = 'www.indeed.com',
        paginationLimit = 1,
    } = input;

    // Build the Indeed start URL
    const startUrl = `https://${indeedUrl}/jobs?q=${encodeURIComponent(searchQuery)}&l=${encodeURIComponent(location)}`;

    // (Optional) residential proxy usage
    const proxyConfiguration = await Actor.createProxyConfiguration({
        groups: ['RESIDENTIAL'],
    });

    // Create PuppeteerCrawler with puppeteer-extra (stealth + recaptcha plugin)
    const crawler = new PuppeteerCrawler({
        requestHandler: handleRequest,
        maxConcurrency: 1, // keep low concurrency to avoid detection
        maxRequestRetries: 3,
        requestHandlerTimeoutSecs: 180,
        proxyConfiguration,

        // Force crawlee to use our puppeteer instance with plugins
        launchContext: {
            launcher: puppeteer,
            launchOptions: {
                headless: false, // headful mode; the Dockerfile starts Xvfb for this
                args: [
                    '--disable-gpu',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled',
                ],
            },
        },
    });

    // Start the crawler
    await crawler.run([
        {
            url: startUrl,
            userData: {
                pageCount: 0,
                paginationLimit,
            },
        },
    ]);
});
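
As a usage sketch, the actor can also be started programmatically through the Apify API client. The actor ID below is a placeholder, and the input mirrors the defaults in src/main.js:

// Hypothetical usage sketch with apify-client (npm install apify-client);
// '<username>' is a placeholder for the actual account name.
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the actor and wait for the run to finish.
const run = await client.actor('<username>/upwork-job-scraper').call({
    searchQuery: 'designer',
    location: 'New York',
    paginationLimit: 1,
});

// Fetch the scraped records from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Fetched ${items.length} job records.`);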

src/routes.js

// routes.js
// NOTE: src/main.js defines its own copy of this handler and does not
// import this module.
import { Dataset } from 'crawlee';

export async function handleRequest({ request, page, log, crawler, session }) {
    log.info(`Scraping page: ${request.url}`);

    try {
        // Optionally block images/fonts for performance
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const resourceType = req.resourceType();
            if (['image', 'font', 'stylesheet'].includes(resourceType)) {
                req.abort();
            } else {
                req.continue();
            }
        });

        // Navigate to the page
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning(`Blocked or invalid response (403) at ${request.url}. Marking session as bad.`);
            session.markBad();
            throw new Error(`Request blocked or invalid response for ${request.url}`);
        }

        // Wait for job postings to appear
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Job listings container found. Extracting...');

        // Extract job listings
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent?.trim() || null,
                    company: companyElement?.textContent?.trim() || null,
                    location: locationElement?.textContent?.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });

        log.info(`Found ${jobs.length} job postings on this page.`);

        // Capture the next-page link before the detail loop navigates `page`
        // away from the listing page.
        const nextPageLink = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });

        // Optionally navigate to each job's detail page
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page ${job.jobLink}. Marking session as bad.`);
                    session.markBad();
                    continue;
                }

                const jobDetails = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75'); // Salary text
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys'); // Full-time, etc.
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Gather any extra metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Merge detail info into the job object
                job.description = jobDetails.description || 'N/A';
                job.salary = jobDetails.salary || 'N/A';
                job.jobType = jobDetails.jobType || 'N/A';
                job.additionalDetails = jobDetails.additionalDetails;

                // Push the final job data into Apify dataset
                await Dataset.pushData(job);

            } catch (detailError) {
                log.error(`Error extracting detail from ${job.jobLink}: ${detailError.message}`);
            }
        }

        // Handle pagination if not at the limit, using the link captured
        // before the detail loop.
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextPageLink) {
                const nextPageCount = request.userData.pageCount + 1;
                log.info(`Queueing next page: ${nextPageLink}`);
                await crawler.addRequests([
                    {
                        url: nextPageLink,
                        userData: {
                            ...request.userData,
                            pageCount: nextPageCount,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Pagination limit reached: ${request.userData.paginationLimit} page(s).`);
        }

    } catch (error) {
        log.error(`Error scraping page ${request.url}: ${error.message}`);
        session.markBad();
    }
}

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-puppeteer-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "puppeteer": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

Pricing

Pricing model: Rental

To use this Actor, you pay a monthly rental fee to the developer. After the free trial period ends, the rent is subtracted from your prepaid usage each month. You also pay for Apify platform usage on top of the rent: for example, a month in which your runs consume $5.00 of platform usage would cost $19.00 + $5.00 = $24.00 in total.

Free trial

Price: $19.00