Upwork Job Scraper

Upwork Job Scraper is an Apify actor that extracts job listings from Upwork based on keywords. It outputs structured data (title, budget, client info) in JSON/CSV for easy analysis.

Rating: 5.0 (1)
Pricing: $19.00/month + usage
Monthly users: 2
Runs succeeded: >99%
Last modified: 9 days ago
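For reference, here is the shape of a single dataset record, based on the fields pushed by src/main.js below (the values are illustrative, not real output):

{
    "title": "Senior UX Designer",
    "company": "Example Co.",
    "location": "New York, NY",
    "jobLink": "https://www.indeed.com/viewjob?jk=...",
    "description": "We are looking for...",
    "salary": "$90,000 - $120,000 a year",
    "jobType": "Full-time",
    "additionalDetails": {}
}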
.actor/Dockerfile
# Dockerfile

FROM apify/actor-node-puppeteer-chrome:20

RUN npm ls crawlee apify puppeteer playwright

COPY package*.json ./

RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && npm install puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-recaptcha random-useragent \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

COPY . ./

CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
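The image can be smoke-tested locally before pushing to the platform (a sketch, assuming Docker is installed; the tag name and the CAPTCHA_API_KEY value are placeholders):

docker build -t upwork-job-scraper .
docker run --rm -e CAPTCHA_API_KEY=<your-2captcha-key> upwork-job-scraper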
.actor/README.md
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "upwork-job-scraper",
    "title": "Project Puppeteer Crawler JavaScript",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Upwork Job Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "searchQuery": {
            "title": "Search Query",
            "type": "string",
            "description": "Keyword(s) to search for job listings.",
            "editor": "textfield",
            "default": "designer"
        }
    }
}
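Note that the schema only declares searchQuery, while src/main.js also reads location, indeedUrl, and paginationLimit (falling back to defaults). An input covering all four fields might look like this, using the defaults from src/main.js:

{
    "searchQuery": "developer",
    "location": "New York",
    "indeedUrl": "www.indeed.com",
    "paginationLimit": 1
}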
src/main.js
/**************************************************************************
 * main.js - Single-file Apify Actor scraping Indeed using puppeteer-extra
 * and puppeteer-extra-plugin-recaptcha for reCAPTCHA solving,
 * with added logs for page titles.
 **************************************************************************/

import { Actor } from 'apify';
import { PuppeteerCrawler, Dataset } from 'crawlee';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
import randomUseragent from 'random-useragent';

// 1) Use stealth plugin to mask many typical bot signatures
puppeteer.use(StealthPlugin());

// 2) Configure the recaptcha plugin (requires a solver, e.g. 2captcha)
puppeteer.use(
    RecaptchaPlugin({
        provider: {
            // Example: Using 2captcha (https://2captcha.com/).
            // Store your 2captcha API key in an env var: process.env.CAPTCHA_API_KEY
            id: '2captcha',
            token: process.env.CAPTCHA_API_KEY, // read from the environment; never hardcode the key
        },
        visualFeedback: false, // set to true to see boxes around captchas in headful mode
    })
);

/**
 * Helper to sleep a random time between minMs and maxMs
 * to appear more human and reduce detection.
 */
function randomSleep(minMs, maxMs) {
    const delay = Math.floor(Math.random() * (maxMs - minMs + 1)) + minMs;
    return new Promise((resolve) => setTimeout(resolve, delay));
}

// The main request handler for each page (listings or detail)
async function handleRequest({ request, page, log, session, crawler }) {
    log.info(`Scraping: ${request.url}`);

    try {
        // ----- Random user agent -----
        const userAgent = randomUseragent.getRandom();
        if (userAgent) {
            await page.setUserAgent(userAgent);
        }

        // ----- Block images/fonts/CSS to reduce overhead -----
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const type = req.resourceType();
            if (['image', 'stylesheet', 'font'].includes(type)) req.abort();
            else req.continue();
        });

        // ----- Random sleep before navigation -----
        await randomSleep(3000, 6000);

        // ----- Navigate to the page -----
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning('Received 403 or missing response. Marking session bad.');
            session.markBad();
            throw new Error(`Request blocked or invalid response at ${request.url}`);
        }

        // Log the page title (for main listing pages).
        const listingTitle = await page.title();
        log.info(`Listing page title: ${listingTitle}`);

        // ----- Attempt to solve any reCAPTCHAs on the page -----
        const { solved, error } = await page.solveRecaptchas();
        if (error) {
            log.error(`Captcha solve error: ${error.message}`);
        } else if (solved?.length) {
            log.info(`Captcha solved: ${solved.length} reCAPTCHAs found and solved.`);
        }

        // ----- Wait for main container with job postings -----
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Indeed listings found. Extracting...');

        // ----- Extract job listings from this page -----
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent.trim() || null,
                    company: companyElement?.textContent.trim() || null,
                    location: locationElement?.textContent.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });

        log.info(`Found ${jobs.length} job postings on this page.`);

        // ----- Capture the next-page link now, before the detail-page -----
        // ----- navigations below move this page away from the listing -----
        const nextHref = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });

        // ----- For each job, navigate to the detail page (optional) -----
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                // Random delay before detail page
                await randomSleep(2000, 5000);

                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page. Marking session bad. Job link: ${job.jobLink}`);
                    session.markBad();
                    continue;
                }

                // Log the page title (for job detail pages).
                const detailTitle = await page.title();
                log.info(`Detail page title: ${detailTitle}`);

                // Attempt to solve captchas on detail page, if any
                const { solved: detailSolved, error: detailError } = await page.solveRecaptchas();
                if (detailError) {
                    log.error(`Captcha solve error on detail: ${detailError.message}`);
                } else if (detailSolved?.length) {
                    log.info(`Captcha solved on detail page: ${detailSolved.length}`);
                }

                // Extract additional details from the job page
                const detailData = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75');
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys');
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Additional metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Attach detail info
                job.description = detailData.description || 'N/A';
                job.salary = detailData.salary || 'N/A';
                job.jobType = detailData.jobType || 'N/A';
                job.additionalDetails = detailData.additionalDetails;

                // Push to default Apify Dataset
                await Dataset.pushData(job);

            } catch (detailErr) {
                log.error(`Error scraping job detail for ${job.jobLink}: ${detailErr.message}`);
            }
        }

        // ----- Pagination (using the link captured before detail navigation) -----
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextHref) {
                log.info(`Enqueuing next page: ${nextHref}`);
                await crawler.addRequests([
                    {
                        url: nextHref,
                        userData: {
                            ...request.userData,
                            pageCount: request.userData.pageCount + 1,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Reached pagination limit of ${request.userData.paginationLimit} pages.`);
        }

    } catch (err) {
        log.error(`Error scraping ${request.url}: ${err.message}`);
        session.markBad();
    }
}

Actor.main(async () => {
    // Actor.main() calls Actor.init() and Actor.exit() internally,
    // so they must not be called again inside this callback.

    // ----- Retrieve input or set defaults -----
    const input = await Actor.getInput() || {};
    const {
        searchQuery = 'developer',
        location = 'New York',
        indeedUrl = 'www.indeed.com',
        paginationLimit = 1,
    } = input;

    // Build the Indeed start URL
    const startUrl = `https://${indeedUrl}/jobs?q=${encodeURIComponent(searchQuery)}&l=${encodeURIComponent(location)}`;

    // (Optional) residential proxy usage
    const proxyConfiguration = await Actor.createProxyConfiguration({
        groups: ['RESIDENTIAL'],
    });

    // Create PuppeteerCrawler with puppeteer-extra (stealth + recaptcha plugin)
    const crawler = new PuppeteerCrawler({
        requestHandler: handleRequest,
        maxConcurrency: 1, // keep low concurrency to avoid detection
        maxRequestRetries: 3,
        requestHandlerTimeoutSecs: 180,
        proxyConfiguration,

        // Force crawlee to use our puppeteer instance with plugins
        launchContext: {
            launcher: puppeteer,
            launchOptions: {
                headless: false,
                args: [
                    '--disable-gpu',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled',
                ],
            },
        },
    });

    // Start the crawler
    await crawler.run([
        {
            url: startUrl,
            userData: {
                pageCount: 0,
                paginationLimit,
            },
        },
    ]);
});
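The recaptcha plugin reads its solver token from the CAPTCHA_API_KEY environment variable, so a local run might look like this (a sketch, assuming the Apify CLI is installed and the example input above is saved to storage/key_value_stores/default/INPUT.json):

export CAPTCHA_API_KEY=<your-2captcha-key>
apify run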
src/routes.js
// routes.js
import { Dataset } from 'crawlee';

export async function handleRequest({ request, page, log, crawler, session }) {
    log.info(`Scraping page: ${request.url}`);

    try {
        // Optionally block images/fonts for performance
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const resourceType = req.resourceType();
            if (['image', 'font', 'stylesheet'].includes(resourceType)) {
                req.abort();
            } else {
                req.continue();
            }
        });

        // Navigate to the page
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning(`Blocked or invalid response (403) at ${request.url}. Marking session as bad.`);
            session.markBad();
            throw new Error(`Request blocked or invalid response for ${request.url}`);
        }

        // Wait for job postings to appear
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Job listings container found. Extracting...');

        // Extract job listings
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent?.trim() || null,
                    company: companyElement?.textContent?.trim() || null,
                    location: locationElement?.textContent?.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });

        log.info(`Found ${jobs.length} job postings on this page.`);

        // Capture the next-page link now, before the detail-page
        // navigations below move this page away from the listing.
        const nextPageLink = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });

        // Optionally navigate to each job's detail page
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page ${job.jobLink}. Marking session as bad.`);
                    session.markBad();
                    continue;
                }

                const jobDetails = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75'); // Salary text
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys'); // Full-time, etc.
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Gather any extra metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Merge detail info into the job object
                job.description = jobDetails.description || 'N/A';
                job.salary = jobDetails.salary || 'N/A';
                job.jobType = jobDetails.jobType || 'N/A';
                job.additionalDetails = jobDetails.additionalDetails;

                // Push the final job data into the Apify dataset
                await Dataset.pushData(job);

            } catch (detailError) {
                log.error(`Error extracting detail from ${job.jobLink}: ${detailError.message}`);
            }
        }

        // Handle pagination if not at the limit (link captured above)
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextPageLink) {
                const nextPageCount = request.userData.pageCount + 1;
                log.info(`Queueing next page: ${nextPageLink}`);
                await crawler.addRequests([
                    {
                        url: nextPageLink,
                        userData: {
                            ...request.userData,
                            pageCount: nextPageCount,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Pagination limit reached: ${request.userData.paginationLimit} page(s).`);
        }

    } catch (error) {
        log.error(`Error scraping page ${request.url}: ${error.message}`);
        session.markBad();
    }
}
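Note that this file exports a near-duplicate of the handler defined inline in src/main.js and is never imported there. If it were used instead, the wiring would be a small change in src/main.js (a sketch):

// in src/main.js
import { handleRequest } from './routes.js';

const crawler = new PuppeteerCrawler({
    requestHandler: handleRequest,
    // ...same options as before
});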
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{
    "name": "crawlee-puppeteer-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "puppeteer": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
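The puppeteer-extra packages imported by src/main.js are installed by the Dockerfile at image build time but are not declared here, so a plain local `npm start` would fail on those imports. One way to make local development work is to declare them as dependencies (version ranges are illustrative):

"dependencies": {
    "apify": "^3.2.6",
    "crawlee": "^3.11.5",
    "puppeteer": "*",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "puppeteer-extra-plugin-recaptcha": "^3.6.8",
    "random-useragent": "^0.5.0"
}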
Pricing

Pricing model: Rental
To use this Actor, you have to pay a monthly rental fee to the developer. The rent is subtracted from your prepaid usage every month after the free trial period. You also pay for the Apify platform usage.

Free trial
Price: $19.00