Upwork Job Scraper
Upwork Job Scraper is an Apify actor that extracts job listings from Upwork based on keywords. It outputs structured data (title, budget, client info) in JSON/CSV for easy analysis.
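For illustration only, a single exported record might look like the sample below. The field names follow the description above (title, budget, client info) and are hypothetical; they are not a guaranteed output schema.

{
    "title": "Senior React Developer for analytics dashboard",
    "budget": "$500.00",
    "client": {
        "country": "United States",
        "rating": 4.9
    }
}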
Rating: 5.0 (1)
Pricing: $29.00/month + usage
Total users: 2
Monthly users: 1
Runs succeeded: >99%
Last modified: a month ago
.actor/Dockerfile
# Dockerfile
FROM apify/actor-node-puppeteer-chrome:20
RUN npm ls crawlee apify puppeteer playwright
COPY package*.json ./
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && npm install puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-recaptcha random-useragent \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
COPY . ./
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/README.md
.actor/actor.json
{ "actorSpecification": 1, "name": "upwork-job-scraper", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Amazon Product Search", "type": "object", "schemaVersion": 1, "properties": { "searchQuery": { "title": "Search Query", "type": "string", "description": "Search query", "editor": "textfield", "default": "designer" } }}
src/main.js
/**************************************************************************
 * main.js - Single-file Apify Actor scraping Indeed using puppeteer-extra
 * and puppeteer-extra-plugin-recaptcha for reCAPTCHA solving,
 * with added logs for page titles.
 **************************************************************************/

import { Actor } from 'apify';
import { PuppeteerCrawler, Dataset } from 'crawlee';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
import randomUseragent from 'random-useragent';

// 1) Use stealth plugin to mask many typical bot signatures
puppeteer.use(StealthPlugin());
// 2) Configure the recaptcha plugin (requires a solver, e.g. 2captcha)
puppeteer.use(
    RecaptchaPlugin({
        provider: {
            // Example: Using 2captcha (https://2captcha.com/).
            // Store your 2captcha API key in an env var: process.env.CAPTCHA_API_KEY
            id: '2captcha',
            token: process.env.CAPTCHA_API_KEY, // never hardcode the key in source
        },
        visualFeedback: false, // set to true to see boxes around captchas in headful mode
    })
);
/**
 * Helper to sleep a random time between minMs and maxMs
 * to appear more human and reduce detection.
 */
function randomSleep(minMs, maxMs) {
    const delay = Math.floor(Math.random() * (maxMs - minMs + 1)) + minMs;
    return new Promise((resolve) => setTimeout(resolve, delay));
}

// The main request handler for each page (listings or detail)
async function handleRequest({ request, page, log, session, crawler }) {
    log.info(`Scraping: ${request.url}`);

    try {
        // ----- Random user agent -----
        const userAgent = randomUseragent.getRandom();
        if (userAgent) {
            await page.setUserAgent(userAgent);
        }

        // ----- Block images/fonts/CSS to reduce overhead -----
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const type = req.resourceType();
            if (['image', 'stylesheet', 'font'].includes(type)) req.abort();
            else req.continue();
        });

        // ----- Random sleep before navigation -----
        await randomSleep(3000, 6000);

        // ----- Navigate to the page -----
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning('Received 403 or missing response. Marking session bad.');
            session.markBad();
            throw new Error(`Request blocked or invalid response at ${request.url}`);
        }
        // Log the page title (for main listing pages).
        const listingTitle = await page.title();
        log.info(`Listing page title: ${listingTitle}`);
        // ----- Attempt to solve any reCAPTCHAs on the page -----
        const { solved, error } = await page.solveRecaptchas();
        if (error) {
            log.error(`Captcha solve error: ${error.message}`);
        } else if (solved?.length) {
            log.info(`Captcha solved: ${solved.length} reCAPTCHAs found and solved.`);
        }

        // ----- Wait for main container with job postings -----
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Indeed listings found. Extracting...');

        // ----- Extract job listings from this page -----
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent.trim() || null,
                    company: companyElement?.textContent.trim() || null,
                    location: locationElement?.textContent.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });
        log.info(`Found ${jobs.length} job postings on this page.`);

        // Capture the next-page link now, before detail-page navigation
        // moves `page` away from the listing DOM.
        const nextHref = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });
        // ----- For each job, navigate to the detail page (optional) -----
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                // Random delay before detail page
                await randomSleep(2000, 5000);

                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page. Marking session bad. Job link: ${job.jobLink}`);
                    session.markBad();
                    continue;
                }
                // Log the page title (for job detail pages).
                const detailTitle = await page.title();
                log.info(`Detail page title: ${detailTitle}`);
                // Attempt to solve captchas on detail page, if any
                const { solved: detailSolved, error: detailError } = await page.solveRecaptchas();
                if (detailError) {
                    log.error(`Captcha solve error on detail: ${detailError.message}`);
                } else if (detailSolved?.length) {
                    log.info(`Captcha solved on detail page: ${detailSolved.length}`);
                }

                // Extract additional details from the job page
                const detailData = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75');
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys');
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Additional metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Attach detail info
                job.description = detailData.description || 'N/A';
                job.salary = detailData.salary || 'N/A';
                job.jobType = detailData.jobType || 'N/A';
                job.additionalDetails = detailData.additionalDetails;

                // Push to default Apify Dataset
                await Dataset.pushData(job);

            } catch (detailErr) {
                log.error(`Error scraping job detail for ${job.jobLink}: ${detailErr.message}`);
            }
        }
        // ----- Pagination (uses nextHref captured on the listing page) -----
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextHref) {
                log.info(`Enqueuing next page: ${nextHref}`);
                await crawler.addRequests([
                    {
                        url: nextHref,
                        userData: {
                            ...request.userData,
                            pageCount: request.userData.pageCount + 1,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Reached pagination limit of ${request.userData.paginationLimit} pages.`);
        }
    } catch (err) {
        log.error(`Error scraping ${request.url}: ${err.message}`);
        session.markBad();
    }
}
// Actor.main() calls Actor.init() and Actor.exit() internally,
// so they are not invoked explicitly here.
Actor.main(async () => {
    // ----- Retrieve input or set defaults -----
    const input = await Actor.getInput() || {};
    const {
        searchQuery = 'developer',
        location = 'New York',
        indeedUrl = 'www.indeed.com',
        paginationLimit = 1,
    } = input;

    // Build the Indeed start URL
    const startUrl = `https://${indeedUrl}/jobs?q=${encodeURIComponent(searchQuery)}&l=${encodeURIComponent(location)}`;

    // (Optional) residential proxy usage
    const proxyConfiguration = await Actor.createProxyConfiguration({
        groups: ['RESIDENTIAL'],
    });

    // Create PuppeteerCrawler with puppeteer-extra (stealth + recaptcha plugin)
    const crawler = new PuppeteerCrawler({
        requestHandler: handleRequest,
        maxConcurrency: 1, // keep low concurrency to avoid detection
        maxRequestRetries: 3,
        requestHandlerTimeoutSecs: 180,
        proxyConfiguration,

        // Force Crawlee to use our puppeteer-extra instance with plugins
        launchContext: {
            launcher: puppeteer,
            launchOptions: {
                headless: false,
                args: [
                    '--disable-gpu',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled',
                ],
            },
        },
    });

    // Start the crawler
    await crawler.run([
        {
            url: startUrl,
            userData: {
                pageCount: 0,
                paginationLimit,
            },
        },
    ]);
});
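For reference, each item pushed via Dataset.pushData(job) in the handler above has the following shape. The values below are illustrative placeholders; description, salary, and jobType fall back to 'N/A' when the detail page lacks them:

{
    "title": "Frontend Developer",
    "company": "Example Corp",
    "location": "New York, NY",
    "jobLink": "https://www.indeed.com/viewjob?jk=example",
    "description": "N/A",
    "salary": "N/A",
    "jobType": "N/A",
    "additionalDetails": {}
}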
src/routes.js
// routes.js
// Dataset is exported by crawlee (as in src/main.js), not by the apify package.
import { Dataset } from 'crawlee';
export async function handleRequest({ request, page, log, crawler, session }) {
    log.info(`Scraping page: ${request.url}`);

    try {
        // Optionally block images/fonts for performance
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const resourceType = req.resourceType();
            if (['image', 'font', 'stylesheet'].includes(resourceType)) {
                req.abort();
            } else {
                req.continue();
            }
        });

        // Navigate to the page
        const response = await page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 90000 });
        if (!response || response.status() === 403) {
            log.warning(`Blocked or invalid response (403) at ${request.url}. Marking session as bad.`);
            session.markBad();
            throw new Error(`Request blocked or invalid response for ${request.url}`);
        }

        // Wait for job postings to appear
        await page.waitForSelector('.cardOutline.tapItem', { timeout: 60000 });
        log.info('Job listings container found. Extracting...');

        // Extract job listings
        const jobs = await page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.cardOutline.tapItem').forEach((job) => {
                const titleElement = job.querySelector('h2.jobTitle > a');
                const companyElement = job.querySelector('[data-testid="company-name"]');
                const locationElement = job.querySelector('[data-testid="text-location"]');

                results.push({
                    title: titleElement?.textContent?.trim() || null,
                    company: companyElement?.textContent?.trim() || null,
                    location: locationElement?.textContent?.trim() || null,
                    jobLink: titleElement
                        ? new URL(titleElement.href, 'https://www.indeed.com').href
                        : null,
                });
            });
            return results;
        });
        log.info(`Found ${jobs.length} job postings on this page.`);

        // Capture the next-page link now, before detail-page navigation
        // moves `page` away from the listing DOM.
        const nextPageLink = await page.evaluate(() => {
            const nextButton = document.querySelector('[data-testid="pagination-page-next"]');
            return nextButton ? nextButton.href : null;
        });
        // Optionally navigate to each job's detail page
        for (const job of jobs) {
            if (!job.jobLink) continue;
            log.info(`Scraping detail page: ${job.jobLink}`);

            try {
                const detailResponse = await page.goto(job.jobLink, { waitUntil: 'domcontentloaded', timeout: 90000 });
                if (!detailResponse || detailResponse.status() === 403) {
                    log.warning(`403 on detail page ${job.jobLink}. Marking session as bad.`);
                    session.markBad();
                    continue;
                }

                const jobDetails = await page.evaluate(() => {
                    const descriptionElement = document.querySelector('#jobDescriptionText, .jobsearch-jobDescriptionText');
                    const salaryInfoElement = document.querySelector('#salaryInfoAndJobType');
                    const additionalDetails = {};

                    let salary = null;
                    let jobType = null;

                    if (salaryInfoElement) {
                        const salaryElement = salaryInfoElement.querySelector('.css-19j1a75'); // Salary text
                        const jobTypeElement = salaryInfoElement.querySelector('.css-k5flys'); // Full-time, etc.
                        salary = salaryElement?.textContent.trim() || null;
                        jobType = jobTypeElement?.textContent.trim() || null;
                    }

                    const description = descriptionElement?.textContent.trim() || null;

                    // Gather any extra metadata
                    document.querySelectorAll('.jobsearch-JobInfoHeader-meta > div').forEach((section) => {
                        const key = section.querySelector('h3')?.textContent?.trim();
                        const value = section.querySelector('div')?.textContent?.trim();
                        if (key && value) {
                            additionalDetails[key] = value;
                        }
                    });

                    return { description, salary, jobType, additionalDetails };
                });

                // Merge detail info into the job object
                job.description = jobDetails.description || 'N/A';
                job.salary = jobDetails.salary || 'N/A';
                job.jobType = jobDetails.jobType || 'N/A';
                job.additionalDetails = jobDetails.additionalDetails;

                // Push the final job data into Apify dataset
                await Dataset.pushData(job);

            } catch (detailError) {
                log.error(`Error extracting detail from ${job.jobLink}: ${detailError.message}`);
            }
        }
        // Handle pagination if not at the limit (uses nextPageLink captured above)
        if (request.userData.pageCount < request.userData.paginationLimit - 1) {
            if (nextPageLink) {
                const nextPageCount = request.userData.pageCount + 1;
                log.info(`Queueing next page: ${nextPageLink}`);
                await crawler.addRequests([
                    {
                        url: nextPageLink,
                        userData: {
                            ...request.userData,
                            pageCount: nextPageCount,
                        },
                    },
                ]);
            } else {
                log.info('No more pages found.');
            }
        } else {
            log.info(`Pagination limit reached: ${request.userData.paginationLimit} page(s).`);
        }
    } catch (error) {
        log.error(`Error scraping page ${request.url}: ${error.message}`);
        session.markBad();
    }
}
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{ "name": "crawlee-puppeteer-javascript", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "dependencies": { "apify": "^3.2.6", "crawlee": "^3.11.5", "puppeteer": "*" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0" }, "scripts": { "start": "node src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}