
Upwork Job Scraper
Under maintenance
Pricing
$29.00/month + usage
Go to Store

Upwork Job Scraper
Under maintenance
Upwork Job Scraper is an Apify actor that extracts job listings from Upwork based on keywords. It outputs structured data (title, budget, client info) in JSON/CSV for easy analysis.
5.0 (1)
Pricing
$29.00/month + usage
1
Total users
19
Monthly users
12
Runs succeeded
94%
Last modified
4 hours ago
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
input.json
{ "searchQuery": "javascript", "page": 1, "useApifyProxy": false, "maxConcurrency": 1, "maxRetries": 3, "useEnhancedCrawler": true, "minDelayBetweenRequests": 5, "maxDelayBetweenRequests": 10, "sessionRotationCount": 2, "useCloudScraper": true, "useFallbackHtmlParser": true}
package.json
{ "name": "crawlee-puppeteer-javascript", "version": "0.0.1", "type": "module", "description": "This is an example of a Crawlee project.", "dependencies": { "2captcha": "^3.0.7", "apify": "^3.1.4", "axios": "^1.6.2", "crawlee": "^3.5.4", "fingerprint-generator": "^2.1.66", "fingerprint-injector": "^2.1.66", "https-proxy-agent": "^7.0.6", "node-fetch": "^3.3.2", "proxy-chain": "^2.3.0", "puppeteer": "^22.8.2", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-adblocker": "^2.13.6", "puppeteer-extra-plugin-recaptcha": "^3.6.8", "puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-user-data-dir": "^2.4.1", "puppeteer-extra-plugin-user-preferences": "^2.4.1", "random-useragent": "^0.5.0", "uuid": "^9.0.1" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0" }, "scripts": { "start": "node src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", "pull": "apify pull", "push": "apify push", "commit": "git add . && git commit -m \"Update\" && git push" }, "author": "It's not you it's me", "license": "ISC"}
start_xvfb_and_run_cmd.sh
Download.actor/Dockerfile
# Dockerfile
FROM apify/actor-node-puppeteer-chrome:20
RUN npm ls crawlee apify puppeteer playwright
COPY package*.json ./
RUN npm --quiet set progress=false \ && npm install --omit=dev --omit=optional \ && npm install puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-recaptcha random-useragent \ && echo "Installed NPM packages:" \ && (npm list --omit=dev --all || true) \ && echo "Node.js version:" \ && node --version \ && echo "NPM version:" \ && npm --version \ && rm -r ~/.npm
COPY . ./
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/README.md
1
.actor/actor.json
{ "actorSpecification": 1, "name": "upwork-job-scraper", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.10", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Upwork Job Search", "type": "object", "schemaVersion": 1, "properties": { "searchQuery": { "title": "Search Query", "type": "string", "description": "Keywords to search for on Upwork (e.g., job title, skills)", "editor": "textfield", "default": "" }, "page": { "title": "Page Number", "type": "integer", "description": "The page number to scrape on Upwork", "editor": "number", "default": 1 }, "useEnhancedCrawler": { "title": "Use Enhanced Anti-Detection System", "type": "boolean", "description": "Use the advanced Upwork challenge bypass system (recommended for 403 errors)", "editor": "checkbox", "default": true }, "tlsFingerprintEvasion": { "title": "Enable TLS Fingerprint Evasion", "type": "boolean", "description": "Use advanced TLS fingerprinting evasion to bypass Upwork's security (requires Enhanced Crawler)", "editor": "checkbox", "default": true }, "customHeaderOrder": { "title": "Randomize HTTP Header Order", "type": "boolean", "description": "Randomize HTTP header order to evade fingerprinting detection (requires Enhanced Crawler)", "editor": "checkbox", "default": true }, "advancedFingerprinting": { "title": "Use Advanced Fingerprint Evasion", "type": "boolean", "description": "Apply comprehensive browser fingerprint masking techniques (Canvas, WebGL, Audio, etc.)", "editor": "checkbox", "default": true }, "fingerprintConsistency": { "title": "Maintain Consistent Fingerprints", "type": "boolean", "description": "Keep browser fingerprints consistent within sessions (recommended)", "editor": "checkbox", "default": true }, "fingerprintBrowserProfile": { "title": "Browser Fingerprint Profile", "type": "string", "description": "Browser profile to emulate for fingerprint generation", "editor": "select", "default": "auto-rotate", "enum": ["auto-rotate", "chrome", "safari", "firefox"], "enumTitles": ["Auto Rotate", "Chrome", "Safari", "Firefox"] }, "useApifyProxy": { "title": "Use Apify Proxy", "type": "boolean", "description": "Whether to use Apify 
Proxy", "editor": "checkbox", "default": true, "prefill": true }, "maxConcurrency": { "title": "Max Concurrency", "type": "integer", "description": "Maximum number of concurrent requests (lower is safer)", "editor": "number", "default": 1, "minimum": 1, "maximum": 10 }, "maxRetries": { "title": "Max Retries", "type": "integer", "description": "Maximum number of retries per request", "editor": "number", "default": 3, "minimum": 1, "maximum": 10 }, "upworkUsername": { "title": "Upwork Username", "type": "string", "description": "Optional: Your Upwork username for authenticated access (recommended to bypass 403 errors)", "editor": "textfield" }, "upworkPassword": { "title": "Upwork Password", "type": "string", "description": "Optional: Your Upwork password for authenticated access", "editor": "textfield", "isSecret": true }, "captchaApiKey": { "title": "2Captcha API Key", "type": "string", "description": "Optional: Your 2Captcha API key to solve CAPTCHAs automatically (https://2captcha.com/)", "editor": "textfield", "isSecret": true }, "sessionRotationCount": { "title": "Session Rotation Count", "type": "integer", "description": "Number of browser sessions to rotate through (helps avoid detection)", "editor": "number", "default": 3, "minimum": 1, "maximum": 10 }, "minDelayBetweenRequests": { "title": "Min Delay Between Requests (seconds)", "type": "integer", "description": "Minimum delay in seconds between requests (higher values reduce blocking)", "editor": "number", "default": 15, "minimum": 5, "maximum": 120 }, "maxDelayBetweenRequests": { "title": "Max Delay Between Requests (seconds)", "type": "integer", "description": "Maximum delay in seconds between requests (higher values reduce blocking)", "editor": "number", "default": 45, "minimum": 10, "maximum": 180 }, "sessionCooldownMinutes": { "title": "Session Cooldown Minutes", "type": "integer", "description": "Time in minutes to rest a session after multiple uses", "editor": "number", "default": 30, "minimum": 5, 
"maximum": 120 }, "proxyRotationEnabled": { "title": "Enable Proxy Rotation", "type": "boolean", "description": "Whether to rotate proxies during the session (helps prevent IP-based blocking)", "editor": "checkbox", "default": true }, "proxyRotationRequests": { "title": "Requests Per Proxy", "type": "integer", "description": "Number of requests before rotating to a new proxy", "editor": "number", "default": 3, "minimum": 1, "maximum": 10 }, "simulateBrowserHistory": { "title": "Simulate Browser History", "type": "boolean", "description": "Whether to simulate browser history and cache for a more authentic profile", "editor": "checkbox", "default": true }, "randomizeTimezone": { "title": "Randomize Browser Timezone", "type": "boolean", "description": "Whether to randomize timezone and locale settings per session", "editor": "checkbox", "default": true }, "useNewHeadless": { "title": "Use New Headless Mode", "type": "boolean", "description": "Use Chrome's new headless mode which is less detectable than the old mode", "editor": "checkbox", "default": true }, "useRealProfiles": { "title": "Use Real Browser Profiles", "type": "boolean", "description": "Use real Chrome profiles with history and extensions (better anti-detection)", "editor": "checkbox", "default": true }, "useCloudScraper": { "title": "Use CloudScraper", "type": "boolean", "description": "Use CloudScraper to bypass Cloudflare protection (fallback method)", "editor": "checkbox", "default": true }, "simulateExtensions": { "title": "Simulate Browser Extensions", "type": "boolean", "description": "Simulate common browser extensions to appear more like a real user", "editor": "checkbox", "default": true }, "useFallbackHtmlParser": { "title": "Use Direct HTML Parser Fallback", "type": "boolean", "description": "When browser approach fails, try to extract data directly from HTML (last resort)", "editor": "checkbox", "default": true }, "disableWebSecurity": { "title": "Disable Web Security", "type": "boolean", 
"description": "Disable browser web security features (CORS, etc.) to bypass some protections", "editor": "checkbox", "default": false }, "bypassCSP": { "title": "Bypass Content Security Policy", "type": "boolean", "description": "Bypass website Content Security Policy restrictions", "editor": "checkbox", "default": false }, "proxyConfiguration": { "title": "Proxy configuration", "type": "object", "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.", "prefill": { "useApifyProxy": true, "apifyProxyGroups": [] }, "default": {}, "editor": "proxy" } }}
network_analysis/README.md
1# Upwork Scraper Network Traffic Analysis2
3This directory contains tools to analyze and improve network traffic patterns for the Upwork job scraper, helping to bypass 403 errors and anti-bot detection.4
5## Tools Overview6
1. **capture_traffic.sh** - Script for capturing and analyzing network traffic using TCPDump
2. **apply_findings.js** - Script to modify the scraper code based on network analysis findings
10## Prerequisites11
12- TCPDump (`brew install tcpdump` on macOS)13- Node.js 14+ for the apply_findings.js script14- Fingerprinting packages (`npm install fingerprint-generator fingerprint-injector`)15
16## Getting Started17
18### Step 1: Capture Traffic19
20The capture_traffic.sh script allows you to capture and compare network traffic from:21- A successful manual browser session22- A failing automated scraper session23
24```bash25# Make the script executable26chmod +x capture_traffic.sh27
28# Capture a successful manual browser session (use your normal browser)29./capture_traffic.sh --manual30
31# Capture a failing automated session (run the scraper in another terminal)32./capture_traffic.sh --auto33
34# Compare the two captures to identify differences35./capture_traffic.sh --compare36```37
38### Step 2: Apply Fixes39
40After analyzing the network traffic differences, you can apply fixes to the scraper:41
42```bash43# Install required dependencies44npm install fingerprint-generator fingerprint-injector45
46# Run the script to apply fixes based on traffic analysis47node apply_findings.js48```49
50## What to Look For51
52When comparing successful vs. failing traffic:53
1. **TLS Fingerprinting** - Look for differences in TLS handshakes, cipher suites, and extensions
2. **HTTP Headers** - Check order, values, and presence/absence of specific headers
3. **Browser Fingerprinting** - JavaScript challenges, canvas checks, WebGL rendering
4. **Request Timing** - Natural patterns vs. automated timing
59## Technical Details60
61### Browser Fingerprinting62
63The fingerprint-generator package creates realistic browser fingerprints that match real browser behavior:64
65```javascript66const fingerprint = fingerprintGenerator.getFingerprint({67 browserName: 'chrome',68 browserVersion: 123,69 operatingSystem: 'macos',70 operatingSystemVersion: '10.15.7',71 deviceCategory: 'desktop',72 locale: 'en-US'73});74```75
76Then fingerprint-injector applies this fingerprint to Puppeteer:77
78```javascript79await fingerprintInjector.attachFingerprintToPuppeteer(page, fingerprint);80```81
82### HTTP Headers83
84The order and values of HTTP headers can be used to identify bots. The analysis will help identify the correct header ordering used by real browsers:85
86```87Host: www.upwork.com88Connection: keep-alive89sec-ch-ua: "Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"90sec-ch-ua-mobile: ?091sec-ch-ua-platform: "macOS"92Upgrade-Insecure-Requests: 193User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.3694Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.795Sec-Fetch-Site: none96Sec-Fetch-Mode: navigate97Sec-Fetch-User: ?198Sec-Fetch-Dest: document99Accept-Encoding: gzip, deflate, br100Accept-Language: en-US,en;q=0.9101```102
103### JavaScript Fingerprinting Protection104
105The analysis will also help identify and mitigate JavaScript-based fingerprinting techniques used by Upwork:106
107- Canvas fingerprinting - Adding subtle noise to canvas rendering108- WebGL fingerprinting - Spoofing vendor and renderer information109- Audio fingerprinting - Adding minor variations to audio data110- Navigator property checks - Fixing browser detection properties111- DOM property access patterns - Matching real browser behavior112
113## Troubleshooting114
115If you're still getting 403 errors after applying fixes:116
1171. Try different network interfaces or mobile tethering1182. Use higher quality proxies (mobile/4G proxies work better)1193. Increase the capture time to get more data1204. Try different browsers for the manual capture (Safari, Firefox)1215. Use tcpdump's deeper analysis with `-v` or `-vv` flags122
123## Advanced Usage124
125For more detailed packet analysis:126
127```bash128# Analyze SSL/TLS handshake in detail129sudo tcpdump -i en0 -nn -s0 -vvv "host upwork.com and tcp port 443" -w detailed_capture.pcap130
131# Extract all HTTP headers132tcpdump -A -s 0 'tcp port 443 and (((ip[2:2] - ((ip[0]&0xf)<<2)) - ((tcp[12]&0xf0)>>2)) != 0)' -r capture.pcap133```134
135## Additional Techniques to Try136
137If the fingerprinting techniques don't fully resolve the 403 errors, consider:138
1. **Mobile User Agents** - Try using mobile user agents, which often face less scrutiny
2. **4G/5G Mobile Proxies** - These have a cleaner IP reputation than data center proxies
3. **Modify request timing** - Add more natural, human-like timing between navigation steps
4. **Implement Chrome Extensions API** - More fully simulate popular browser extensions
5. **HTTP/3 Support** - Implement HTTP/3 (QUIC) protocol support to bypass some fingerprinting
network_analysis/analyze_and_fix.sh
Downloadnetwork_analysis/apply_findings.js
1/**2 * This script applies network fingerprinting fixes to the Upwork scraper3 * based on TCPDump analysis findings4 */5
6import fs from 'fs';7import path from 'path';8import { fileURLToPath } from 'url';9
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Location of the scraper entry point this script rewrites: ../src/main.js
const MAIN_JS_PATH = path.join(__dirname, '..', 'src', 'main.js');

// Abort early when there is nothing to patch.
const mainJsIsPresent = fs.existsSync(MAIN_JS_PATH);
if (!mainJsIsPresent) {
    console.error(`Error: ${MAIN_JS_PATH} not found`);
    process.exit(1);
}

// Current text of main.js; kept unchanged so it can be compared with the
// patched text and written out as a backup.
const mainJsContent = fs.readFileSync(MAIN_JS_PATH, 'utf8');
/**
 * Patch the text of main.js so that it generates a browser fingerprint per
 * session and injects it into each page before navigation.
 *
 * The function is a pure text transformation (apart from console logging) and
 * is idempotent: each of its three steps is skipped when the text it would
 * insert is already present, so the script can be re-run safely.
 *
 * Steps:
 *   1. import fingerprint-generator / fingerprint-injector and create the
 *      generator and injector instances;
 *   2. add a `generateFingerprint` helper and attach its result to the session
 *      object that carries the `id: \`session_...\`` property;
 *   3. inject the session fingerprint right after
 *      `await page.setExtraHTTPHeaders(headers);`.
 *
 * @param {string} mainJsContent - Current source text of main.js.
 * @returns {string} The patched source text (unchanged when nothing applied).
 */
function applyFingerprintFixes(mainJsContent) {
    console.log('Applying fingerprinting fixes...');

    let content = mainJsContent;

    // Replace the first occurrence of `anchor`. A function replacer is used so
    // that "$" sequences in the inserted code are never read as replacement
    // patterns by String.prototype.replace.
    const replaceAnchor = (text, anchor, replacement) => {
        if (!text.includes(anchor)) {
            console.log(`Warning: Could not find anchor: ${anchor}`);
            return text;
        }
        return text.replace(anchor, () => replacement);
    };

    // Step 1: imports plus generator/injector instances.
    if (!content.includes('fingerprint-generator') || !content.includes('fingerprint-injector')) {
        console.log('Adding fingerprinting modules...');

        content = replaceAnchor(
            content,
            'import { fileURLToPath } from \'url\';',
            'import { fileURLToPath } from \'url\';\nimport { FingerprintGenerator } from \'fingerprint-generator\';\nimport { FingerprintInjector } from \'fingerprint-injector\';',
        );

        const fingerprintGeneratorCode = `
// Initialize fingerprint generator for more accurate browser emulation
const fingerprintGenerator = new FingerprintGenerator({
    browsers: [
        { name: 'chrome', minVersion: 100, maxVersion: 123 },
        { name: 'firefox', minVersion: 100, maxVersion: 123 },
        { name: 'safari', minVersion: 15, maxVersion: 17 }
    ],
    devices: ['desktop'],
    operatingSystems: ['macos', 'windows'],
    locales: ['en-US', 'en-GB', 'de-DE'],
    // Use consistent fingerprints per session for less detection
    cache: true
});

// Fingerprint injector for applying fingerprints to browser
const fingerprintInjector = new FingerprintInjector();`;

        content = replaceAnchor(
            content,
            '// Generate new sessions if needed',
            `${fingerprintGeneratorCode}\n\n// Generate new sessions if needed`,
        );

        console.log('Added fingerprint generator and injector');
    }

    // Step 2: generateFingerprint helper + fingerprint on the session object.
    // 'const generateFingerprint' is what this step actually inserts; the older
    // 'generateFingerprint()' marker is still honoured for hand-edited files.
    const hasFingerprintHelper = content.includes('const generateFingerprint')
        || content.includes('generateFingerprint()');
    if (!hasFingerprintHelper) {
        console.log('Adding fingerprint generation to sessions...');

        const generateFingerprintCode = `
// Generate consistent fingerprint for a session
const generateFingerprint = (sessionId, profile) => {
    const userAgent = profile?.userAgent ?? '';
    const lowerUserAgent = userAgent.toLowerCase();

    // Chrome user agents also contain "Safari", so Chrome is tested before Safari
    const browserName = lowerUserAgent.includes('firefox') ? 'firefox'
        : lowerUserAgent.includes('chrome') ? 'chrome'
        : lowerUserAgent.includes('safari') ? 'safari' : 'chrome';

    const osName = profile?.platform === 'MacIntel' ? 'macos' : 'windows';

    const locale = profile?.locale || 'en-US';

    // Major version from the user agent, or a random recent version as fallback
    const versionMatch = userAgent.match(/Chrome\\/(\\d+)/)
        || userAgent.match(/Firefox\\/(\\d+)/)
        || userAgent.match(/Version\\/(\\d+)/);
    const browserVersion = versionMatch
        ? Number.parseInt(versionMatch[1], 10)
        : Math.floor(Math.random() * 20) + 100;

    return fingerprintGenerator.getFingerprint({
        browserName,
        browserVersion,
        operatingSystem: osName,
        operatingSystemVersion: osName === 'macos' ? '10.15.7' : '10.0',
        deviceCategory: 'desktop',
        locale
    });
};`;

        content = replaceAnchor(
            content,
            '// Generate fake browsing history for a more authentic browser profile',
            `${generateFingerprintCode}\n\n// Generate fake browsing history for a more authentic browser profile`,
        );

        // Attach the fingerprint to the session object. The `return {` that is
        // patched is the nearest one BEFORE the session id line, not simply the
        // first `return {` in the file, which may belong to unrelated code.
        // The "${...}" sequences below are literal text of main.js (plain strings).
        const returnAnchor = 'return {';
        const sessionIdLine = 'id: `session_${Date.now()}_${i}`,';
        const idIndex = content.indexOf(sessionIdLine);
        const returnIndex = idIndex === -1 ? -1 : content.lastIndexOf(returnAnchor, idIndex);

        if (returnIndex === -1) {
            console.log('Warning: Could not find the session object to attach a fingerprint to');
        } else {
            content = [
                content.slice(0, returnIndex),
                'const fingerprint = generateFingerprint(`session_${Date.now()}_${i}`, profile);\n        return {',
                content.slice(returnIndex + returnAnchor.length, idIndex),
                sessionIdLine,
                '\n            fingerprint,',
                content.slice(idIndex + sessionIdLine.length),
            ].join('');
        }

        console.log('Added fingerprint generation to sessions');
    }

    // Step 3: inject the fingerprint inside the pre-navigation hook.
    // 'attachFingerprintToPuppeteer' is what this step actually inserts; the
    // older 'injectFingerprint' marker is still honoured for hand-edited files.
    const hasInjection = content.includes('attachFingerprintToPuppeteer')
        || content.includes('injectFingerprint');
    if (!hasInjection) {
        console.log('Adding fingerprint injection to navigation...');

        // The hook is recognised by the (French) comment that precedes it in main.js.
        const navHooksRegex = /preNavigationHooks: \[\s*\/\/ Hook pour ajouter des délais aléatoires entre requêtes et configurer le navigateur\s*async \(\{ page, request, session \}\) => \{([\s\S]*?)\},\s*\],/;

        if (navHooksRegex.test(content)) {
            // NOTE(review): the injected code reads `sessionIndex`, `sessionId` and
            // `sessions`, which are assumed to be in scope inside the hook — confirm
            // against main.js.
            const fingerPrintInjectionCode = `
            // Apply fingerprint from session
            if (sessionIndex !== -1 && sessions[sessionIndex].fingerprint) {
                console.log(\`Injecting fingerprint for session \${sessionId}...\`);
                try {
                    await fingerprintInjector.attachFingerprintToPuppeteer(page, sessions[sessionIndex].fingerprint);
                    console.log('Fingerprint successfully injected');
                } catch (error) {
                    console.error('Error injecting fingerprint:', error.message);
                }
            }`;

            // Insert right after the headers are set.
            content = replaceAnchor(
                content,
                'await page.setExtraHTTPHeaders(headers);',
                `await page.setExtraHTTPHeaders(headers);\n${fingerPrintInjectionCode}`,
            );

            console.log('Added fingerprint injection to navigation hooks');
        } else {
            console.log('Warning: Could not find preNavigationHooks section');
        }
    }

    return content;
}
/**
 * Replace the `const headerOrder = [...]` declaration in main.js with an
 * ordering modelled on real browser traffic.
 *
 * The emitted code keeps the primary headers in a fixed position and, about
 * 20% of the time, shuffles only the secondary headers. (A random `.sort()`
 * comparator cannot do this: it never sees which header is "primary" and its
 * result order is implementation-defined.)
 *
 * The function is idempotent: it is a no-op when the text already contains
 * the marker comment "naturalistic header ordering".
 *
 * @param {string} mainJsContent - Current source text of main.js.
 * @returns {string} The patched source text, or the input when the
 *   declaration is missing or was already enhanced.
 */
function enhanceHeaderOrdering(mainJsContent) {
    console.log('Enhancing HTTP header ordering...');

    // Already applied on a previous run.
    if (mainJsContent.includes('naturalistic header ordering')) {
        console.log('HTTP header enhancements already exist, skipping...');
        return mainJsContent;
    }

    // Matches the existing array literal, up to the first "];".
    const headerOrderRegex = /const headerOrder = \[([\s\S]*?)\];/;
    if (!headerOrderRegex.test(mainJsContent)) {
        console.log('Warning: Could not find header ordering code to enhance');
        return mainJsContent;
    }

    const improvedHeaderOrder = `const headerOrder = (() => {
    // More naturalistic header ordering based on TCPDump analysis
    // Primary headers always come first in real browsers
    const primaryHeaders = [
        'Host',
        'Connection',
        'Cache-Control',
        'sec-ch-ua',
        'sec-ch-ua-mobile',
        'sec-ch-ua-platform',
        'Upgrade-Insecure-Requests',
        'User-Agent',
        'Accept',
        'Accept-Encoding',
        'Accept-Language'
    ];
    // Secondary headers might appear in this order
    const secondaryHeaders = [
        'Sec-Fetch-Site',
        'Sec-Fetch-Mode',
        'Sec-Fetch-User',
        'Sec-Fetch-Dest',
        'Referer',
        'Cookie',
        'DNT'
    ];
    // Maintain primary header ordering but occasionally randomize the others
    if (Math.random() > 0.8) {
        for (let i = secondaryHeaders.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [secondaryHeaders[i], secondaryHeaders[j]] = [secondaryHeaders[j], secondaryHeaders[i]];
        }
    }
    return [...primaryHeaders, ...secondaryHeaders];
})();`;

    // Function replacer: the inserted code is taken literally, never as a "$" pattern.
    const patched = mainJsContent.replace(headerOrderRegex, () => improvedHeaderOrder);

    console.log('Enhanced header ordering pattern');
    return patched;
}
/**
 * Replace the first `await page.evaluateOnNewDocument(() => { ... });` call in
 * main.js with a larger in-page script that masks common automation
 * fingerprints (navigator properties, window.chrome, permissions, WebGL,
 * canvas and audio).
 *
 * The function is idempotent: it is skipped when the text already contains the
 * marker comment "Enhanced fingerprinting protection" (compared
 * case-insensitively, because the inserted comment is capitalised).
 *
 * NOTE(review): the lookup regex is lazy and ends at the first "});" after the
 * call opens. If the existing callback body itself contains "});", only its
 * leading part is replaced — check the result (a backup of main.js is written
 * by the caller).
 *
 * @param {string} mainJsContent - Current source text of main.js.
 * @returns {string} The patched source text, or the input when no matching
 *   call exists or the enhancement was already applied.
 */
function enhanceBrowserFingerprinting(mainJsContent) {
    console.log('Enhancing browser fingerprinting protection...');

    // Already applied on a previous run.
    if (/enhanced fingerprinting protection/i.test(mainJsContent)) {
        console.log('Enhanced fingerprinting protection already exists, skipping...');
        return mainJsContent;
    }

    // Find the page.evaluateOnNewDocument section used for fingerprinting.
    const fingerprintRegex = /await page\.evaluateOnNewDocument\(\(\) => \{([\s\S]*?)\}\);/;
    if (!fingerprintRegex.test(mainJsContent)) {
        console.log('Warning: Could not find fingerprinting code to enhance');
        return mainJsContent;
    }

    // Script that main.js will run in every new document.
    const enhancedFingerprinting = `await page.evaluateOnNewDocument(() => {
        // Enhanced fingerprinting protection based on TCPDump analysis

        // Hide automation fingerprints
        delete Object.getPrototypeOf(navigator).webdriver;

        // Override platform info with more accurate values
        Object.defineProperty(navigator, 'platform', {
            get: () => 'MacIntel',
        });

        // Override hardware concurrency
        Object.defineProperty(navigator, 'hardwareConcurrency', {
            get: () => 8,
        });

        // Override device memory
        Object.defineProperty(navigator, 'deviceMemory', {
            get: () => 8,
        });

        // Improve Chrome properties emulation
        if (typeof window !== 'undefined') {
            window.chrome = {
                app: {
                    isInstalled: false,
                    InstallState: { DISABLED: 'disabled', INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
                    RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' }
                },
                runtime: {
                    OnInstalledReason: { CHROME_UPDATE: 'chrome_update', INSTALL: 'install', SHARED_MODULE_UPDATE: 'shared_module_update', UPDATE: 'update' },
                    OnRestartRequiredReason: { APP_UPDATE: 'app_update', OS_UPDATE: 'os_update', PERIODIC: 'periodic' },
                    PlatformArch: { ARM: 'arm', ARM64: 'arm64', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
                    PlatformNaclArch: { ARM: 'arm', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
                    PlatformOs: { ANDROID: 'android', CROS: 'cros', LINUX: 'linux', MAC: 'mac', OPENBSD: 'openbsd', WIN: 'win' },
                    RequestUpdateCheckStatus: { NO_UPDATE: 'no_update', THROTTLED: 'throttled', UPDATE_AVAILABLE: 'update_available' }
                }
            };
        }

        // Fix permissions behavior
        const originalQuery = window.navigator.permissions?.query;
        if (originalQuery) {
            window.navigator.permissions.query = (parameters) => {
                if (parameters.name === 'notifications') {
                    return Promise.resolve({ state: Notification.permission });
                }
                if (parameters.name === 'clipboard-read' || parameters.name === 'clipboard-write') {
                    return Promise.resolve({ state: 'prompt' });
                }
                return originalQuery(parameters);
            };
        }

        // Fix WebGL fingerprinting - more accurate values from real browsers
        if (window.WebGLRenderingContext) {
            const getParameter = WebGLRenderingContext.prototype.getParameter;
            WebGLRenderingContext.prototype.getParameter = function(parameter) {
                // UNMASKED_VENDOR_WEBGL
                if (parameter === 37445) {
                    return 'Intel Inc.';
                }
                // UNMASKED_RENDERER_WEBGL
                if (parameter === 37446) {
                    return 'Intel Iris Pro OpenGL Engine';
                }
                return getParameter.call(this, parameter);
            };
        }

        // Fix canvas fingerprinting - add subtle noise
        const originalGetContext = HTMLCanvasElement.prototype.getContext;
        HTMLCanvasElement.prototype.getContext = function(type, attributes) {
            const context = originalGetContext.call(this, type, attributes);
            if (context && type === '2d') {
                const originalFillText = context.fillText;
                context.fillText = function() {
                    // Add subtle variations to text rendering
                    const args = arguments;
                    if (Math.random() < 0.2 && args[0]) {
                        args[0] = args[0].split('').map(c => Math.random() < 0.05 ? c + String.fromCharCode(8203) : c).join('');
                    }
                    return originalFillText.apply(this, args);
                };

                // Affect toDataURL to add subtle noise
                const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
                HTMLCanvasElement.prototype.toDataURL = function() {
                    // Only add noise for likely fingerprinting scenarios
                    if (this.width === 16 && this.height === 16 ||
                        this.width <= 2 && this.height <= 2 ||
                        this.width <= 200 && this.height <= 50) {

                        const context = this.getContext('2d');
                        if (context) {
                            // Add minimal noise that's invisible to humans
                            const imageData = context.getImageData(0, 0, this.width, this.height);
                            const data = imageData.data;

                            // Only modify 1-2 pixels with minor variations
                            const pixelsToModify = Math.max(1, Math.floor(this.width * this.height * 0.01));
                            for (let i = 0; i < pixelsToModify; i++) {
                                const pixelIndex = Math.floor(Math.random() * data.length / 4) * 4;
                                // Only make very minor adjustments to color values
                                data[pixelIndex] = Math.min(255, Math.max(0, data[pixelIndex] + (Math.random() < 0.5 ? -1 : 1)));
                                data[pixelIndex + 1] = Math.min(255, Math.max(0, data[pixelIndex + 1] + (Math.random() < 0.5 ? -1 : 1)));
                                data[pixelIndex + 2] = Math.min(255, Math.max(0, data[pixelIndex + 2] + (Math.random() < 0.5 ? -1 : 1)));
                            }

                            context.putImageData(imageData, 0, 0);
                        }
                    }
                    return originalToDataURL.apply(this, arguments);
                };
            }
            return context;
        };

        // Fix audio fingerprinting
        const audioContext = window.AudioContext || window.webkitAudioContext;
        if (audioContext) {
            const originalGetChannelData = AudioBuffer.prototype.getChannelData;
            AudioBuffer.prototype.getChannelData = function() {
                const channelData = originalGetChannelData.apply(this, arguments);
                // Only add noise for very short buffers (likely fingerprinting)
                if (this.length < 100) {
                    const noise = 0.0001; // Very subtle noise
                    // Only modify a few random samples
                    const samplesToModify = Math.max(1, Math.floor(channelData.length * 0.001));
                    for (let i = 0; i < samplesToModify; i++) {
                        const index = Math.floor(Math.random() * channelData.length);
                        channelData[index] += (Math.random() * 2 - 1) * noise;
                    }
                }
                return channelData;
            };
        }
    });`;

    // Function replacer: the inserted code is taken literally, never as a "$" pattern.
    const patched = mainJsContent.replace(fingerprintRegex, () => enhancedFingerprinting);

    console.log('Enhanced browser fingerprinting protection');
    return patched;
}
// Run every transformation in order, feeding each one the previous result.
const transformations = [
    applyFingerprintFixes,
    enhanceHeaderOrdering,
    enhanceBrowserFingerprinting,
];
const updatedContent = transformations.reduce(
    (content, transform) => transform(content),
    mainJsContent,
);

// Only touch the disk when something actually changed.
if (updatedContent === mainJsContent) {
    console.log('No changes were made to the file');
} else {
    // Keep a timestamped copy of the original next to main.js first.
    const backupPath = `${MAIN_JS_PATH}.bak.${Date.now()}`;
    fs.writeFileSync(backupPath, mainJsContent);
    console.log(`Original file backed up to: ${backupPath}`);

    // Then overwrite main.js with the patched text.
    fs.writeFileSync(MAIN_JS_PATH, updatedContent);
    console.log(`Successfully updated ${MAIN_JS_PATH} with fingerprinting fixes`);
}

// Closing hint for the operator.
const nextSteps = [
    '\nNext steps:',
    '1. Run the capture_traffic.sh script to collect network data',
    '2. Use the data to further refine the fingerprinting in this script',
    '3. Re-run this script to apply improved fingerprinting settings',
];
for (const line of nextSteps) {
    console.log(line);
}
network_analysis/capture_traffic.sh
Downloadsrc/enhanced-crawler.js
1/**2 * Enhanced Upwork Crawler3 * Uses advanced challenge bypass techniques4 */5
6import { Actor } from 'apify';7import { Dataset } from 'crawlee';8import fs from 'fs/promises';9import path from 'path';10import { fileURLToPath } from 'url';11import { parseJobsFromHTML } from './main.js';12import {13 createEnhancedSession,14 enhancedPageNavigation,15 cleanupSession16} from './upwork-challenge-integrator.js';17import {18 applySessionFingerprinting,19 createFingerprintRotationStrategy20} from './fingerprint-integration.js';21
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Sessions created so far, keyed by session id.
const activeSessions = {};

// Per-session usage counters and timestamps, keyed by session id.
const sessionStats = {};
/**
 * Create the ids for a pool of sessions to rotate through and register an
 * empty statistics record for each of them in `sessionStats`.
 *
 * Only `sessionCount` is read here; `browserProfiles`, `input` and
 * `proxyConfiguration` may be passed by callers but are not used by this
 * function.
 *
 * @param {Object} [options] Configuration options
 * @param {number} [options.sessionCount=3] Number of session ids to create
 * @returns {Promise<string[]>} The generated session ids
 */
async function initializeSessionPool(options = {}) {
    const { sessionCount = 3 } = options;

    console.log(`Creating ${sessionCount} new sessions for rotation`);

    // Fresh counters for a session that has not handled any request yet.
    const blankStats = () => ({
        requestCount: 0,
        successCount: 0,
        failureCount: 0,
        lastUsed: null,
        createdAt: Date.now(),
    });

    const pool = [];
    // Ids are numbered from 1 and prefixed with the creation timestamp.
    for (let ordinal = 1; ordinal <= sessionCount; ordinal += 1) {
        const id = `session_${Date.now()}_${ordinal}`;
        pool.push(id);
        sessionStats[id] = blankStats();
    }

    return pool;
}
/**
 * Pick the session that should handle the next request.
 *
 * Candidates are ranked, best first, by:
 *   1. never-used sessions (`lastUsed === null`);
 *   2. sessions outside their cooldown window before sessions inside it;
 *   3. sessions below `maxRequests` before sessions at or above it;
 *   4. least recently used.
 * Statistics are read from `sessionStats`; ids without a record are treated as
 * never used with zero requests.
 *
 * @param {Array} sessionIds Array of session IDs
 * @param {Object} [options] Configuration options
 * @param {number} [options.cooldownMinutes=30] Cooldown window after a use
 * @param {number} [options.maxRequests=3] Request count that demotes a session
 * @returns {string} Best session id (undefined when `sessionIds` is empty)
 */
function getNextSession(sessionIds, options = {}) {
    const { cooldownMinutes = 30, maxRequests = 3 } = options;

    const referenceTime = Date.now();
    const cooldownWindowMs = cooldownMinutes * 60 * 1000;

    // Reduce a session's stats to the facts the ranking needs.
    const summarize = (id) => {
        const stats = sessionStats[id] || { requestCount: 0, lastUsed: null };
        return {
            neverUsed: stats.lastUsed === null,
            coolingDown: Boolean(stats.lastUsed && (referenceTime - stats.lastUsed < cooldownWindowMs)),
            belowLimit: stats.requestCount < maxRequests,
            atLimit: stats.requestCount >= maxRequests,
            lastUsed: stats.lastUsed || 0,
        };
    };

    const byPriority = (idA, idB) => {
        const a = summarize(idA);
        const b = summarize(idB);

        // Never-used sessions win outright.
        if (a.neverUsed !== b.neverUsed) {
            return a.neverUsed ? -1 : 1;
        }

        // A session in cooldown loses to one that is not.
        if (a.coolingDown !== b.coolingDown) {
            return a.coolingDown ? 1 : -1;
        }

        // Same cooldown state: prefer the one still under the request limit.
        if (a.belowLimit && b.atLimit) {
            return -1;
        }
        if (a.atLimit && b.belowLimit) {
            return 1;
        }

        // Otherwise the least recently used goes first.
        return a.lastUsed - b.lastUsed;
    };

    // Sort a copy so the caller's array keeps its order.
    const [best] = [...sessionIds].sort(byPriority);
    return best;
}
/**
 * Update session stats after use.
 *
 * Increments the request counter, stamps `lastUsed` with the current time and
 * counts the request as a success or a failure depending on `result.success`.
 * A statistics record is created on the fly for an unknown session ID.
 *
 * @param {string} sessionId Session ID
 * @param {Object} [result] Result of the request; only `result.success` is read
 */
function updateSessionStats(sessionId, result = {}) {
    if (!sessionStats[sessionId]) {
        sessionStats[sessionId] = {
            requestCount: 0,
            successCount: 0,
            failureCount: 0,
            lastUsed: null,
            createdAt: Date.now(),
        };
    }

    const stats = sessionStats[sessionId];
    stats.requestCount += 1;
    stats.lastUsed = Date.now();

    if (result.success) {
        stats.successCount += 1;
    } else {
        stats.failureCount += 1;
    }
}
/**
 * Capture a screenshot of the visible viewport and store it in the default
 * Apify key-value store.
 *
 * This is best effort: any failure is logged and reported as `null` so that a
 * diagnostics problem never interrupts the crawl.
 *
 * @param {Object} page Puppeteer page to capture
 * @param {string} filename Key prefix; a timestamp and `.jpeg` are appended
 * @returns {Promise<string|null>} API URL of the stored record, or null on failure
 */
const takeScreenshot = async (page, filename) => {
    try {
        // Low-quality JPEG of the viewport only, to keep the stored record small.
        const screenshotBuffer = await page.screenshot({
            type: "jpeg",
            quality: 50,
            fullPage: false,
        });
        const screenshotKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(screenshotKey, screenshotBuffer, {
            contentType: "image/jpeg",
        });
        const storeId = Actor.getEnv().defaultKeyValueStoreId;
        const screenshotUrl = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${screenshotKey}`;
        console.log(`📸 Screenshot saved: ${screenshotUrl}`);
        return screenshotUrl;
    } catch (error) {
        console.error(`❌ Failed to take screenshot: ${error.message}`);
        return null;
    }
};
/**
 * Main crawler function.
 *
 * Walks the Upwork job search result pages 1..maxPages. For every page it
 * picks a session from the rotation pool, (re)creates the browser session if
 * needed, navigates, stores any extracted jobs in the default dataset and then
 * waits a randomized delay. Sessions flagged by the navigation result are torn
 * down so that the next iteration starts from a fresh browser.
 *
 * A page that fails because of a broken or rotated session is retried, at most
 * `input.maxRetries` times (default 3), before the crawler moves on.
 *
 * Errors are logged, never thrown; all open sessions are cleaned up on exit.
 *
 * @param {Object} [options] Crawler options
 * @param {string} [options.searchQuery='javascript'] Search keywords; empty searches all jobs
 * @param {number} [options.maxPages=5] Number of result pages to visit
 * @param {Object} [options.input={}] Actor input (delays, rotation and retry settings)
 * @param {Object|null} [options.proxyConfiguration=null] Object exposing `newUrl(sessionId)`
 * @param {Array} [options.browserProfiles=[]] Browser profiles forwarded to session creation
 * @returns {Promise<void>}
 */
async function crawlUpworkJobs(options = {}) {
    const {
        searchQuery = 'javascript',
        maxPages = 5,
        input = {},
        proxyConfiguration = null,
        browserProfiles = [],
    } = options;

    const minDelay = input.minDelayBetweenRequests || 15;
    // Guard against a misconfigured max below min, which would yield a negative range.
    const maxDelay = Math.max(input.maxDelayBetweenRequests || 45, minDelay);
    const maxRetries = Number.isInteger(input.maxRetries) && input.maxRetries >= 0 ? input.maxRetries : 3;

    const sleep = (seconds) => new Promise((resolve) => setTimeout(resolve, seconds * 1000));

    // Tear a session down at most once and forget it, so that the next
    // iteration using this ID builds a fresh browser.
    const discardSession = async (sessionId, session) => {
        if (activeSessions[sessionId] !== session) return;
        delete activeSessions[sessionId];
        await cleanupSession(session);
    };

    try {
        // Initialize session pool
        const sessionIds = await initializeSessionPool({
            sessionCount: input.sessionRotationCount || 3,
            browserProfiles,
            input,
            proxyConfiguration,
        });

        let page = 1;
        let retriesUsed = 0;

        // Either stay on the current page (while retry budget remains) or advance.
        const finishPage = (retry) => {
            if (retry && retriesUsed < maxRetries) {
                retriesUsed += 1;
                return;
            }
            if (retry) {
                console.log(`Giving up on page ${page} after ${retriesUsed} retries`);
            }
            retriesUsed = 0;
            page += 1;
        };

        // Process each page
        while (page <= maxPages) {
            // Get next available session
            const sessionId = getNextSession(sessionIds, {
                cooldownMinutes: input.sessionCooldownMinutes || 30,
                maxRequests: input.proxyRotationRequests || 3,
            });

            console.log(`Using session ${sessionId} for this crawl`);

            // Get or create session
            let session = activeSessions[sessionId];
            if (!session) {
                // Get proxy URL if configured
                let proxyUrl = null;
                if (proxyConfiguration) {
                    proxyUrl = await proxyConfiguration.newUrl(sessionId);
                }

                // Create enhanced session
                session = await createEnhancedSession({
                    sessionId,
                    proxyUrl,
                    headless: !input.headful,
                    input,
                    browserProfiles,
                });

                // Apply advanced fingerprinting specific to this session
                if (session.page && !session.error) {
                    try {
                        const fingerprintConfig = await applySessionFingerprinting(session.page, sessionId);
                        console.log(`Applied consistent fingerprinting for session ${sessionId} using profile: ${fingerprintConfig.browserProfile}`);
                        session.fingerprintConfig = fingerprintConfig;
                    } catch (err) {
                        console.log(`Error applying fingerprinting: ${err.message}`);
                    }
                }

                activeSessions[sessionId] = session;
            }

            // Check if session has issues
            if (session.error || session.connectionFailed || session.setupFailed) {
                console.log(`Session ${sessionId} has issues: ${session.error || 'Failed setup'}`);

                // Clean up and remove problematic session
                await discardSession(sessionId, session);

                // Swap the broken ID for a replacement; leaving it in the pool
                // would get it selected again as a "never used" session.
                const brokenIndex = sessionIds.indexOf(sessionId);
                if (brokenIndex !== -1) {
                    sessionIds.splice(brokenIndex, 1);
                }
                const newSessionId = `session_${Date.now()}_replacement`;
                sessionIds.push(newSessionId);

                console.log(`Created replacement session ${newSessionId}`);

                // The page was never requested, so retry it instead of skipping it.
                finishPage(true);
                continue;
            }

            // Build search URL
            const pageNumber = page;
            let url;
            if (searchQuery && searchQuery.trim() !== '') {
                url = `https://www.upwork.com/nx/search/jobs/?q=${encodeURIComponent(searchQuery)}${pageNumber > 1 ? `&page=${pageNumber}` : ''}`;
            } else {
                url = `https://www.upwork.com/nx/search/jobs/${pageNumber > 1 ? `?page=${pageNumber}` : ''}`;
            }

            // Navigate to page and extract jobs
            const result = await enhancedPageNavigation({
                session,
                url,
                input,
                parseJobsFunction: parseJobsFromHTML,
            });

            // Update session stats
            updateSessionStats(sessionId, result);

            // Capture challenge evidence right away, while the page is still
            // open; the failure handling below may close the browser.
            if (result.isChallengePage && session.page) {
                try {
                    const screenshotUrl = await takeScreenshot(session.page, 'cloudflare_challenge_enhanced');
                    const html = await session.page.content();
                    await Actor.setValue(`cloudflare_challenge_enhanced_${Date.now()}.html`, html);
                    if (screenshotUrl) {
                        console.log(`Cloudflare challenge screenshot: ${screenshotUrl}`);
                    }
                } catch (err) {
                    // Diagnostics only: never let them abort the crawl.
                    console.log(`Failed to capture challenge page: ${err.message}`);
                }
            }

            let retryPage = false;

            if (result.success && result.jobs && result.jobs.length > 0) {
                // Handle successful result
                console.log(`Successfully extracted ${result.jobs.length} jobs from page ${pageNumber} using ${result.method}`);

                // Save jobs to dataset
                await Dataset.pushData(result.jobs);

                // Add delay between successful requests: uniform integer in [minDelay, maxDelay]
                const delaySeconds = Math.floor(Math.random() * (maxDelay - minDelay + 1)) + minDelay;

                console.log(`Waiting ${delaySeconds} seconds before the next request...`);
                await sleep(delaySeconds);
            } else {
                // Handle failure
                console.log(`Failed to extract jobs from page ${pageNumber}: ${result.error || 'Unknown error'}`);

                if (result.needsSessionRotation) {
                    // Clean up and rotate to new session
                    await discardSession(sessionId, session);

                    // Retry with fresh session after delay
                    const delaySeconds = Math.floor(Math.random() * 30) + 30;
                    console.log(`Session rotation needed. Waiting ${delaySeconds} seconds before retrying...`);
                    await sleep(delaySeconds);

                    retryPage = true;
                }

                if (result.needsBrowserReset) {
                    // Reset browser but keep session (no-op if already discarded above)
                    await discardSession(sessionId, session);
                }

                // Increase delay after errors
                await sleep(10);
            }

            // Check if we need to rotate proxy
            if (proxyConfiguration && input.proxyRotationEnabled
                && sessionStats[sessionId].requestCount >= (input.proxyRotationRequests || 3)) {
                console.log(`Rotating proxy for session ${sessionId}`);

                // Close the existing browser; the session is rebuilt with a new
                // proxy URL the next time this ID is selected.
                await discardSession(sessionId, session);

                const proxyUrl = await proxyConfiguration.newUrl(sessionId);
                console.log(`Rotating proxy for session ${sessionId} to: ${proxyUrl}`);
            }

            finishPage(retryPage);
        }

        console.log('Crawl complete, cleaning up sessions');
    } catch (error) {
        console.error('Error in crawler:', error);
    } finally {
        // Clean up all sessions, whether the crawl finished or failed.
        for (const sessionId of Object.keys(activeSessions)) {
            const session = activeSessions[sessionId];
            delete activeSessions[sessionId];
            try {
                await cleanupSession(session);
            } catch (cleanupError) {
                console.error(`Error cleaning up session ${sessionId}:`, cleanupError);
            }
        }
    }
}
340export {341 crawlUpworkJobs,342 initializeSessionPool,343 getNextSession,344};
src/fingerprint-enhancement.js
/**
 * Advanced Browser Fingerprint Enhancement
 * Specialized techniques to defeat Upwork's fingerprinting detection
 */
6import { Page } from 'puppeteer';7
/**
 * Apply advanced fingerprint protection to a page.
 *
 * Registers a series of scripts that run before any page script on every new
 * document (via `page.evaluateOnNewDocument`) and optionally sets the user
 * agent. Each injected function is self-contained because it is serialized and
 * executed inside the browser.
 *
 * @param {Page} page Puppeteer page instance
 * @param {Object} [options] Configuration options
 * @param {string} [options.deviceProfile='modern_desktop'] Navigator properties are only overridden for 'modern_desktop'
 * @param {boolean} [options.webglNoise=true] Spoof WebGL vendor/renderer and jitter matrix parameters
 * @param {boolean} [options.audioNoise=true] Add noise to AudioBuffer channel data
 * @param {boolean} [options.fontConsistency=true] Report a fixed list of common fonts as available
 * @param {boolean} [options.hideTempStorage=true] Replace local/sessionStorage with isolated in-memory stores
 * @param {boolean} [options.consistentTimezone=true] Report `timezone` through Date and Intl
 * @param {string} [options.userAgent] User agent to set; left untouched when omitted
 * @param {string} [options.timezone='America/New_York'] IANA timezone name to emulate
 * @returns {Promise<void>}
 */
export async function applyAdvancedFingerprinting(page, options = {}) {
    const {
        deviceProfile = 'modern_desktop',
        webglNoise = true,
        audioNoise = true,
        fontConsistency = true,
        hideTempStorage = true,
        consistentTimezone = true,
        userAgent,
        timezone = 'America/New_York',
    } = options;

    console.log('Applying enhanced browser fingerprint protection...');

    // 1. Canvas fingerprinting protection
    await page.evaluateOnNewDocument(() => {
        // Original implementations to reference
        const nativeGetContext = HTMLCanvasElement.prototype.getContext;
        const nativeToDataURL = HTMLCanvasElement.prototype.toDataURL;
        const nativeGetImageData = CanvasRenderingContext2D.prototype.getImageData;

        // Remember which kind of context each canvas was created with
        HTMLCanvasElement.prototype.getContext = function (contextType, contextAttributes) {
            const context = nativeGetContext.call(this, contextType, contextAttributes);
            if (context && ['2d', 'webgl', 'webgl2'].includes(contextType) && !this.__modified) {
                this.__modified = true;
                this.__contextType = contextType;
            }
            return context;
        };

        HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
            const is2d = this.__modified && this.__contextType === '2d';
            // getImageData throws on zero-sized canvases, so leave those alone.
            if (!is2d || this.width === 0 || this.height === 0) {
                return nativeToDataURL.call(this, type, quality);
            }

            const context = nativeGetContext.call(this, '2d');
            let pristine;
            try {
                pristine = nativeGetImageData.call(context, 0, 0, this.width, this.height);
            } catch (err) {
                // e.g. tainted canvas: let the native call raise its own error
                return nativeToDataURL.call(this, type, quality);
            }

            // Subtle noise that won't be visible but changes the fingerprint
            const noisy = new ImageData(new Uint8ClampedArray(pristine.data), pristine.width, pristine.height);
            for (let i = 0; i < noisy.data.length; i += 4) {
                // Only touch the alpha channel of roughly 0.5% of the pixels
                if (Math.random() < 0.005) {
                    const offset = Math.floor(Math.random() * 3) - 1; // -1, 0 or +1
                    noisy.data[i + 3] = Math.max(0, Math.min(255, noisy.data[i + 3] + offset));
                }
            }

            context.putImageData(noisy, 0, 0);
            try {
                return nativeToDataURL.call(this, type, quality);
            } finally {
                // Restore the drawing so repeated exports do not accumulate noise
                context.putImageData(pristine, 0, 0);
            }
        };
    });

    // 2. WebGL fingerprinting protection
    if (webglNoise) {
        await page.evaluateOnNewDocument(() => {
            const patchGetParameter = (proto) => {
                const nativeGetParameter = proto.getParameter;

                proto.getParameter = function (parameter) {
                    // RENDERER and VENDOR are most commonly used for fingerprinting
                    if (parameter === 37446) { // UNMASKED_RENDERER_WEBGL
                        return 'Intel Iris OpenGL Engine';
                    }
                    if (parameter === 37445) { // UNMASKED_VENDOR_WEBGL
                        return 'Google Inc. (Intel)';
                    }

                    // Add very subtle randomization to array-valued parameters
                    if ([2982, 2983, 35978].includes(parameter)) {
                        const nativeValue = nativeGetParameter.call(this, parameter);
                        if (nativeValue && nativeValue.length) {
                            const jittered = new Float32Array(nativeValue);
                            // Extremely small noise that won't affect rendering
                            for (let i = 0; i < jittered.length; i++) {
                                if (Math.random() < 0.1) {
                                    jittered[i] += (Math.random() * 2 - 1) * 0.0000001;
                                }
                            }
                            return jittered;
                        }
                    }

                    return nativeGetParameter.call(this, parameter);
                };
            };

            // Patch both WebGL generations where the browser provides them
            if (typeof WebGLRenderingContext !== 'undefined') {
                patchGetParameter(WebGLRenderingContext.prototype);
            }
            if (typeof WebGL2RenderingContext !== 'undefined') {
                patchGetParameter(WebGL2RenderingContext.prototype);
            }
        });
    }

    // 3. Audio fingerprinting protection
    if (audioNoise) {
        await page.evaluateOnNewDocument(() => {
            const nativeGetChannelData = AudioBuffer.prototype.getChannelData;

            AudioBuffer.prototype.getChannelData = function (channel) {
                const nativeData = nativeGetChannelData.call(this, channel);

                // Clone to avoid modifying original audio
                const audioData = new Float32Array(nativeData);

                // Add subtle noise to a tiny fraction of the samples
                for (let i = 0; i < audioData.length; i++) {
                    if (Math.random() < 0.001) {
                        audioData[i] += (Math.random() * 2 - 1) * 0.0001;
                    }
                }

                return audioData;
            };
        });
    }

    // 4. Font consistency
    if (fontConsistency) {
        await page.evaluateOnNewDocument(() => {
            // List of common fonts to emulate consistent availability
            const commonFonts = [
                'Arial', 'Arial Black', 'Arial Narrow', 'Book Antiqua', 'Bookman Old Style',
                'Calibri', 'Cambria', 'Century Gothic', 'Comic Sans MS', 'Consolas',
                'Courier', 'Courier New', 'Georgia', 'Helvetica', 'Impact', 'Lucida Console',
                'Lucida Sans Unicode', 'Microsoft Sans Serif', 'Palatino Linotype', 'Segoe UI',
                'Tahoma', 'Times', 'Times New Roman', 'Trebuchet MS', 'Verdana',
            ];

            // Override font detection
            if (document.fonts && document.fonts.check) {
                const nativeCheck = document.fonts.check;
                document.fonts.check = function (...args) {
                    // Last space-separated token of the CSS font shorthand, unquoted
                    const fontFamily = String(args[0]).split(' ').pop().replace(/['",]/g, '');

                    if (commonFonts.includes(fontFamily)) {
                        return true;
                    }

                    return nativeCheck.apply(this, args);
                };
            }
        });
    }

    // 5. Session/localStorage fingerprinting protection
    if (hideTempStorage) {
        await page.evaluateOnNewDocument(() => {
            // Wrap a native storage object with an isolated in-memory store
            const createStorageProxy = (nativeStorage) => {
                const entries = Object.create(null);
                const api = {
                    key: (index) => {
                        const name = Object.keys(entries)[index];
                        return name === undefined ? null : name;
                    },
                    // `in` keeps empty-string values distinguishable from missing keys
                    getItem: (key) => (String(key) in entries ? entries[String(key)] : null),
                    setItem: (key, value) => { entries[String(key)] = String(value); },
                    removeItem: (key) => { delete entries[String(key)]; },
                    clear: () => { Object.keys(entries).forEach((name) => delete entries[name]); },
                };

                return new Proxy(nativeStorage, {
                    get: (target, prop) => {
                        if (prop === 'length') {
                            return Object.keys(entries).length;
                        }
                        if (Object.prototype.hasOwnProperty.call(api, prop)) {
                            return api[prop];
                        }
                        if (prop in entries) {
                            return entries[prop];
                        }
                        return target[prop];
                    },
                    set: (target, prop, value) => {
                        entries[prop] = String(value);
                        return true;
                    },
                });
            };

            const isolateStorage = (storageType) => {
                // Capture the native accessor BEFORE redefining the property;
                // reading window[storageType] afterwards would re-enter our getter.
                const nativeDescriptor = Object.getOwnPropertyDescriptor(window, storageType)
                    || Object.getOwnPropertyDescriptor(Object.getPrototypeOf(window) || {}, storageType);
                let proxy = null;

                Object.defineProperty(window, storageType, {
                    configurable: true,
                    get: () => {
                        // One proxy per storage type, created on first access, so
                        // values written through it can be read back later.
                        if (!proxy) {
                            let nativeStorage;
                            if (nativeDescriptor && nativeDescriptor.get) {
                                nativeStorage = nativeDescriptor.get.call(window);
                            } else if (nativeDescriptor) {
                                nativeStorage = nativeDescriptor.value;
                            }
                            const target = nativeStorage !== null && typeof nativeStorage === 'object' ? nativeStorage : {};
                            proxy = createStorageProxy(target);
                        }
                        return proxy;
                    },
                });
            };

            // Apply storage proxies
            isolateStorage('localStorage');
            isolateStorage('sessionStorage');
        });
    }

    // 6. Consistent timezone emulation
    if (consistentTimezone) {
        await page.evaluateOnNewDocument((zone) => {
            // Minutes EAST of UTC (standard time; daylight saving is not modelled)
            const utcOffsetMinutes = {
                'America/New_York': -5 * 60,
                'America/Los_Angeles': -8 * 60,
                'Europe/London': 0,
                'Europe/Berlin': 1 * 60,
                'Asia/Tokyo': 9 * 60,
            }[zone] || 0;

            // Default every formatter to the emulated zone unless the caller
            // picked one; this must also cover calls made without options.
            const NativeDateTimeFormat = Intl.DateTimeFormat;
            function SpoofedDateTimeFormat(locales, formatOptions) {
                const merged = formatOptions && formatOptions.timeZone !== undefined
                    ? formatOptions
                    : { ...formatOptions, timeZone: zone };
                return new NativeDateTimeFormat(locales, merged);
            }
            // Keep instanceof checks and static helpers working
            SpoofedDateTimeFormat.prototype = NativeDateTimeFormat.prototype;
            SpoofedDateTimeFormat.supportedLocalesOf = NativeDateTimeFormat.supportedLocalesOf.bind(NativeDateTimeFormat);
            Intl.DateTimeFormat = SpoofedDateTimeFormat;

            // getTimezoneOffset() is UTC minus local time, i.e. the negated
            // offset: UTC-5 must report +300.
            Date.prototype.getTimezoneOffset = function () {
                return 0 - utcOffsetMinutes;
            };
        }, timezone);
    }

    // 7. Fix client rects fingerprinting
    await page.evaluateOnNewDocument(() => {
        // Add tiny variations to element dimensions (DOMRect objects)
        const variateRect = (rect) => {
            if (!rect || typeof rect !== 'object') return rect;

            // Very small variations (< 0.05 px) that shouldn't affect layout
            const variation = () => (Math.random() * 0.1) - 0.05;

            return {
                top: rect.top + variation(),
                right: rect.right + variation(),
                bottom: rect.bottom + variation(),
                left: rect.left + variation(),
                width: rect.width,
                height: rect.height,
                x: rect.x + variation(),
                y: rect.y + variation(),
            };
        };

        // Override getClientRects
        const nativeGetClientRects = Element.prototype.getClientRects;
        Element.prototype.getClientRects = function () {
            const nativeRects = nativeGetClientRects.call(this);

            // Create array-like object with modified values
            const modifiedRects = {};
            for (let i = 0; i < nativeRects.length; i++) {
                modifiedRects[i] = variateRect(nativeRects[i]);
            }
            modifiedRects.length = nativeRects.length;

            return modifiedRects;
        };

        // Override getBoundingClientRect
        const nativeGetBoundingClientRect = Element.prototype.getBoundingClientRect;
        Element.prototype.getBoundingClientRect = function () {
            return variateRect(nativeGetBoundingClientRect.call(this));
        };
    });

    // 8. Hardware concurrency and device memory spoofing
    await page.evaluateOnNewDocument(() => {
        // Make hardware details match common configurations
        Object.defineProperty(navigator, 'hardwareConcurrency', {
            get: () => 8,
        });

        if ('deviceMemory' in navigator) {
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => 8,
            });
        }
    });

    // 9. User agent consistency
    if (userAgent) {
        await page.setUserAgent(userAgent);
    }

    // 10. Navigator property consistency
    await page.evaluateOnNewDocument((device) => {
        // NOTE(review): this pins Safari-on-macOS values regardless of the
        // configured user agent - confirm that is intended for 'modern_desktop'.
        if (device === 'modern_desktop') {
            const nav = navigator;
            Object.defineProperty(nav, 'appVersion', { get: () => '5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15' });
            Object.defineProperty(nav, 'platform', { get: () => 'MacIntel' });
            Object.defineProperty(nav, 'userAgentData', { get: () => null }); // Safari doesn't support this
        }
    }, deviceProfile);

    console.log('Enhanced fingerprint protection applied successfully');
}
/**
 * Add TLS fingerprinting protection by modifying cipher suites.
 *
 * Picks the cipher-suite order of a browser profile, optionally swaps two
 * ciphers inside a small group so the overall browser pattern is preserved,
 * and publishes the colon-joined list in `process.env.NODE_TLS_CIPHER_SUITES`.
 *
 * NOTE(review): nothing in this block consumes NODE_TLS_CIPHER_SUITES; confirm
 * that some other component reads it, otherwise it has no effect on the
 * connections Node.js makes.
 *
 * @param {Object} [options] Configuration options
 * @param {boolean} [options.randomize=true] Allow small in-group order variations
 * @param {string} [options.profile='chrome_mac'] 'chrome_mac', 'safari' or 'firefox'; unknown names fall back to 'chrome_mac'
 * @returns {string[]} The cipher suites in the order that was applied
 */
export function applyTLSFingerprinting(options = {}) {
    const {
        randomize = true,
        profile = 'chrome_mac',
    } = options;

    // Common browser TLS fingerprints (cipher suite order)
    const tlsProfiles = {
        chrome_mac: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'TLS_CHACHA20_POLY1305_SHA256',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES128-GCM-SHA256',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES256-GCM-SHA384',
            'ECDHE-ECDSA-CHACHA20-POLY1305',
            'ECDHE-RSA-CHACHA20-POLY1305',
        ],
        safari: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'TLS_CHACHA20_POLY1305_SHA256',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES128-GCM-SHA256',
        ],
        firefox: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_CHACHA20_POLY1305_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES128-GCM-SHA256',
            'ECDHE-ECDSA-CHACHA20-POLY1305',
            'ECDHE-RSA-CHACHA20-POLY1305',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES256-GCM-SHA384',
        ],
    };

    // Get the selected profile's cipher suites (copied, so callers own the result)
    let cipherSuites = [...(tlsProfiles[profile] || tlsProfiles.chrome_mac)];

    // Optionally add very subtle randomization without changing the main pattern.
    // Ciphers only ever move within their own group, never across groups.
    if (randomize) {
        const groups = [
            [0, 1, 2], // First three ciphers as a group
            [3, 4], // Next two ciphers
            [5, 6], // Next two ciphers
            [7, 8], // Last two ciphers (if they exist)
        ];

        const randomizedSuites = [];

        for (const group of groups) {
            // Shorter profiles simply have no cipher at the trailing indexes
            const groupCiphers = group
                .map((index) => cipherSuites[index])
                .filter((cipher) => cipher !== undefined);

            // Small chance to swap two distinct positions within the group
            if (groupCiphers.length > 1 && Math.random() < 0.3) {
                const i = Math.floor(Math.random() * groupCiphers.length);
                const step = 1 + Math.floor(Math.random() * (groupCiphers.length - 1));
                const j = (i + step) % groupCiphers.length;
                [groupCiphers[i], groupCiphers[j]] = [groupCiphers[j], groupCiphers[i]];
            }

            randomizedSuites.push(...groupCiphers);
        }

        cipherSuites = randomizedSuites;
    }

    // Publish the list through the environment
    process.env.NODE_TLS_CIPHER_SUITES = cipherSuites.join(':');

    return cipherSuites;
}
/**
 * Generate consistent HTTP headers in browser-like patterns.
 *
 * @param {Object} [options] Configuration options
 * @param {string} [options.userAgent] Value for User-Agent; the header is omitted when undefined
 * @param {string} [options.locale='en-US'] Primary Accept-Language entry
 * @param {string} [options.browser='safari'] 'safari' drops the Sec-Fetch-* headers, 'firefox' adds TE; anything else keeps the base set
 * @param {boolean} [options.randomizeOrder=true] Vary the position of User-Agent and Accept within the first three headers
 * @returns {Object} Header name to value map; key order is the intended send order
 */
export function generateConsistentHeaders(options = {}) {
    const {
        userAgent,
        locale = 'en-US',
        browser = 'safari',
        randomizeOrder = true,
    } = options;

    // Base headers common to most browsers
    const headers = {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': `${locale},en;q=0.9`,
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
    };

    // Browser-specific headers
    if (browser === 'safari') {
        headers.Accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
        // Safari typically has fewer sec-fetch headers
        for (const name of ['Sec-Fetch-Dest', 'Sec-Fetch-Mode', 'Sec-Fetch-Site', 'Sec-Fetch-User']) {
            delete headers[name];
        }
    } else if (browser === 'firefox') {
        headers.TE = 'trailers';
    }

    // Clean up undefined headers (e.g. no user agent supplied)
    for (const name of Object.keys(headers)) {
        if (headers[name] === undefined) {
            delete headers[name];
        }
    }

    if (!randomizeOrder) {
        return headers;
    }

    // Randomize order while maintaining common patterns
    const headerOrder = Object.keys(headers);

    // Pull a header out and re-insert it at a random index below `limit`
    const moveNearFront = (name, limit) => {
        const currentIndex = headerOrder.indexOf(name);
        if (currentIndex === -1) return;
        headerOrder.splice(currentIndex, 1);
        const insertPosition = Math.floor(Math.random() * Math.min(limit, headerOrder.length));
        headerOrder.splice(insertPosition, 0, name);
    };

    // Ensure user-agent and Accept are typically near the beginning
    moveNearFront('User-Agent', 2);
    moveNearFront('Accept', 3);

    // Create new headers object with randomized order
    const orderedHeaders = {};
    for (const name of headerOrder) {
        orderedHeaders[name] = headers[name];
    }

    return orderedHeaders;
}
/**
 * Add HTTP header variations and fingerprint resistance.
 *
 * Builds a header set with `generateConsistentHeaders(options)` and installs it
 * on the page as extra HTTP headers.
 *
 * @param {Object} page Puppeteer page
 * @param {Object} [options] Configuration options, forwarded unchanged to generateConsistentHeaders
 * @returns {Promise<Object>} The headers that were applied
 */
export async function applyHeaderFingerprinting(page, options = {}) {
    // Generate consistent headers
    const headers = generateConsistentHeaders(options);

    // Set the headers on the page
    await page.setExtraHTTPHeaders(headers);

    return headers;
}
src/fingerprint-integration.js
/**
 * Fingerprint Integration Module
 * Connects advanced fingerprinting techniques with the existing crawler system
 */
6import { applyAdvancedFingerprinting, applyTLSFingerprinting, applyHeaderFingerprinting } from './fingerprint-enhancement.js';7import randomUseragent from 'random-useragent';8
/**
 * Configures and applies all fingerprinting protections to a browser page.
 *
 * Applies, in order: the TLS cipher profile, the in-page fingerprint
 * protections and the HTTP header set. When no user agent is supplied, a
 * random one from the same browser family as `browserProfile` is preferred so
 * that the user agent does not contradict the TLS and header profiles.
 *
 * Only the options listed below are read; any other keys are ignored.
 *
 * @param {Page} page - Puppeteer page instance
 * @param {Object} [options] - Configuration options
 * @param {string} [options.userAgent] - User agent to use; random when omitted
 * @param {string} [options.browserProfile='safari'] - 'safari', 'firefox', or anything else for the Chrome-like defaults
 * @param {boolean} [options.randomizeFingerprints=true] - Allow small random variations in TLS and header order
 * @param {string} [options.timezone='America/New_York'] - IANA timezone to emulate
 * @param {string} [options.locale='en-US'] - Locale used for Accept-Language
 * @returns {Promise<Object>} Applied fingerprint settings (userAgent, browserProfile, timezone, locale, tlsCiphers, headers)
 */
export async function configureFingerprinting(page, options = {}) {
    console.log('Setting up advanced fingerprinting protection...');

    const {
        browserProfile = 'safari',
        randomizeFingerprints = true,
        timezone = 'America/New_York',
        locale = 'en-US',
    } = options;

    // Prefer a user agent whose browser family matches the profile; fall back
    // to any random one if the filter finds nothing.
    const familyByProfile = { safari: 'Safari', firefox: 'Firefox', chrome: 'Chrome' };
    const family = familyByProfile[browserProfile];
    const userAgent = options.userAgent
        || (family && randomUseragent.getRandom((ua) => ua.browserName === family))
        || randomUseragent.getRandom();

    // Track applied configurations
    const appliedConfig = {
        userAgent,
        browserProfile,
        timezone,
        locale,
    };

    const isSafari = browserProfile === 'safari';
    const isFirefox = browserProfile === 'firefox';

    // 1. TLS fingerprinting (affects network requests)
    let tlsProfile = 'chrome_mac';
    if (isSafari) {
        tlsProfile = 'safari';
    } else if (isFirefox) {
        tlsProfile = 'firefox';
    }
    appliedConfig.tlsCiphers = applyTLSFingerprinting({
        randomize: randomizeFingerprints,
        profile: tlsProfile,
    });

    // 2. Apply browser fingerprinting protections
    let deviceProfile = 'modern_desktop';
    if (isSafari) {
        deviceProfile = 'safari';
    } else if (isFirefox) {
        deviceProfile = 'firefox';
    }
    await applyAdvancedFingerprinting(page, {
        deviceProfile,
        webglNoise: true,
        audioNoise: true,
        fontConsistency: true,
        hideTempStorage: true,
        consistentTimezone: true,
        userAgent,
        timezone,
    });

    // 3. Apply header fingerprinting
    appliedConfig.headers = await applyHeaderFingerprinting(page, {
        userAgent,
        locale,
        browser: browserProfile,
        randomizeOrder: randomizeFingerprints,
    });

    console.log(`Fingerprinting protection configured for profile: ${browserProfile}`);

    return appliedConfig;
}
/**
 * Creates a fingerprinting strategy based on browser rotation.
 *
 * Each call of the returned function uses the next browser profile in
 * round-robin order together with a random timezone and the locale that
 * belongs to that timezone.
 *
 * @param {Array} [profiles] - List of browser profiles to rotate through; an empty list falls back to the default rotation
 * @returns {Function} Async strategy function `(page) => appliedConfig` that configures fingerprinting for a page
 */
export function createFingerprintRotationStrategy(profiles = ['chrome', 'safari', 'firefox']) {
    // An empty rotation would produce `undefined` profiles and a NaN index.
    const rotation = Array.isArray(profiles) && profiles.length > 0
        ? profiles
        : ['chrome', 'safari', 'firefox'];
    let currentProfileIndex = 0;

    // Each timezone is paired with the locale a real user there would most
    // plausibly have, so the two values never contradict each other.
    const localeByTimezone = {
        'America/New_York': 'en-US',
        'America/Chicago': 'en-US',
        'America/Denver': 'en-US',
        'America/Los_Angeles': 'en-US',
        'Europe/London': 'en-GB',
        'Europe/Berlin': 'de-DE',
        'Europe/Paris': 'fr-FR',
        'Asia/Tokyo': 'ja-JP',
        'Australia/Sydney': 'en-AU',
    };
    const timezones = Object.keys(localeByTimezone);

    return async function applyRotatingFingerprint(page) {
        // Get next profile in rotation
        const profileName = rotation[currentProfileIndex];
        currentProfileIndex = (currentProfileIndex + 1) % rotation.length;

        // Select a timezone and the locale that naturally goes with it
        const timezone = timezones[Math.floor(Math.random() * timezones.length)];
        const locale = localeByTimezone[timezone];

        // Apply the fingerprinting
        return configureFingerprinting(page, {
            browserProfile: profileName,
            randomizeFingerprints: true,
            timezone,
            locale,
        });
    };
}
/**
 * Creates consistent fingerprints for a session.
 *
 * The session ID is reduced to a numeric seed (the sum of its character
 * codes), and every setting is derived from that seed, so the same ID always
 * yields the same fingerprint. IDs made of the same characters in a different
 * order share a seed and therefore a fingerprint.
 *
 * @param {string} sessionId - Session identifier
 * @returns {Object} Consistent fingerprint settings for this session:
 *   `{ browserProfile, timezone, locale, fingerprint: { screenWidth, screenHeight, colorDepth, deviceMemory, hardwareConcurrency } }`
 */
export function generateConsistentFingerprint(sessionId) {
    // Use session ID as seed for deterministic but unique fingerprints
    const idText = sessionId == null ? '' : String(sessionId);
    let seed = 0;
    for (let i = 0; i < idText.length; i++) {
        seed += idText.charCodeAt(i);
    }

    // Browser profiles to choose from
    const browserProfiles = ['safari', 'chrome', 'firefox'];

    // Common timezones
    const timezones = [
        'America/New_York', 'America/Chicago', 'America/Los_Angeles',
        'Europe/London', 'Europe/Paris', 'Europe/Berlin',
        'Asia/Tokyo', 'Australia/Sydney',
    ];

    // Common locales
    const locales = ['en-US', 'en-GB', 'en-CA', 'de-DE', 'fr-FR', 'ja-JP', 'en-AU'];

    // Deterministic pick; the offset decorrelates the individual choices
    const selectOption = (choices, offset = 0) => choices[(seed + offset) % choices.length];

    // Generate a consistent fingerprint configuration
    return {
        browserProfile: selectOption(browserProfiles),
        timezone: selectOption(timezones, 100),
        locale: selectOption(locales, 200),
        // Generate a consistent but small variation
        fingerprint: {
            screenWidth: 1920 + ((seed % 5) * 16), // Small variations in screen size
            screenHeight: 1080 + ((seed % 3) * 8),
            colorDepth: 24,
            deviceMemory: 8,
            hardwareConcurrency: 4 + (seed % 4) * 2, // 4, 6, 8, or 10 cores
        },
    };
}
/**
 * Apply session-consistent fingerprinting to page.
 *
 * Derives a deterministic fingerprint from the session ID and applies it with
 * random variations switched off.
 *
 * NOTE(review): `screen`, `hardwareConcurrency` and `deviceMemory` are passed
 * along here, but `configureFingerprinting` as defined in this module does not
 * read them - confirm whether they should be wired through.
 *
 * @param {Page} page - Puppeteer page
 * @param {string} sessionId - Session identifier
 * @returns {Promise<Object>} Applied configuration as returned by configureFingerprinting
 */
export async function applySessionFingerprinting(page, sessionId) {
    // Generate consistent fingerprint for this session
    const { browserProfile, timezone, locale, fingerprint } = generateConsistentFingerprint(sessionId);

    // Apply the fingerprinting with the consistent settings
    return configureFingerprinting(page, {
        browserProfile,
        timezone,
        locale,
        randomizeFingerprints: false,
        // Additional custom props
        screen: {
            width: fingerprint.screenWidth,
            height: fingerprint.screenHeight,
            colorDepth: fingerprint.colorDepth,
        },
        hardwareConcurrency: fingerprint.hardwareConcurrency,
        deviceMemory: fingerprint.deviceMemory,
    });
}
src/main.js
1import { Actor } from 'apify';2import { PuppeteerCrawler, KeyValueStore, Dataset } from 'crawlee';3import puppeteer from 'puppeteer-extra';4import StealthPlugin from 'puppeteer-extra-plugin-stealth';5import randomUseragent from 'random-useragent';6import Captcha from '2captcha';7import { fileURLToPath } from 'url';8import path from 'path';9import fs from 'fs';10import { handleRequest } from './routes.js';11import cheerio from 'cheerio';12import cloudscraper from './utils/cloudscraper-replacement.js';13import { HttpsProxyAgent } from 'https-proxy-agent';14
15// Import our new challenge bypass integrator16import {17 createEnhancedSession,18 enhancedPageNavigation,19 cleanupSession20} from './upwork-challenge-integrator.js';21
22// Import fingerprint enhancer23import {24 applySessionFingerprinting,25 createFingerprintRotationStrategy26} from './fingerprint-integration.js';27
28// Additional puppeteer-extra plugins29import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';30import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';31import UserPreferencesPlugin from 'puppeteer-extra-plugin-user-preferences';32import UserDataDirPlugin from 'puppeteer-extra-plugin-user-data-dir';33
// Create a fingerprint rotation strategy for the entire application
const fingerprintRotator = createFingerprintRotationStrategy(['chrome', 'safari', 'firefox']);

// Store fingerprint configurations by session
const sessionFingerprints = {};

// Directory for storing browser profiles
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PROFILES_DIR = path.join(__dirname, '..', 'browser_profiles');

// Ensure profiles directory exists. A recursive mkdir succeeds when the
// directory is already there, so no separate existence check is needed.
fs.mkdirSync(PROFILES_DIR, { recursive: true });

// Configure stealth plugin with advanced options
// NOTE(review): confirm that StealthPlugin honours these option names.
const stealth = StealthPlugin({
    webglVendor: "Google Inc. (Intel)",
    webglRenderer: "Intel Iris OpenGL Engine",
    navigator: {
        platform: "MacIntel",
        languages: ["en-US", "en"]
    }
});

// Add all plugins to puppeteer
puppeteer.use(stealth);

// Add recaptcha plugin with 2captcha support (will be configured later if API key is provided)
puppeteer.use(RecaptchaPlugin());

// Add adblocker to reduce detection via ads and trackers
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
/**
 * CloudScraper request function (Promise-based wrapper).
 *
 * Adapts the callback-style `cloudscraper(options, callback)` API to a Promise.
 *
 * @param {Object} options Request options, passed to cloudscraper unchanged
 * @returns {Promise<{response: Object, body: *}>} Resolves with the response and body; rejects with the error reported by cloudscraper
 */
const cloudScraperRequest = (options) => new Promise((resolve, reject) => {
    cloudscraper(options, (error, response, body) => {
        if (error) {
            reject(error);
            return;
        }
        resolve({ response, body });
    });
});
/**
 * HTML parser fallback for when browser methods fail.
 *
 * Regex-based extraction: every `<h2>` that directly wraps a link is treated
 * as one job. The markup between that heading and the next job heading (or the
 * end of the document) is the job's own section, and all further fields are
 * searched only inside it, so each job gets its own values. Fields that cannot
 * be found stay 'N/A'.
 *
 * @param {string} html Raw HTML of a search result page
 * @returns {Array<Object>} Extracted jobs; empty when nothing matches or parsing fails
 */
export const parseJobsFromHTML = (html) => {
    console.log('Using direct HTML parsing fallback method');

    try {
        const source = html == null ? '' : String(html);

        // Extract job titles and links
        const titleRegex = /<h2[^>]*>[^<]*<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/gi;
        const titleMatches = [...source.matchAll(titleRegex)];

        const jobs = titleMatches.map((titleMatch, position) => {
            const [, href, rawTitle] = titleMatch;

            // This job's markup runs up to the start of the next job heading
            const sectionEnd = position + 1 < titleMatches.length
                ? titleMatches[position + 1].index
                : source.length;
            const section = source.slice(titleMatch.index, sectionEnd);

            // Create a job object
            const job = {
                title: rawTitle.trim(),
                jobLink: href.startsWith('http') ? href : `https://www.upwork.com${href}`,
                postedDate: 'N/A',
                jobType: 'N/A',
                experienceLevel: 'N/A',
                duration: 'N/A',
                description: 'N/A',
                budget: 'N/A',
                skills: 'N/A',
                _extractedBy: 'html-fallback',
            };

            // Find job description - the first paragraph after the title link
            const descMatch = section.match(/<\/a>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i);
            if (descMatch) {
                job.description = descMatch[1].trim().replace(/<[^>]*>/g, '');
            }

            // Try to find posted date
            const postedMatch = section.match(/posted\s+([^<>]+?)(?:ago|on)/i);
            if (postedMatch) {
                job.postedDate = postedMatch[1].trim();
            }

            // Try to find budget
            const budgetMatch = section.match(/\$([0-9,.]+)\s*-\s*\$([0-9,.]+)|Fixed\s*Price:\s*\$([0-9,.]+)|Budget:\s*\$([0-9,.]+)/i);
            if (budgetMatch) {
                if (budgetMatch[1] && budgetMatch[2]) {
                    job.budget = `$${budgetMatch[1]}-$${budgetMatch[2]}`;
                } else if (budgetMatch[3]) {
                    job.budget = `$${budgetMatch[3]}`;
                } else if (budgetMatch[4]) {
                    job.budget = `$${budgetMatch[4]}`;
                }
            }

            // Try to find job type
            const jobTypeMatch = section.match(/Hourly|Fixed Price/i);
            if (jobTypeMatch) {
                job.jobType = jobTypeMatch[0].trim();
            }

            // Try to find experience level
            const expLevelMatch = section.match(/Entry Level|Intermediate|Expert/i);
            if (expLevelMatch) {
                job.experienceLevel = expLevelMatch[0].trim();
            }

            // Try to find duration
            const durationMatch = section.match(/More than 6 months|3 to 6 months|1 to 3 months|Less than 1 month/i);
            if (durationMatch) {
                job.duration = durationMatch[0].trim();
            }

            // Try to find skills
            const skills = [...section.matchAll(/<span[^>]*data-test="skill"[^>]*>([^<]+)<\/span>/gi)]
                .map((skillMatch) => skillMatch[1].trim());
            if (skills.length > 0) {
                job.skills = skills.join(', ');
            }

            return job;
        });

        console.log(`Extracted ${jobs.length} jobs using HTML fallback parser`);
        return jobs;
    } catch (error) {
        console.error('Error parsing HTML directly:', error);
        return [];
    }
};
/**
 * Browser fingerprint profiles (timezone, locale, geolocation, platform,
 * hardware hints and user agent). Three US profiles come first, followed by
 * two European ones.
 *
 * NOTE(review): one profile advertises "Windows NT 11.0" in its user agent —
 * confirm that token matches what real browsers send.
 */
const browserProfiles = (() => {
    const CHROME_123_WINDOWS_10 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36';

    // Builds one profile; key order matches the hand-written literals it replaces.
    const describeProfile = (
        timezone,
        locale,
        [latitude, longitude],
        platform,
        platformVersion,
        deviceMemory,
        hardwareConcurrency,
        userAgent,
    ) => ({
        timezone,
        locale,
        geolocation: { latitude, longitude, accuracy: 100 },
        platform,
        platformVersion,
        deviceMemory,
        hardwareConcurrency,
        userAgent,
    });

    return [
        // US profiles: New York, Los Angeles, Chicago
        describeProfile('America/New_York', 'en-US', [40.7128, -74.0060], 'Win32', '10.0', 8, 8, CHROME_123_WINDOWS_10),
        describeProfile(
            'America/Los_Angeles', 'en-US', [34.0522, -118.2437], 'MacIntel', '10.15.7', 16, 12,
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        ),
        describeProfile(
            'America/Chicago', 'en-US', [41.8781, -87.6298], 'Win32', '11.0', 16, 8,
            'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        ),
        // International profiles: London, Berlin
        describeProfile(
            'Europe/London', 'en-GB', [51.5074, -0.1278], 'MacIntel', '14.1', 8, 10,
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        ),
        describeProfile('Europe/Berlin', 'de-DE', [52.5200, 13.4050], 'Win32', '10.0', 16, 6, CHROME_123_WINDOWS_10),
    ];
})();
/**
 * Bookkeeping container for proxy usage. Only the constructor is visible in
 * this excerpt; the remaining methods are elided below.
 */
class ProxyManager {
    /**
     * @param {object} config - Stored as-is on `this.config`; its
     *     `maxConcurrency` and `maxRetries` values are also copied onto the instance.
     */
    constructor(config) {
        Object.assign(this, {
            config,
            proxyStats: new Map(),
            activeProxies: new Set(),
            failedProxies: new Set(),
            badProxies: new Set(), // proxies that failed with 403
            lastRotation: Date.now(),
            rotationInterval: 30 * 60 * 1000, // 30 minutes, in milliseconds
            maxConcurrency: config.maxConcurrency,
            maxRetries: config.maxRetries,
            concurrentRequests: new Map(),
            initialized: false,
        });
    }
    // ... rest of ProxyManager methods ...
}
256import { crawlUpworkJobs } from './enhanced-crawler.js';257
258async function main() {259 await Actor.init();260
261 const input = await Actor.getInput() || {};262 const {263 searchQuery = 'shopify',264 page = 1,265 useApifyProxy = false,266 maxConcurrency = 1,267 maxRetries = 3,268 upworkUsername = '',269 upworkPassword = '',270 captchaApiKey = '', // 2captcha API key for solving CAPTCHAs271 sessionRotationCount = 3, // Number of sessions to rotate through272 minDelayBetweenRequests = 15, // Minimum delay in seconds between requests273 maxDelayBetweenRequests = 45, // Maximum delay in seconds between requests274 sessionCooldownMinutes = 30, // Cooldown period for each session in minutes275 proxyRotationEnabled = true, // Whether to rotate proxies during session276 proxyRotationRequests = 3, // Number of requests before rotating proxy277 simulateBrowserHistory = true, // Whether to simulate browser history278 randomizeTimezone = true, // Whether to randomize timezone per session279 useNewHeadless = true, // Whether to use Chrome's new headless mode280 useRealProfiles = true, // Whether to use real browser profiles281 useCloudScraper = true, // Whether to use CloudScraper for Cloudflare bypass282 disableWebSecurity = false, // Whether to disable web security283 bypassCSP = false, // Whether to bypass Content Security Policy284 simulateExtensions = true, // Whether to simulate browser extensions285 useFallbackHtmlParser = true, // Whether to use direct HTML parsing as fallback286 } = input;287
288 let shouldUseApifyProxy = useApifyProxy;289
290 // Configure RecaptchaPlugin if API key is provided291 if (captchaApiKey) {292 const recaptchaPlugin = RecaptchaPlugin({293 provider: {294 id: '2captcha',295 token: captchaApiKey,296 },297 visualFeedback: true,298 });299 puppeteer.use(recaptchaPlugin);300 }301
302 // Set up the CAPTCHA solver if API key is provided303 const solver = captchaApiKey ? new Captcha.Solver(captchaApiKey) : null;304
305 // Initialize sessions storage306 const kvStore = await KeyValueStore.open();307 let sessions = await kvStore.getValue('UPWORK_SESSIONS') || [];308
309 // After browserProfiles, add common extension simulation data310 const commonExtensions = [311 {312 name: "AdBlock",313 description: "Block ads and popups",314 version: "5.13.0",315 enabled: true316 },317 {318 name: "Honey",319 description: "Automatic coupon finder",320 version: "12.8.2",321 enabled: true322 },323 {324 name: "Grammarly",325 description: "Grammar checking",326 version: "14.1033.0",327 enabled: true328 },329 {330 name: "LastPass",331 description: "Password manager",332 version: "4.110.0",333 enabled: true334 },335 {336 name: "Dark Reader",337 description: "Dark mode for websites",338 version: "4.9.67",339 enabled: false340 }341 ];342
/**
 * Builds a simulated extension list for a profile: 2-5 entries drawn from
 * `commonExtensions` in randomised order, each with a freshly drawn `enabled`
 * flag (true with roughly 80% probability) that replaces the catalogue value.
 *
 * @returns {Array<object>} Copies of the selected catalogue entries.
 */
const generateRandomExtensions = () => {
    const howMany = Math.floor(Math.random() * 4) + 2;

    const randomized = Array.from(commonExtensions);
    randomized.sort(() => 0.5 - Math.random());

    return randomized.slice(0, howMany).map((extension) => {
        const enabled = Math.random() > 0.2;
        return { ...extension, enabled };
    });
};
// Top up the session pool to `sessionRotationCount` entries.
// Sessions already loaded from the key-value store are kept (they carry
// cookies, usage counters and cooldowns); only the missing ones are created,
// which is also what the log line below announces.
if (!Array.isArray(sessions)) {
    // A stored value of an unexpected shape is discarded rather than trusted.
    sessions = [];
}
if (sessions.length < sessionRotationCount) {
    const missingCount = sessionRotationCount - sessions.length;
    console.log(`Creating ${missingCount} new sessions for rotation`);

    const firstNewSlot = sessions.length;
    const createdSessions = Array.from({ length: missingCount }, (_, offset) => {
        const slot = firstNewSlot + offset;
        // One timestamp per session so the id and the profile directory agree.
        const stamp = Date.now();

        // Each session gets a randomly chosen browser profile.
        const profile = browserProfiles[Math.floor(Math.random() * browserProfiles.length)];

        return {
            id: `session_${stamp}_${slot}`,
            cookies: null,
            lastUsed: null,
            usageCount: 0,
            cooldownUntil: null,
            requestsSinceProxyRotation: 0,
            browserProfile: profile,
            // Dedicated user data directory, only when real profiles are enabled.
            userDataDir: useRealProfiles ? path.join(PROFILES_DIR, `profile_${stamp}_${slot}`) : null,
            // Simulated extension list, empty when the feature is disabled.
            extensions: simulateExtensions ? generateRandomExtensions() : [],
            // Pre-generated history entries, null when the feature is disabled.
            browserHistory: simulateBrowserHistory ? generateFakeBrowsingHistory() : null,
        };
    });

    sessions = [...sessions, ...createdSessions];
    await kvStore.setValue('UPWORK_SESSIONS', sessions);
}
/**
 * Picks the session to use for the next request.
 *
 * Sessions whose `cooldownUntil` lies in the future are skipped. Among the
 * remaining ones the least recently used wins, and a session that was never
 * used beats any used one (ties keep array order). When every session is
 * cooling down, the one whose cooldown ends first is returned, as the log
 * message states.
 *
 * @returns {object|undefined} The chosen session, or undefined when the
 *     session list is empty.
 */
const getNextAvailableSession = () => {
    const now = Date.now();

    // Missing cooldown counts as "ended at the epoch".
    const cooldownEnd = (session) => (session.cooldownUntil ? new Date(session.cooldownUntil).getTime() : 0);
    // Missing lastUsed counts as "used infinitely long ago".
    const lastUsedAt = (session) => (session.lastUsed ? new Date(session.lastUsed).getTime() : -Infinity);
    // Strict "<" keeps the earlier array element on ties.
    const pickLowest = (candidates, score) => candidates
        .reduce((best, candidate) => (score(candidate) < score(best) ? candidate : best));

    const available = sessions.filter((session) => !(cooldownEnd(session) > now));
    if (available.length > 0) {
        return pickLowest(available, lastUsedAt);
    }

    console.log('All sessions are in cooldown. Using the session with earliest cooldown end.');
    if (sessions.length === 0) {
        return undefined;
    }
    return pickLowest(sessions, cooldownEnd);
};
/**
 * Records one use of a session and persists the session list.
 *
 * Sets `lastUsed` to the current time and increments `usageCount`. Once the
 * counter reaches 3 the session is put into cooldown for
 * `sessionCooldownMinutes` minutes and the counter is reset to 0.
 * Unknown session ids are ignored and nothing is persisted for them.
 *
 * @param {string} sessionId - Id of the session that was just used.
 * @returns {Promise<void>}
 */
const updateSessionAfterUse = async (sessionId) => {
    const record = sessions.find((candidate) => candidate.id === sessionId);
    if (record === undefined) {
        return;
    }

    record.lastUsed = new Date().toISOString();
    record.usageCount++;

    const needsCooldown = record.usageCount >= 3;
    if (needsCooldown) {
        const cooldownEnd = new Date();
        cooldownEnd.setMinutes(cooldownEnd.getMinutes() + sessionCooldownMinutes);
        const cooldownIso = cooldownEnd.toISOString();

        record.cooldownUntil = cooldownIso;
        record.usageCount = 0;
        console.log(`Session ${sessionId} placed in cooldown until ${cooldownIso}`);
    }

    await kvStore.setValue('UPWORK_SESSIONS', sessions);
};
/**
 * Generates simulated browsing-history entries for a session profile.
 *
 * Produces 5-14 entries. Each entry points at a randomly chosen site from a
 * fixed list, carries a timestamp from the last 7 days and a title of the form
 * "<Site> - Home", where <Site> is the capitalised domain label directly in
 * front of the top-level domain (so both "www.google.com" and "twitter.com"
 * resolve to their site name).
 *
 * @returns {Array<{url: string, timestamp: string, title: string}>}
 */
function generateFakeBrowsingHistory() {
    const popularSites = [
        'https://www.google.com',
        'https://www.youtube.com',
        'https://www.facebook.com',
        'https://www.amazon.com',
        'https://www.wikipedia.org',
        'https://www.reddit.com',
        'https://twitter.com',
        'https://www.instagram.com',
        'https://www.linkedin.com',
        'https://www.netflix.com',
        'https://www.twitch.tv',
        'https://www.github.com',
        'https://news.ycombinator.com',
        'https://medium.com',
        'https://www.nytimes.com',
        'https://www.cnn.com',
        'https://www.bbc.com',
        'https://www.espn.com',
    ];

    // floor(random * 10) + 5 yields 5..14 entries.
    const entryCount = Math.floor(Math.random() * 10) + 5;

    return Array.from({ length: entryCount }, () => {
        const site = popularSites[Math.floor(Math.random() * popularSites.length)];

        // Go back up to 6 days, 23 hours and 59 minutes from now.
        const timestamp = new Date();
        timestamp.setDate(timestamp.getDate() - Math.floor(Math.random() * 7));
        timestamp.setHours(timestamp.getHours() - Math.floor(Math.random() * 24));
        timestamp.setMinutes(timestamp.getMinutes() - Math.floor(Math.random() * 60));

        // Site name = label before the TLD, independent of a "www."/"news." prefix.
        const hostLabels = new URL(site).hostname.split('.');
        const siteName = hostLabels[hostLabels.length - 2];

        return {
            url: site,
            timestamp: timestamp.toISOString(),
            title: `${siteName.charAt(0).toUpperCase()}${siteName.slice(1)} - Home`,
        };
    });
}
/**
 * Installs an init script that makes a fresh page look like it has history
 * and stored preferences.
 *
 * The script runs on every new document and
 *  - reports `history.length` as the number of supplied history entries, and
 *  - layers an in-memory cache over the `Storage.prototype` methods, pre-filled
 *    with a few preference keys. Because the prototype is patched, the overlay
 *    applies to every Storage object on the page.
 *
 * @param {object} page - Puppeteer page; only `evaluateOnNewDocument` is used.
 * @param {Array|null} history - History entries; nothing is installed when falsy.
 * @returns {Promise<void>}
 */
async function simulateBrowserCacheAndHistory(page, history) {
    if (!history) return;

    await page.evaluateOnNewDocument((historyEntries) => {
        // Simulate history length
        Object.defineProperty(window.history, 'length', {
            get() {
                return historyEntries.length;
            },
        });

        const originalGetItem = Storage.prototype.getItem;
        const originalSetItem = Storage.prototype.setItem;
        const originalRemoveItem = Storage.prototype.removeItem;

        // Prototype-less object: lookups such as getItem('toString') must not
        // resolve to members inherited from Object.prototype.
        const storageCache = Object.create(null);

        // Pre-filled preference entries
        storageCache.theme = 'light';
        storageCache.session_visited = 'true';
        storageCache.gdpr_accepted = 'true';
        storageCache.last_visit = new Date(Date.now() - 86400000).toISOString();

        Storage.prototype.getItem = function getItem(key) {
            if (key in storageCache) {
                return storageCache[key];
            }
            return originalGetItem.call(this, key);
        };

        Storage.prototype.setItem = function setItem(key, value) {
            // Stored as a string so a later getItem returns a string for cached keys too.
            storageCache[key] = String(value);
            return originalSetItem.call(this, key, value);
        };

        Storage.prototype.removeItem = function removeItem(key) {
            delete storageCache[key];
            return originalRemoveItem.call(this, key);
        };
    }, history);
}
/**
 * Heuristically detects a Cloudflare challenge page.
 *
 * The check runs inside the page and reports true when the title mentions
 * "Cloudflare" or "Attention Required", when a `div` with a class containing
 * "cf-" or an element with id `challenge-form` exists, or when the body text
 * contains "Verifying...". A document without a body is treated as having
 * empty body text instead of raising.
 *
 * @param {object} page - Puppeteer page; only `evaluate` is used.
 * @returns {Promise<boolean>}
 */
async function isCloudflareChallenge(page) {
    return page.evaluate(() => {
        const title = document.title || '';
        const bodyText = (document.body && document.body.innerText) || '';

        return (
            title.includes('Cloudflare')
            || title.includes('Attention Required')
            || document.querySelector('div[class*="cf-"]') !== null
            || document.querySelector('#challenge-form') !== null
            || bodyText.includes('Verifying...')
        );
    });
}
// Build the Upwork search URL; the page parameter is only appended for pages other than 1.
const baseUrl = `https://www.upwork.com/nx/search/jobs/?q=${encodeURIComponent(searchQuery)}`;
const startUrl = page === 1 ? baseUrl : `${baseUrl}&page=${page}`;

// Proxy objects stay null unless the Apify proxy is requested and can be configured.
let proxyCfg = null;
let proxyManager = null;
let sessionProxyUrl = null;
if (shouldUseApifyProxy) {
    try {
        proxyCfg = await Actor.createProxyConfiguration({
            groups: ['RESIDENTIAL'],
            countryCode: 'US',
            password: process.env.APIFY_PROXY_PASSWORD,
        });
        proxyManager = new ProxyManager({ ...proxyCfg, maxConcurrency, maxRetries });

        // Only the host of the generated URL is reused; the credentials part is rebuilt.
        const proxyInfo = await proxyCfg.newUrl();
        const urlObj = new URL(proxyInfo);
        const proxyUsername = 'groups-RESIDENTIAL,country-US';
        sessionProxyUrl = `http://${proxyUsername}:${process.env.APIFY_PROXY_PASSWORD}@${urlObj.host}`;

        // The password is masked: run logs must not contain proxy credentials.
        console.log(`Using proxy: http://${proxyUsername}:***@${urlObj.host}`);
    } catch (error) {
        console.warn('Warning: Failed to initialize proxy configuration. Running without proxy:', error.message);
        // Continue without a proxy instead of failing the run.
        shouldUseApifyProxy = false;
    }
} else {
    console.log('Running without proxy as useApifyProxy is false');
}

// Prepares a browser session for the given session id and proxy URL.
// The body is not part of this excerpt.
async function setupBrowserSession(sessionId, proxyUrl) {
    // Implementation of setupBrowserSession function
}

// Set up the chosen session with the proxy URL (null when no proxy is used).
const activeSession = getNextAvailableSession();
const sessionCookies = await setupBrowserSession(activeSession.id, sessionProxyUrl);

// Single start request; the session id travels with it in userData.
const requestList = [
    {
        url: startUrl,
        userData: {
            pageCount: 0,
            sessionId: activeSession.id,
        },
    },
];
594 // Check if we should use the enhanced crawler595 if (input.useEnhancedCrawler) {596 console.log('Using enhanced crawler with advanced challenge bypass');597 598 // Run enhanced crawler599 await crawlUpworkJobs({600 searchQuery,601 maxPages: page || 1,602 input,603 proxyConfiguration: proxyCfg,604 browserProfiles,605 });606 607 // Exit after enhanced crawler completes608 await Actor.exit();609 } else {610 // Original crawler creation and execution611 const crawler = new PuppeteerCrawler({612 requestHandler: async ({ request, page, log, session, proxyInfo }) => {613 try {614 const response = await page.goto(request.url);615 // Cloudflare challenge detection616 if (await isCloudflareChallenge(page)) {617 console.log('Cloudflare challenge detected! Rotating proxy/session, saving screenshot and HTML...');618 const screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 50 });619 await Actor.setValue(`cloudflare_challenge_${Date.now()}.jpg`, screenshotBuffer, { contentType: 'image/jpeg' });620 const html = await page.content();621 await Actor.setValue(`cloudflare_challenge_${Date.now()}.html`, html);622 if (proxyManager && proxyInfo && proxyInfo.url) {623 proxyManager.markProxyAsBad(proxyInfo.url, 'Cloudflare challenge');624 }625 // Optionally, rotate session or proxy here626 await updateSessionAfterUse(session.id);627 throw new Error('Cloudflare challenge detected, rotating proxy/session.');628 }629 if (response.status() === 403) {630 console.log('Proxy banned by Upwork, marking as bad...');631 if (proxyManager) {632 proxyManager.markProxyAsBad(proxyInfo.url, '403 Upwork');633 }634 throw new Error('Proxy banned by Upwork');635 }636 // ... 
rest of the handler code ...637 } catch (error) {638 console.error(`Error in request handler for session ${session.id}:`, error);639 await updateSessionAfterUse(session.id);640 }641 },642 maxConcurrency: maxConcurrency,643 maxRequestRetries: maxRetries,644 requestHandlerTimeoutSecs: 300,645 navigationTimeoutSecs: 180,646 preNavigationHooks: [647 // Hook pour ajouter des délais aléatoires entre requêtes et configurer le navigateur648 async ({ page, request, session }) => {649 // Get session info650 const { sessionId } = request.userData;651 const sessionIndex = sessions.findIndex(s => s.id === sessionId);652 653 // Handle proxy rotation if enabled654 if (proxyRotationEnabled && proxyCfg && sessionIndex !== -1) {655 // Check if we need to rotate proxy656 sessions[sessionIndex].requestsSinceProxyRotation = 657 (sessions[sessionIndex].requestsSinceProxyRotation || 0) + 1;658 659 if (sessions[sessionIndex].requestsSinceProxyRotation >= proxyRotationRequests) {660 // Time to rotate proxy661 const newProxyUrl = await proxyCfg.newUrl();662 console.log(`Rotating proxy for session ${sessionId} to: ${newProxyUrl}`);663 664 // Apply the new proxy - this requires recreating the browser session665 // We'll add the proxy to request.userData so it can be used in the next request666 request.userData.rotateProxy = true;667 request.userData.newProxyUrl = newProxyUrl;668 669 // Reset counter670 sessions[sessionIndex].requestsSinceProxyRotation = 0;671 await kvStore.setValue('UPWORK_SESSIONS', sessions);672 }673 }674 675 // First try CloudScraper if enabled to prefetch content and bypass protections676 if (useCloudScraper) {677 try {678 console.log(`Attempting CloudScraper pre-fetch for ${request.url}...`);679 const csResult = await cloudScraperRequest({680 method: 'GET',681 url: request.url,682 headers: {683 'User-Agent': sessionIndex !== -1 && sessions[sessionIndex].browserProfile ? 
684 sessions[sessionIndex].browserProfile.userAgent : 685 getRandomUserAgent(),686 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',687 'Accept-Language': sessionIndex !== -1 && sessions[sessionIndex].browserProfile && sessions[sessionIndex].browserProfile.locale ? 688 `${sessions[sessionIndex].browserProfile.locale},en-US;q=0.9,en;q=0.8` : 689 'en-US,en;q=0.9',690 },691 resolveWithFullResponse: true,692 });693 694 if (csResult.response && csResult.response.statusCode === 200) {695 console.log('CloudScraper successfully pre-fetched the page');696 // Could potentially extract content here and parse it directly697 // if browser keeps getting blocked698 } else {699 console.log(`CloudScraper got status ${csResult.response?.statusCode || 'unknown'}`);700 }701 } catch (error) {702 console.error('CloudScraper pre-fetch error:', error.message);703 }704 }705 706 // Use a fresh user agent for each request707 const profile = sessionIndex !== -1 ? sessions[sessionIndex].browserProfile : null;708 const userAgent = profile ? 
profile.userAgent : getRandomUserAgent();709 710 console.log(`Using user agent: ${userAgent}`);711 await page.setUserAgent(userAgent);712 713 // Get session cookies714 if (sessionIndex !== -1 && sessions[sessionIndex].cookies) {715 await page.setCookie(...sessions[sessionIndex].cookies);716 console.log(`Set ${sessions[sessionIndex].cookies.length} cookies for session ${sessionId}`);717 }718 719 // Apply timezone from profile if available720 if (profile && randomizeTimezone) {721 await page.emulateTimezone(profile.timezone);722 console.log(`Using timezone: ${profile.timezone}`);723 724 // Set locale for this request725 if (profile.locale) {726 await page.evaluateOnNewDocument((locale) => {727 Object.defineProperty(navigator, 'language', {728 get: function() { return locale; }729 });730 Object.defineProperty(navigator, 'languages', {731 get: function() { return [locale, 'en-US']; }732 });733 }, profile.locale);734 735 console.log(`Using locale: ${profile.locale}`);736 }737 }738 739 // Simulate browser history and cache for this session740 if (simulateBrowserHistory && sessionIndex !== -1 && sessions[sessionIndex].browserHistory) {741 await simulateBrowserCacheAndHistory(page, sessions[sessionIndex].browserHistory);742 }743 744 // Apply advanced evasion techniques745 await page.evaluateOnNewDocument(() => {746 // Hide automation fingerprints747 delete Object.getPrototypeOf(navigator).webdriver;748 749 // Override platform info750 Object.defineProperty(navigator, 'platform', {751 get: () => 'MacIntel',752 });753 754 // Override hardware concurrency755 Object.defineProperty(navigator, 'hardwareConcurrency', {756 get: () => 8,757 });758 759 // Chrome specific properties760 if (typeof window !== 'undefined') {761 window.chrome = {762 runtime: {},763 loadTimes: function() {},764 csi: function() {},765 app: {766 isInstalled: false,767 },768 };769 }770 771 // Fix permissions behavior772 const originalQuery = window.navigator.permissions?.query;773 if (originalQuery) {774 
window.navigator.permissions.query = (parameters) => {775 return parameters.name === 'notifications' ?776 Promise.resolve({ state: Notification.permission }) :777 originalQuery(parameters);778 };779 }780 781 // Fix WebGL fingerprinting782 if (window.WebGLRenderingContext) {783 const getParameter = WebGLRenderingContext.prototype.getParameter;784 WebGLRenderingContext.prototype.getParameter = function(parameter) {785 // UNMASKED_VENDOR_WEBGL786 if (parameter === 37445) {787 return 'Google Inc. (Intel)';788 }789 // UNMASKED_RENDERER_WEBGL790 if (parameter === 37446) {791 return 'Intel Iris OpenGL Engine';792 }793 return getParameter.call(this, parameter);794 };795 }796 });797 798 // Generate random ordering for HTTP/2 headers to avoid fingerprinting799 const headerOrder = [800 'Accept',801 'Accept-Language',802 'Accept-Encoding',803 'Connection',804 'Upgrade-Insecure-Requests',805 'Sec-Fetch-Dest',806 'Sec-Fetch-Mode',807 'Sec-Fetch-Site',808 'Sec-Fetch-User',809 'Cache-Control',810 'DNT',811 'Referer'812 ].sort(() => Math.random() - 0.5);813 814 // Apply headers in randomized order815 const headers = {};816 for (const header of headerOrder) {817 switch (header) {818 case 'Accept':819 headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';820 break;821 case 'Accept-Language':822 headers['Accept-Language'] = profile && profile.locale ? 
823 `${profile.locale},en-US;q=0.9,en;q=0.8` : 824 'en-US,en;q=0.9';825 break;826 case 'Accept-Encoding':827 headers['Accept-Encoding'] = 'gzip, deflate, br';828 break;829 case 'Connection':830 headers['Connection'] = 'keep-alive';831 break;832 case 'Upgrade-Insecure-Requests':833 headers['Upgrade-Insecure-Requests'] = '1';834 break;835 case 'Sec-Fetch-Dest':836 headers['Sec-Fetch-Dest'] = 'document';837 break;838 case 'Sec-Fetch-Mode':839 headers['Sec-Fetch-Mode'] = 'navigate';840 break;841 case 'Sec-Fetch-Site':842 headers['Sec-Fetch-Site'] = 'none';843 break;844 case 'Sec-Fetch-User':845 headers['Sec-Fetch-User'] = '?1';846 break;847 case 'Cache-Control':848 headers['Cache-Control'] = 'max-age=0';849 break;850 case 'DNT':851 headers['DNT'] = '1';852 break;853 case 'Referer':854 // Use a previous browsing history item as referer occasionally855 if (sessionIndex !== -1 && 856 sessions[sessionIndex].browserHistory && 857 Math.random() > 0.5) {858 const historyItems = sessions[sessionIndex].browserHistory;859 if (historyItems.length > 0) {860 const randomItem = historyItems[Math.floor(Math.random() * historyItems.length)];861 headers['Referer'] = randomItem.url;862 } else {863 headers['Referer'] = 'https://www.google.com/search?q=upwork+jobs';864 }865 } else {866 headers['Referer'] = 'https://www.google.com/search?q=upwork+jobs';867 }868 break;869 }870 }871 872 // Set the headers with random ordering873 await page.setExtraHTTPHeaders(headers);874 875 // Apply much longer delays between requests (15-45 seconds)876 const delaySeconds = Math.floor(Math.random() * (maxDelayBetweenRequests - minDelayBetweenRequests + 1)) + minDelayBetweenRequests;877 console.log(`Waiting ${delaySeconds} seconds before the next request...`);878 await new Promise(resolve => setTimeout(resolve, delaySeconds * 1000));879 },880 ],881 postNavigationHooks: [882 // Hook to handle CAPTCHA and challenges after navigation883 async ({ page, request, response, session }) => {884 const { sessionId } = 
request.userData;885 886 // Take screenshot for debugging navigation issues887 if (!response || response.status() >= 400) {888 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });889 await Actor.setValue(`navigation_error_${Date.now()}.jpg`, screenshotBuffer, { contentType: "image/jpeg" });890 console.log(`Error screenshot saved for status ${response ? response.status() : 'No response'}`);891 892 // If it's a 403, try to use CloudScraper as a fallback893 if (response && response.status() === 403 && useCloudScraper) {894 try {895 console.log('Got 403, attempting to bypass with CloudScraper...');896 const cloudScraperResult = await cloudScraperRequest({897 method: 'GET',898 url: request.url,899 resolveWithFullResponse: true,900 });901 902 if (cloudScraperResult.response && cloudScraperResult.response.statusCode === 200) {903 console.log('CloudScraper successfully retrieved content, saving to dataset');904 905 // Save the HTML content to a file for manual analysis906 await Actor.setValue(`cloudscraper_content_${Date.now()}.html`, cloudScraperResult.body);907 908 // Use direct HTML parsing as a fallback if enabled909 if (useFallbackHtmlParser) {910 try {911 console.log('Attempting to parse jobs from Cloudflare-bypassed HTML...');912 const extractedJobs = parseJobsFromHTML(cloudScraperResult.body);913 914 if (extractedJobs.length > 0) {915 console.log(`Successfully extracted ${extractedJobs.length} jobs with HTML parser from Cloudflare-bypassed content`);916 917 // Save to dataset918 const dataset = await Actor.openDataset('jobs');919 await dataset.pushData(extractedJobs);920 921 console.log('Jobs from Cloudflare-bypassed content saved to dataset');922 923 // Mark the request as handled924 request.userData.parsedWithFallback = true;925 }926 } catch (parserError) {927 console.error('Error parsing Cloudflare-bypassed content:', parserError);928 }929 }930 } else {931 console.log(`CloudScraper also failed with status: 
${cloudScraperResult.response?.statusCode || 'unknown'}`);932 }933 } catch (error) {934 console.error('Error using CloudScraper fallback:', error);935 }936 }937 }938 939 // Check for security challenges940 const securityChallenges = await detectSecurityChallenges(page);941 942 if (securityChallenges.hasCaptcha) {943 console.log('CAPTCHA detected during navigation');944 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });945 await Actor.setValue(`captcha_challenge_${Date.now()}.jpg`, screenshotBuffer, { contentType: "image/jpeg" });946 947 // Try to solve the CAPTCHA with the plugin first if available948 if (captchaApiKey) {949 try {950 console.log('Attempting to solve with puppeteer-extra-plugin-recaptcha...');951 await page.solveRecaptchas();952 console.log('Recaptcha solved with plugin');953 954 // Check if there's a form to submit after solving955 const submitButton = await page.$('button[type="submit"]');956 if (submitButton) {957 await submitButton.click();958 await page.waitForNavigation({ timeout: 30000 });959 }960 } catch (recaptchaError) {961 console.error('Error with recaptcha plugin:', recaptchaError);962 // Fall back to our custom implementation963 const solved = await solveCaptcha(page);964 if (solved) {965 console.log('Navigation CAPTCHA solved successfully with fallback method');966 } else {967 console.log('Failed to solve navigation CAPTCHA');968 session.markBad();969 }970 }971 } else {972 // Use our custom implementation973 const solved = await solveCaptcha(page);974 if (solved) {975 console.log('Navigation CAPTCHA solved successfully');976 } else {977 console.log('Failed to solve navigation CAPTCHA');978 session.markBad();979 }980 }981 }982 983 if (securityChallenges.hasCloudflare) {984 console.log('Cloudflare challenge detected during navigation');985 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });986 await Actor.setValue(`cloudflare_challenge_${Date.now()}.jpg`, screenshotBuffer, { 
contentType: "image/jpeg" });987 988 // Try to bypass with CloudScraper if enabled989 if (useCloudScraper) {990 try {991 console.log('Attempting to bypass Cloudflare with CloudScraper...');992 const cloudScraperResult = await cloudScraperRequest({993 method: 'GET',994 url: request.url,995 resolveWithFullResponse: true,996 });997 998 if (cloudScraperResult.response && cloudScraperResult.response.statusCode === 200) {999 console.log('CloudScraper successfully bypassed Cloudflare, saving content');1000 await Actor.setValue(`cloudscraper_cf_bypass_${Date.now()}.html`, cloudScraperResult.body);1001 1002 // Try to parse the HTML content directly1003 if (useFallbackHtmlParser) {1004 try {1005 console.log('Attempting to parse jobs from Cloudflare-bypassed HTML...');1006 const extractedJobs = parseJobsFromHTML(cloudScraperResult.body);1007 1008 if (extractedJobs.length > 0) {1009 console.log(`Successfully extracted ${extractedJobs.length} jobs with HTML parser from Cloudflare-bypassed content`);1010 1011 // Save to dataset1012 const dataset = await Actor.openDataset('jobs');1013 await dataset.pushData(extractedJobs);1014 1015 console.log('Jobs from Cloudflare-bypassed content saved to dataset');1016 1017 // Mark the request as handled1018 request.userData.parsedWithFallback = true;1019 }1020 } catch (parserError) {1021 console.error('Error parsing Cloudflare-bypassed content:', parserError);1022 }1023 }1024 } else {1025 console.log('CloudScraper also failed to bypass Cloudflare');1026 }1027 } catch (error) {1028 console.error('Error using CloudScraper for Cloudflare bypass:', error);1029 }1030 }1031 1032 // Wait longer for Cloudflare to resolve1033 console.log('Waiting longer for Cloudflare challenge resolution...');1034 await new Promise(resolve => setTimeout(resolve, 30000));1035 }1036 1037 // After handling any challenges, simulate human behavior again1038 await emulateHumanBehavior(page);1039 1040 // Random network pattern: Sometimes load additional resources to appear 
more human-like1041 if (Math.random() > 0.7) {1042 try {1043 console.log('Simulating additional resource loading for natural network patterns...');1044 1045 // Choose random URLs to visit briefly1046 const commonResources = [1047 'https://www.upwork.com/static/assets/css/main.css',1048 'https://www.upwork.com/static/fonts/font.woff2',1049 'https://www.upwork.com/ab/account-security/login',1050 'https://www.upwork.com/nx/create-profile/',1051 'https://www.upwork.com/resources/'1052 ];1053 1054 // Pick 1-2 random resources1055 const resourceCount = Math.floor(Math.random() * 2) + 1;1056 const selectedResources = [...commonResources]1057 .sort(() => 0.5 - Math.random())1058 .slice(0, resourceCount);1059 1060 // Fetch these resources in the background1061 for (const resourceUrl of selectedResources) {1062 await page.evaluate((url) => {1063 // Create and append a hidden iframe to load the resource1064 const iframe = document.createElement('iframe');1065 iframe.style.width = '0px';1066 iframe.style.height = '0px';1067 iframe.style.position = 'absolute';1068 iframe.style.top = '-9999px';1069 iframe.style.left = '-9999px';1070 iframe.src = url;1071 document.body.appendChild(iframe);1072 1073 // Remove it after a short time1074 setTimeout(() => {1075 if (iframe && iframe.parentNode) {1076 iframe.parentNode.removeChild(iframe);1077 }1078 }, 5000);1079 }, resourceUrl);1080 1081 // Wait a short time between resource requests1082 await new Promise(resolve => setTimeout(resolve, Math.floor(Math.random() * 2000) + 1000));1083 }1084 1085 console.log('Additional resources loaded for natural network pattern');1086 } catch (error) {1087 console.error('Error simulating additional resource loading:', error);1088 }1089 }1090 },1091 ],1092 proxyConfiguration: proxyCfg,1093 launchContext: {1094 launcher: puppeteer,1095 launchOptions: {1096 headless: useNewHeadless ? 
"new" : true,1097 ignoreHTTPSErrors: true,1098 args: [1099 '--disable-gpu',1100 '--no-sandbox',1101 '--disable-setuid-sandbox',1102 '--disable-blink-features=AutomationControlled',1103 '--disable-accelerated-2d-canvas',1104 '--disable-infobars',1105 '--window-size=1920,1080',1106 '--hide-scrollbars',1107 '--disable-notifications',1108 '--disable-extensions',1109 '--ignore-certificate-errors',1110 ...(disableWebSecurity ? ['--disable-web-security'] : []),1111 ...(bypassCSP ? ['--disable-features=IsolateOrigins,site-per-process', '--disable-site-isolation-trials'] : []),1112 ],1113 defaultViewport: {1114 width: 1920,1115 height: 1080,1116 },1117 },1118 },1119 });1120 }1121
1122 // Lancer le crawler avec l'URL initiale (only if we're using the original crawler)1123 if (!input.useEnhancedCrawler) {1124 await crawler.run(requestList);1125 }1126
1127 await Actor.exit();1128}1129
// Start the actor. The rejection handler reports a failed run explicitly and
// ends the process with a non-zero exit code instead of leaving the promise
// returned by main() floating.
main().catch((error) => {
    console.error('Actor run failed:', error);
    process.exit(1);
});

export default main;
src/routes.js
1import { Actor, Dataset } from 'apify';2
// Pool of desktop User-Agent strings to rotate through (helps avoid anti-bot detection)
const userAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
];

/**
 * Picks one entry of `userAgents` at random.
 * @returns {string} A User-Agent string from the pool.
 */
const getRandomUserAgent = () => {
    const index = Math.floor(Math.random() * userAgents.length);
    return userAgents[index];
};
/**
 * Waits for a random amount of time, used to mimic human pauses.
 * @param {number} minMs Lower bound of the delay in milliseconds.
 * @param {number} maxMs Upper bound of the delay in milliseconds (inclusive).
 * @returns {Promise<void>} Resolves once the randomly chosen delay has elapsed.
 */
const randomSleep = (minMs, maxMs) => {
    const span = maxMs - minMs + 1;
    const delayMs = Math.floor(Math.random() * span) + minMs;
    return new Promise((resolve) => {
        setTimeout(resolve, delayMs);
    });
};
/**
 * Debug helper: captures the current viewport as a JPEG, stores it through
 * `Actor.setValue` and logs a URL pointing at the stored record.
 * @param {Object} page Puppeteer page to capture.
 * @param {string} filename Prefix of the record key; a timestamp and `.jpeg` are appended.
 * @returns {Promise<string|null>} URL of the stored screenshot, or null when capturing or storing failed.
 */
const takeScreenshot = async (page, filename) => {
    const captureOptions = { type: "jpeg", quality: 50, fullPage: false };

    try {
        const image = await page.screenshot(captureOptions);
        const recordKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(recordKey, image, { contentType: "image/jpeg" });

        const { defaultKeyValueStoreId } = Actor.getEnv();
        const recordUrl = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${recordKey}`;
        console.log(`📸 Screenshot saved: ${recordUrl}`);
        return recordUrl;
    } catch (error) {
        // Screenshots are best-effort diagnostics; never let them fail the request
        console.error(`❌ Failed to take screenshot: ${error.message}`);
        return null;
    }
};
/**
 * Imitates a human visitor on the loaded page: a few synthetic mouse moves,
 * a stepwise scroll through roughly the first third of the document
 * (occasionally scrolling back up a little) and a final random pause.
 * @param {Object} page Puppeteer page to interact with.
 * @returns {Promise<void>}
 */
const simulateHumanInteraction = async (page) => {
    // Mouse: dispatch `mousemove` events at five random viewport positions
    await page.evaluate(() => {
        const randomPoint = () => ({
            x: Math.floor(Math.random() * window.innerWidth),
            y: Math.floor(Math.random() * window.innerHeight),
        });

        const waypoints = Array.from({ length: 5 }, randomPoint);

        for (const { x, y } of waypoints) {
            document.dispatchEvent(new MouseEvent('mousemove', {
                bubbles: true,
                cancelable: true,
                clientX: x,
                clientY: y,
            }));
        }
    });

    // Scroll: random-sized steps with random pauses until a third of the page is passed
    await page.evaluate(async () => {
        const pause = (ms) => new Promise((done) => {
            setTimeout(done, ms);
        });
        const randomInt = (range, offset) => Math.floor(Math.random() * range) + offset;

        const scrollLimit = Math.max(
            document.body.scrollHeight,
            document.documentElement.scrollHeight
        ) / 3;

        await pause(randomInt(500, 100));

        let offsetY = 0;
        for (;;) {
            offsetY += randomInt(100, 50);
            window.scrollTo(0, offsetY);

            if (!(offsetY < scrollLimit)) {
                break;
            }
            await pause(randomInt(200, 100));
        }

        // Roughly 30% of the time, scroll back up a bit after a short hesitation
        if (Math.random() > 0.7) {
            await pause(randomInt(500, 500));
            window.scrollTo(0, offsetY - Math.floor(Math.random() * 300));
        }
    });

    // Finish with a random pause
    await randomSleep(1000, 3000);
};
/**
 * Checks whether the loaded page looks like a Cloudflare challenge.
 * The check looks at the document title and at two challenge-related selectors.
 * @param {Object} page Puppeteer page to inspect.
 * @returns {Promise<boolean>} True when any Cloudflare marker is present.
 */
const hasCloudflareChallenge = async (page) => {
    return page.evaluate(() => {
        const titleMarkers = ['Cloudflare', 'Attention Required'];
        if (titleMarkers.some((marker) => document.title.includes(marker))) {
            return true;
        }

        const challengeSelectors = ['div[class*="cf-"]', '#challenge-form'];
        return challengeSelectors.some((selector) => document.querySelector(selector) !== null);
    });
};
/**
 * Checks the loaded page for other security challenges (CAPTCHA, DataDome, etc.).
 *
 * A page counts as a challenge when its visible text contains one of the known
 * indicator phrases or when a CAPTCHA-related iframe/div is present.
 * @param {Object} page Puppeteer page to inspect.
 * @returns {Promise<boolean>} True when a challenge indicator is found.
 */
const hasSecurityChallenge = async (page) => {
    return page.evaluate(() => {
        // A document without a <body> has no visible text; treat it as empty
        // instead of throwing on `null.innerText`.
        const pageText = (document.body?.innerText ?? '').toLowerCase();

        // Common security challenge indicators (lower-case, matched against pageText)
        const securityIndicators = [
            'captcha',
            'security check',
            'bot protection',
            'human verification',
            'datadome',
            'are you a robot',
            'prove you are human',
            'перехресні дороги', // presumably a localized reCAPTCHA prompt — confirm
            'traffic light', // reCAPTCHA phrase
            'human challenge'
        ];

        const hasTextIndicator = securityIndicators.some((indicator) => pageText.includes(indicator));

        // Elements that typically host a CAPTCHA widget
        const challengeSelectors = [
            'iframe[src*="captcha"]',
            'iframe[src*="recaptcha"]',
            'div[class*="captcha"]',
            'div[class*="g-recaptcha"]',
            'div[class*="h-captcha"]',
        ];
        const hasElementIndicator = challengeSelectors.some(
            (selector) => document.querySelector(selector) !== null
        );

        return hasTextIndicator || hasElementIndicator;
    });
};
/**
 * Init script installed with `page.evaluateOnNewDocument`: patches browser
 * APIs that are commonly used for fingerprinting / automation detection.
 *
 * It runs inside the page, so it must stay self-contained and must not
 * reference anything from this module's scope.
 */
function installFingerprintEvasions() {
    // Basic webdriver removal
    delete Object.getPrototypeOf(window.navigator).webdriver;

    // WebGL: report a fixed vendor/renderer pair
    const overrideWebGL = () => {
        if (!window.WebGLRenderingContext) {
            return;
        }

        const originalGetParameter = window.WebGLRenderingContext.prototype.getParameter;
        window.WebGLRenderingContext.prototype.getParameter = function (parameter) {
            // UNMASKED_VENDOR_WEBGL
            if (parameter === 37445) {
                return 'Google Inc. (Intel)';
            }
            // UNMASKED_RENDERER_WEBGL
            if (parameter === 37446) {
                return 'Intel Iris OpenGL Engine';
            }
            return originalGetParameter.call(this, parameter);
        };
    };

    // Canvas: flip the lowest bit of a small random share of pixels read back from 2d contexts
    const overrideCanvas = () => {
        if (!window.HTMLCanvasElement) {
            return;
        }

        const originalGetContext = window.HTMLCanvasElement.prototype.getContext;
        window.HTMLCanvasElement.prototype.getContext = function (type, attributes) {
            const context = originalGetContext.call(this, type, attributes);

            if (context && type === '2d') {
                const originalGetImageData = context.getImageData;
                context.getImageData = function (...args) {
                    const imageData = originalGetImageData.apply(this, args);
                    const pixels = imageData.data;
                    for (let i = 0; i < pixels.length; i += 4) {
                        // Only modify a small percentage of pixels for subtle alteration
                        if (Math.random() < 0.005) {
                            pixels[i] = pixels[i] ^ 1; // Red
                            pixels[i + 1] = pixels[i + 1] ^ 1; // Green
                            pixels[i + 2] = pixels[i + 2] ^ 1; // Blue
                        }
                    }
                    return imageData;
                };
            }

            return context;
        };
    };

    // Audio: add tiny noise to the first samples of the first buffer that is read
    const overrideAudioContext = () => {
        if (!window.AudioContext || !window.AudioBuffer) {
            return;
        }

        const originalGetChannelData = window.AudioBuffer.prototype.getChannelData;
        window.AudioBuffer.prototype.getChannelData = function (channel) {
            const data = originalGetChannelData.call(this, channel);
            if (window.AUDIO_FINGERPRINT_DEFENDED !== true && data.length > 0) {
                window.AUDIO_FINGERPRINT_DEFENDED = true;
                for (let i = 0; i < Math.min(data.length, 8); i++) {
                    data[i] = data[i] + Math.random() * 0.0001;
                }
            }
            return data;
        };
    };

    // Fonts: turn font loading failures into an empty result
    const overrideFonts = () => {
        if (document.fonts && document.fonts.load) {
            const originalLoad = document.fonts.load;
            document.fonts.load = function (...args) {
                return originalLoad.apply(this, args).catch(() => []);
            };
        }
    };

    // Locale: default Intl.DateTimeFormat to en-US when no locale is passed
    const overrideTimeAndLanguage = () => {
        if (!window.Intl || !window.Intl.DateTimeFormat) {
            return;
        }

        const OriginalDateTimeFormat = window.Intl.DateTimeFormat;
        const PatchedDateTimeFormat = function (...args) {
            if (args.length === 0 || !args[0]) {
                args[0] = 'en-US';
            }
            return new OriginalDateTimeFormat(...args);
        };
        // Keep `instanceof` checks and the static API working; a bare wrapper
        // function would drop both and break code relying on them.
        PatchedDateTimeFormat.prototype = OriginalDateTimeFormat.prototype;
        PatchedDateTimeFormat.supportedLocalesOf = (...args) => OriginalDateTimeFormat.supportedLocalesOf(...args);
        window.Intl.DateTimeFormat = PatchedDateTimeFormat;
    };

    overrideWebGL();
    overrideCanvas();
    overrideAudioContext();
    overrideFonts();
    overrideTimeAndLanguage();

    // Chrome specific properties
    if (typeof window !== 'undefined') {
        window.chrome = {
            runtime: {},
            loadTimes: function () {},
            csi: function () {},
            app: {
                isInstalled: false,
            },
        };
    }

    // Fix permissions behavior. The original method has to be invoked with the
    // Permissions object as `this`; calling the detached function throws
    // "Illegal invocation" for every non-notification query.
    const { permissions } = window.navigator;
    const originalQuery = permissions?.query;
    if (originalQuery) {
        permissions.query = (parameters) => (
            parameters.name === 'notifications'
                ? Promise.resolve({ state: Notification.permission })
                : originalQuery.call(permissions, parameters)
        );
    }
}

/**
 * In-page extractor: locates job tiles with the first selector that matches
 * anything and reads each field through a list of fallback selectors.
 * Runs inside the page, so it must not reference module scope.
 * @returns {Array<Object>} One record per job tile; fields that cannot be found are "N/A".
 */
function extractJobsFromDom() {
    // Try multiple selector strategies to find job listings
    const jobSelectors = [
        'article[data-test="JobTile"]',
        'article.job-tile',
        'div[class*="job-tile"]',
        'div[class*="jobTile"]',
        'section.job-list article',
        'div.up-card-section'
    ];

    let jobElements = [];
    for (const selector of jobSelectors) {
        jobElements = document.querySelectorAll(selector);
        if (jobElements.length > 0) {
            break;
        }
    }

    return Array.from(jobElements).map((job) => {
        // First non-empty text among the candidate selectors
        const getTextFromSelectors = (selectors) => {
            for (const selector of selectors) {
                const element = job.querySelector(selector);
                if (element && element.innerText.trim()) {
                    return element.innerText.trim();
                }
            }
            return "N/A";
        };

        // First href among the candidate selectors, made absolute
        const getLinkFromSelectors = (selectors) => {
            for (const selector of selectors) {
                const element = job.querySelector(selector);
                if (element && element.getAttribute('href')) {
                    const href = element.getAttribute('href');
                    return href.startsWith('http') ? href : `https://www.upwork.com${href}`;
                }
            }
            return "N/A";
        };

        return {
            title: getTextFromSelectors([
                '[data-test="job-tile-title-link"]',
                'a[href*="/job/"]',
                '.job-title',
                'h2 a',
                'h3 a'
            ]),
            jobLink: getLinkFromSelectors([
                '[data-test="job-tile-title-link"]',
                'a[href*="/job/"]',
                '.job-title a',
                'h2 a',
                'h3 a'
            ]),
            postedDate: getTextFromSelectors([
                '[data-test="job-pubilshed-date"] span:nth-child(2)',
                '.posted-on',
                'span[class*="postedOn"]',
                'span[class*="datePosted"]'
            ]),
            jobType: getTextFromSelectors([
                '[data-test="job-type-label"] strong',
                '.contract-type',
                'span[class*="jobType"]'
            ]),
            experienceLevel: getTextFromSelectors([
                '[data-test="experience-level"] strong',
                '.experience-level',
                'span[class*="experience"]'
            ]),
            duration: getTextFromSelectors([
                '[data-test="duration-label"] strong:nth-child(2)',
                '.duration',
                'span[class*="duration"]'
            ]),
            description: getTextFromSelectors([
                '[data-test="UpCLineClamp JobDescription"] p',
                '.job-description',
                'div[class*="description"]',
                'p.description'
            ]),
            budget: getTextFromSelectors([
                '[data-test="budget"] span',
                '.budget',
                'span[class*="budget"]',
                'span[class*="price"]'
            ]),
            skills: getTextFromSelectors([
                '[data-test="skills"]',
                '.skills',
                'div[class*="skills"]'
            ]),
        };
    });
}

/**
 * Main scraping handler: prepares the page (user agent, headers, fingerprint
 * evasions), loads the search URL, bails out on challenges or bad responses
 * and pushes the extracted job records to the default dataset.
 *
 * Failures are logged and the session is marked bad exactly once; the error is
 * not rethrown.
 * NOTE(review): because errors are swallowed here, the caller never sees a
 * failed request — confirm whether retries by the crawler are wanted.
 *
 * @param {Object} context Crawling context.
 * @param {Object} context.request Request being processed (its `url` is loaded).
 * @param {Object} context.page Puppeteer page.
 * @param {Object} context.log Logger with `info`, `warning` and `error`.
 * @param {Object} context.session Session; `markBad()` is called on failure.
 * @returns {Promise<void>}
 */
export async function handleRequest({ request, page, log, session }) {
    log.info(`🔍 Scraping: ${request.url}`);

    try {
        // Use a random User-Agent
        await page.setUserAgent(getRandomUserAgent());

        // Add more realistic extra HTTP headers
        await page.setExtraHTTPHeaders({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
            'Referer': 'https://www.google.com/search?q=upwork+jobs',
        });

        // Configure browser fingerprinting evasion for the upcoming navigation
        await page.evaluateOnNewDocument(installFingerprintEvasions);

        // Random pause before loading the page (human-like behavior)
        await randomSleep(2000, 5000);

        // Load the page with a patient strategy
        const response = await page.goto(request.url, {
            waitUntil: 'networkidle2',
            timeout: 180000
        });

        // Check for Cloudflare and other protection challenges
        const isCloudflare = await hasCloudflareChallenge(page);
        const isSecurityChallenge = await hasSecurityChallenge(page);

        if (isCloudflare) {
            log.warning("⚠️ Cloudflare challenge detected! Taking screenshot and waiting...");
            await takeScreenshot(page, "cloudflare_challenge");

            // Wait longer for Cloudflare to resolve (sometimes it passes automatically)
            await randomSleep(15000, 30000);

            await takeScreenshot(page, "after_cloudflare_wait");
        }

        // The throws below are handled by the catch at the end of this function,
        // which is the single place where the session is marked bad.
        if (isSecurityChallenge) {
            log.warning("⚠️ Security challenge (CAPTCHA, etc.) detected! Taking screenshot...");
            await takeScreenshot(page, "security_challenge");

            // Store the page HTML for analysis
            await Actor.setValue('challenge_page_html', await page.content());

            throw new Error("Security challenge detected - possible CAPTCHA or verification required");
        }

        // Blocked with 403. A missing response is handled by the generic check
        // below so it is not misreported as a 403.
        if (response && response.status() === 403) {
            log.warning(`🚨 403 Forbidden. Taking screenshot...`);
            await takeScreenshot(page, "403_error");

            await Actor.setValue('forbidden_page_html', await page.content());

            throw new Error("Blocked with 403");
        }

        // Verify the response is OK
        if (!response || !response.ok()) {
            const statusText = response ? response.status() : 'No response';
            log.warning(`🚨 Bad response: ${statusText}. Taking screenshot...`);
            await takeScreenshot(page, "bad_response");

            await Actor.setValue('bad_response_html', await page.content());

            throw new Error(`Bad response: ${statusText}`);
        }

        // Longer pause after the page has loaded
        await randomSleep(5000, 10000);

        // Check that job tiles exist before continuing
        const hasJobElements = await page.evaluate(
            () => document.querySelectorAll('article[data-test="JobTile"]').length > 0
        );

        if (!hasJobElements) {
            log.warning("No job elements found on the page. Taking screenshot...");
            await takeScreenshot(page, "no_jobs_found");

            // Save the HTML for debugging
            await Actor.setValue('page_html', await page.content());
            log.info("Page HTML saved for debugging");

            // Check whether we landed on a login or CAPTCHA page
            const isLoginPage = await page.evaluate(() => {
                const text = document.body.innerText;
                return text.includes('Log In') ||
                    text.includes('Sign In') ||
                    text.includes('CAPTCHA');
            });

            if (isLoginPage) {
                log.warning("Login or CAPTCHA page detected. Authentication may be required.");
                throw new Error("Authentication required - redirected to login page");
            }

            // Try scraping anyway - in case the selector has changed
            log.info("Continuing anyway - selector might have changed");
        }

        // Simulate realistic human browsing behavior
        await simulateHumanInteraction(page);

        // Extract the job records with stable (`data-test`) selectors first
        const jobs = await page.evaluate(extractJobsFromDom);

        log.info(`✅ Found ${jobs.length} job listings.`);

        // Store the result; each job becomes one dataset item
        await Dataset.pushData(jobs);
    } catch (error) {
        log.error(`❌ Error scraping ${request.url}: ${error.message}`);
        session.markBad();
    }
}
src/upwork-challenge-bypass.js
/**
 * Upwork Challenge Bypass Module
 * Specialized techniques to bypass Upwork's advanced anti-bot system
 */
6import puppeteer from 'puppeteer-extra';7import StealthPlugin from 'puppeteer-extra-plugin-stealth';8import { FingerprintGenerator } from 'fingerprint-generator';9import { FingerprintInjector } from 'fingerprint-injector';10import { HttpsProxyAgent } from 'https-proxy-agent';11import fetch from 'node-fetch';12import fs from 'fs/promises';13import path from 'path';14import { fileURLToPath } from 'url';15
// Register the stealth plugin on the shared puppeteer-extra instance
puppeteer.use(StealthPlugin());

// ES modules have no built-in __dirname; derive it from this module's URL
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Fingerprint generator limited to desktop devices for the browsers,
// operating systems and locales listed below
const fingerprintGenerator = new FingerprintGenerator({
    browsers: [
        { name: 'chrome', minVersion: 88 },
        { name: 'firefox', minVersion: 94 },
        { name: 'safari', minVersion: 15 },
    ],
    operatingSystems: ['windows', 'macos', 'linux'],
    devices: ['desktop'],
    locales: ['en-US', 'en-GB', 'de-DE', 'fr-FR'],
});

// Injector used to attach generated fingerprints to pages
const fingerprintInjector = new FingerprintInjector();
/**
 * Returns the header patterns to mimic, depending on whether a capture of a
 * successful Upwork session is available.
 *
 * This is a simplified implementation: the capture is not parsed. Its presence
 * only selects between two hard-coded pattern sets, so the file is merely
 * checked for existence instead of being read into memory.
 *
 * @param {string} captureFile Path to a TCPDump capture from a working session.
 * @returns {Promise<{order: string[], specialValues: Object<string, string>}>}
 *   Header order and header values to apply to outgoing requests.
 */
async function analyzeWorkingHeaders(captureFile) {
    const order = ['host', 'user-agent', 'accept', 'accept-language', 'accept-encoding', 'connection'];

    try {
        await fs.access(captureFile);
    } catch (error) {
        console.log('No capture file available, using default headers');
        return {
            order,
            specialValues: {
                'sec-fetch-site': 'none',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-user': '?1',
                'sec-fetch-dest': 'document',
            }
        };
    }

    // TODO: parse the capture (pcap) and derive these values from it
    return {
        order,
        specialValues: {
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'sec-ch-ua-platform': '"Windows"',
            'sec-ch-ua-mobile': '?0',
        }
    };
}
/**
 * Launches a browser configured for fingerprint evasion.
 *
 * @param {Object} [options] Browser configuration options.
 * @param {string} [options.proxy] Proxy server passed as `--proxy-server`.
 * @param {boolean} [options.headless=false] Headless mode passed to the launcher.
 * @param {string} [options.profileDir] User data directory for the browser profile.
 * @param {string} [options.cookiesPath] JSON file with cookies to load into the first page.
 * @param {Object} [options.fingerprintOptions] Extra options for the fingerprint generator.
 * @returns {Promise<{browser: Object, fingerprint: Object}>} The launched browser
 *   and the generated fingerprint, exactly as returned by the generator.
 */
async function createEvasiveBrowser(options = {}) {
    const {
        proxy,
        headless = false,
        profileDir,
        cookiesPath,
        fingerprintOptions = {},
    } = options;

    // Generate fingerprint
    const fingerprint = fingerprintGenerator.getFingerprint({
        devices: ['desktop'],
        ...fingerprintOptions,
    });

    // fingerprint-generator 2.x wraps its result as `{ fingerprint, headers }`
    // with the user agent under `fingerprint.navigator.userAgent`; flatter
    // shapes are accepted too. Without this lookup the browser would be
    // started with the literal user agent "undefined".
    const userAgent = fingerprint?.userAgent
        ?? fingerprint?.navigator?.userAgent
        ?? fingerprint?.fingerprint?.navigator?.userAgent;

    // Enhanced browser launch options
    const launchOptions = {
        headless,
        args: [
            '--disable-blink-features=AutomationControlled',
            // One combined switch: Chromium honors a single occurrence of a
            // switch, so separate --disable-features flags must not be relied on.
            '--disable-features=IsolateOrigins,site-per-process,ShutdownEventDrain',
            '--disable-site-isolation-trials',
            '--disable-web-security',
            '--disable-setuid-sandbox',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--disable-background-networking',
            '--disable-default-apps',
            '--disable-extensions',
            '--disable-sync',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-client-side-phishing-detection',
            '--disable-component-extensions-with-background-pages',
            '--disable-domain-reliability',
            '--disable-hang-monitor',
            '--disable-ipc-flooding-protection',
            '--disable-notifications',
            '--disable-popup-blocking',
            '--disable-prompt-on-repost',
            '--disable-renderer-backgrounding',
            '--disable-speech-api',
            '--disable-breakpad',
            '--metrics-recording-only',
            '--mute-audio',
            '--no-default-browser-check',
            '--password-store=basic',
            ...(userAgent ? [`--user-agent=${userAgent}`] : []),
            '--window-size=1920,1080',
            '--window-position=0,0',
        ],
        ignoreHTTPSErrors: true,
        defaultViewport: null,
    };

    // Add proxy if provided
    if (proxy) {
        launchOptions.args.push(`--proxy-server=${proxy}`);
    }

    // Add profile directory if provided
    if (profileDir) {
        launchOptions.userDataDir = profileDir;
    }

    // Launch browser
    const browser = await puppeteer.launch(launchOptions);

    // Load cookies if available (best effort: a missing or invalid file is not fatal)
    if (cookiesPath) {
        try {
            const cookiesString = await fs.readFile(cookiesPath, 'utf8');
            const cookies = JSON.parse(cookiesString);
            const [firstPage] = await browser.pages();
            await firstPage.setCookie(...cookies);
        } catch (error) {
            console.log('No cookies found or invalid format');
        }
    }

    return { browser, fingerprint };
}
/**
 * Inspects the freshly loaded page and, when it looks like a challenge page,
 * tries the simple interactions (checkbox, verify/continue/submit button).
 * @param {Object} page Puppeteer page object.
 * @param {number} settleMs Time to wait for a verification widget to appear.
 * @returns {Promise<void>}
 */
async function solveChallengeIfPresent(page, settleMs) {
    const isChallengePage = await page.evaluate(() => {
        return document.title.includes('Challenge') ||
            document.querySelector('form[id*="challenge"]') !== null;
    });

    if (!isChallengePage) {
        return;
    }

    console.log('Challenge page detected, attempting to solve...');

    // Wait to see if any human verification appears.
    // (`page.waitForTimeout` no longer exists in Puppeteer 22, hence the plain timer.)
    await new Promise((resolve) => {
        setTimeout(resolve, settleMs);
    });

    const hasRecaptcha = await page.evaluate(() => {
        return document.querySelector('iframe[src*="recaptcha"]') !== null;
    });

    if (hasRecaptcha) {
        console.log('reCAPTCHA detected - this requires manual intervention');
        // Would need external CAPTCHA solving service integration here
        return;
    }

    // Try to find and click any "I'm not a robot" checkbox
    const clicked = await page.evaluate(() => {
        const checkbox = document.querySelector('input[type="checkbox"]');
        if (checkbox) {
            checkbox.click();
            return true;
        }
        return false;
    });

    if (clicked) {
        console.log('Clicked verification checkbox');
        await page.waitForNavigation({ timeout: 10000 }).catch(() => {});
    }

    // Try to find and click any continue/verify buttons
    const buttonClicked = await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const verifyButton = buttons.find((button) =>
            button.textContent.includes('Verify') ||
            button.textContent.includes('Continue') ||
            button.textContent.includes('Submit')
        );

        if (verifyButton) {
            verifyButton.click();
            return true;
        }
        return false;
    });

    if (buttonClicked) {
        console.log('Clicked verification button');
        await page.waitForNavigation({ timeout: 10000 }).catch(() => {});
    }
}

/**
 * Prepares a page for Upwork's challenge page: attaches the fingerprint,
 * rewrites outgoing request headers and installs a `load` listener that tries
 * to get past a challenge when one is shown.
 *
 * @param {Object} page Puppeteer page object.
 * @param {Object} fingerprint Browser fingerprint data.
 * @param {Object} [options] Optional tuning.
 * @param {number} [options.challengeSettleMs=3000] Time to wait after a
 *   challenge page is detected before interacting with it.
 * @returns {Promise<boolean>} True when the page was set up, false when setup failed.
 */
async function bypassChallenge(page, fingerprint, options = {}) {
    const { challengeSettleMs = 3000 } = options;

    try {
        // Inject complete fingerprint into page
        await fingerprintInjector.attachFingerprintToPuppeteer(page, fingerprint);

        // Analyze headers from working sessions if available
        const headerPatterns = await analyzeWorkingHeaders(
            path.join(__dirname, '../network_analysis/successful_capture.pcap')
        );

        // Intercept and modify requests to match successful patterns
        await page.setRequestInterception(true);
        page.on('request', async (request) => {
            // Another interception handler may already have resolved this
            // request; continuing it again would throw.
            if (request.isInterceptResolutionHandled?.()) {
                return;
            }

            try {
                // Apply special header values on top of the request's own headers
                const headers = { ...request.headers(), ...headerPatterns.specialValues };

                // Reorder headers to match typical browser patterns
                const orderedHeaders = {};
                for (const name of headerPatterns.order) {
                    if (headers[name]) {
                        orderedHeaders[name] = headers[name];
                    }
                }

                // Add remaining headers
                for (const [name, value] of Object.entries(headers)) {
                    if (!orderedHeaders[name]) {
                        orderedHeaders[name] = value;
                    }
                }

                await request.continue({ headers: orderedHeaders });
            } catch (error) {
                // An event handler has no caller to report to; log instead of
                // producing an unhandled rejection.
                console.error('Error continuing intercepted request:', error.message);
            }
        });

        // Monitor for challenge elements
        page.on('load', async () => {
            try {
                await solveChallengeIfPresent(page, challengeSettleMs);
            } catch (error) {
                console.error('Error while handling challenge page:', error.message);
            }
        });

        return true;
    } catch (error) {
        console.error('Error in challenge bypass:', error.message);
        return false;
    }
}
/**
 * Makes a direct (browser-less) request with browser-like headers.
 *
 * @param {string} url Target URL.
 * @param {Object} [options] Request options.
 * @param {string} [options.proxy] Proxy URL; when set the request goes through an HttpsProxyAgent.
 * @param {string} [options.cookies=''] Value for the Cookie header; the header is omitted when empty.
 * @param {Object} [options.fingerprint] Fingerprint whose user agent is sent; generated when omitted.
 * @returns {Promise<Object>} One of:
 *   `{ success, status, data }` for a regular response,
 *   `{ needsBrowser: true, success: false, status }` when a challenge page or a 403 was returned,
 *   `{ success: false, error }` when the request itself failed.
 */
async function makeEvasiveRequest(url, options = {}) {
    const {
        proxy,
        cookies = '',
        fingerprint = fingerprintGenerator.getFingerprint(),
    } = options;

    // fingerprint-generator 2.x wraps its result as `{ fingerprint, headers }`;
    // flatter shapes are accepted too.
    const userAgent = fingerprint?.userAgent
        ?? fingerprint?.navigator?.userAgent
        ?? fingerprint?.fingerprint?.navigator?.userAgent;

    const headers = {
        // Leave the header out rather than sending the literal string "undefined"
        ...(userAgent ? { 'User-Agent': userAgent } : {}),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
        'TE': 'trailers',
    };

    // Browsers do not send an empty Cookie header; only add it when there is a value
    if (cookies) {
        headers.Cookie = cookies;
    }

    const fetchOptions = { headers };

    // Add proxy if provided
    if (proxy) {
        fetchOptions.agent = new HttpsProxyAgent(proxy);
    }

    // Randomize TLS fingerprint patterns
    // NOTE(review): Node.js does not document a NODE_TLS_CIPHER_SUITES variable,
    // so this assignment presumably has no effect on the TLS handshake — confirm.
    process.env.NODE_TLS_CIPHER_SUITES = getRandomCipherSuites();

    try {
        const response = await fetch(url, fetchOptions);

        // If we get a challenge page, we need browser-based approach
        const text = await response.text();
        if (text.includes('Challenge - Upwork') || response.status === 403) {
            return { needsBrowser: true, success: false, status: response.status };
        }

        return {
            success: response.ok,
            status: response.status,
            data: text
        };
    } catch (error) {
        console.error('Error in evasive request:', error.message);
        return { success: false, error: error.message };
    }
}
/**
 * Builds a colon-separated list of TLS cipher suite names in random order,
 * intended to vary the TLS fingerprint between requests.
 * @returns {string} All suite names below, shuffled and joined with ':'.
 */
function getRandomCipherSuites() {
    const shuffled = [
        'TLS_AES_128_GCM_SHA256',
        'TLS_AES_256_GCM_SHA384',
        'TLS_CHACHA20_POLY1305_SHA256',
        'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256',
        'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256',
        'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
        'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
        'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256',
        'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256',
    ];

    // In-place shuffle: walk from the last slot down to the second one and
    // swap each slot with a randomly chosen slot at or before it.
    let slot = shuffled.length;
    while (slot > 1) {
        slot -= 1;
        const pick = Math.floor(Math.random() * (slot + 1));
        const held = shuffled[slot];
        shuffled[slot] = shuffled[pick];
        shuffled[pick] = held;
    }

    return shuffled.join(':');
}
360export {361 createEvasiveBrowser,362 bypassChallenge,363 makeEvasiveRequest,364 fingerprintGenerator,365 fingerprintInjector366};
src/upwork-challenge-integrator.js
/**
 * Upwork Challenge Integrator
 * Connects the challenge bypass module with the main scraper
 */
6import { KeyValueStore } from 'crawlee';7import fs from 'fs/promises';8import path from 'path';9import { fileURLToPath } from 'url';10import cheerio from 'cheerio';11import cloudscraper from './utils/cloudscraper-replacement.js';12import { Actor } from 'apify';13
14import {15 createEvasiveBrowser,16 bypassChallenge,17 makeEvasiveRequest,18 fingerprintGenerator19} from './upwork-challenge-bypass.js';20
21import {22 applyAdvancedFingerprinting,23 applyHeaderFingerprinting,24 applyTLSFingerprinting25} from './fingerprint-enhancement.js';26
// ES modules have no built-in __dirname; derive it from this module's URL
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Working directories, located one level above this module's directory
// (created on demand by ensureDirectoriesExist)
const [SCREENSHOTS_DIR, PROFILES_DIR] = ['screenshots', 'browser_profiles']
    .map((folder) => path.join(__dirname, '..', folder));
/**
 * Creates the screenshot and browser-profile directories (including missing
 * parents); directories that already exist are left as they are.
 * @returns {Promise<void>}
 */
async function ensureDirectoriesExist() {
    for (const directory of [SCREENSHOTS_DIR, PROFILES_DIR]) {
        await fs.mkdir(directory, { recursive: true });
    }
}
/**
 * Enhanced browser session setup with challenge bypass.
 *
 * First probes Upwork with a plain CloudScraper request. When that probe
 * succeeds no browser is launched; otherwise an evasive Puppeteer browser is
 * created, fingerprinting helpers are applied, the Upwork homepage is visited
 * and the resulting cookies are written to disk for later reuse.
 *
 * @param {Object} options Session configuration
 * @param {string} options.sessionId Identifier used for cookie/profile paths
 * @param {string} [options.proxyUrl] Proxy passed to the evasive browser
 * @param {boolean} [options.headless=false] Headless flag for the browser
 * @param {Object} [options.input={}] Actor input (only `timezone` is read here)
 * @param {Object[]} [options.browserProfiles=[]] Candidate profiles
 *   (`timezone`, `platform`, `locale`, `userAgent` are read when present)
 * @returns {Promise<Object>} Session descriptor. Always carries
 *   `needsAdvancedEvasion`; failures carry `error` plus either
 *   `connectionFailed` or `setupFailed` instead of throwing.
 */
async function createEnhancedSession(options = {}) {
    const {
        sessionId,
        proxyUrl,
        headless = false,
        input = {},
        browserProfiles = [],
    } = options;

    await ensureDirectoriesExist();

    // Session storage paths
    const cookiesPath = path.join(PROFILES_DIR, `${sessionId}_cookies.json`);
    const profileDir = path.join(PROFILES_DIR, `profile_${sessionId}`);

    // Check if we have stored cookies
    let hasStoredCookies = false;
    try {
        await fs.access(cookiesPath);
        hasStoredCookies = true;
        console.log(`Found cookies for session ${sessionId}`);
    } catch (error) {
        console.log(`No cookies found for session ${sessionId}, collecting new cookies...`);
    }

    // Select a random browser profile for fingerprinting diversity
    // (undefined when the list is empty; every read below tolerates that).
    const randomProfile = browserProfiles[Math.floor(Math.random() * browserProfiles.length)];

    // Apply timezone from profile, then input, then a random default
    const timezone = randomProfile?.timezone
        || input.timezone
        || ['America/New_York', 'America/Los_Angeles', 'Europe/London'][Math.floor(Math.random() * 3)];

    console.log(`Using profile timezone: ${timezone}`);

    // Configure fingerprint options based on the profile
    const profileIsMac = randomProfile?.platform?.includes('Mac');
    const fingerprintOptions = {
        browsers: [
            { name: profileIsMac ? 'safari' : 'chrome', minVersion: 90 },
        ],
        operatingSystems: [profileIsMac ? 'macos' : 'windows'],
        devices: ['desktop'],
        locales: [randomProfile?.locale || 'en-US'],
    };

    // Use the profile's user agent if available
    const userAgent = randomProfile?.userAgent || null;

    // Kept outside the try so the failure handler can release the browser.
    let browser = null;

    try {
        // Try direct request first to check if challenge bypass is needed
        console.log(`Attempting to use CloudScraper to bypass Cloudflare protection...`);

        let directRequestSucceeded = false;
        try {
            const result = await cloudscraper.get('https://www.upwork.com');

            // Some clients report HTTP failures through the resolved value
            // instead of throwing; treat those as a failed probe as well.
            if (result?.error != null || Number(result?.statusCode) >= 400) {
                throw new Error(`Direct request rejected with status ${result.statusCode}`);
            }

            console.log(`CloudScraper successful with status: ${result.statusCode}`);
            directRequestSucceeded = true;
        } catch (error) {
            console.log(`CloudScraper failed to bypass protection, falling back to puppeteer`);
        }

        if (directRequestSucceeded) {
            // Direct request works, so no evasive browser is launched.
            // TODO: the simpler browser setup is still a placeholder.
            return {
                sessionId,
                needsAdvancedEvasion: false,
            };
        }

        // Create advanced evasive browser
        console.log(`Using real browser profile at: ${profileDir}`);
        const evasive = await createEvasiveBrowser({
            proxy: proxyUrl,
            headless,
            profileDir,
            cookiesPath: hasStoredCookies ? cookiesPath : null,
            fingerprintOptions,
            userAgent,
        });
        browser = evasive.browser;
        const { fingerprint } = evasive;

        // Set up the page with advanced evasion
        const page = await browser.newPage();

        // Apply bypass techniques to page
        await bypassChallenge(page, fingerprint);

        const fingerprintIsMac = fingerprint.device.platform.includes('Mac');
        const browserFamily = fingerprintIsMac ? 'safari' : 'chrome';

        // Apply our enhanced fingerprinting techniques
        console.log('Applying enhanced fingerprint protection...');
        await applyAdvancedFingerprinting(page, {
            deviceProfile: browserFamily,
            webglNoise: true,
            audioNoise: true,
            fontConsistency: true,
            hideTempStorage: true,
            consistentTimezone: true,
            userAgent: fingerprint.userAgent,
            timezone,
        });

        // Apply TLS fingerprinting. Awaited so that a rejection (if the helper
        // is async) lands in the handler below; harmless if it is synchronous.
        // NOTE(review): non-Mac fingerprints get the 'chrome_mac' profile -
        // preserved from the original, confirm this is intended.
        await applyTLSFingerprinting({
            randomize: true,
            profile: fingerprintIsMac ? 'safari' : 'chrome_mac',
        });

        // Apply header fingerprinting
        await applyHeaderFingerprinting(page, {
            userAgent: fingerprint.userAgent,
            locale: fingerprint.navigator.language,
            browser: browserFamily,
            randomizeOrder: true,
        });

        // Visit Upwork homepage to get cookies
        console.log('Visiting Upwork homepage...');
        try {
            await page.goto('https://www.upwork.com', {
                waitUntil: 'networkidle2',
                timeout: 60000,
            });

            // Check if we're on a challenge page
            const isChallengePage = await page.evaluate(() => {
                return document.title.includes('Challenge')
                    || document.querySelector('form[id*="challenge"]') !== null;
            });

            if (isChallengePage) {
                console.log('Still on challenge page, may need manual intervention');
                // Take screenshot for debugging
                await page.screenshot({
                    path: path.join(SCREENSHOTS_DIR, `challenge_${sessionId}.png`),
                });

                // Hand the live browser back with the challenge flag set
                return {
                    sessionId,
                    browser,
                    page,
                    userAgent: fingerprint.userAgent,
                    proxy: proxyUrl,
                    needsAdvancedEvasion: true,
                    onChallengePage: true,
                };
            }

            console.log('Successfully bypassed challenge page!');

            // Store cookies for future use
            const cookies = await page.cookies();
            await fs.writeFile(cookiesPath, JSON.stringify(cookies, null, 2));

            // Return session with successful bypass
            return {
                sessionId,
                browser,
                page,
                userAgent: fingerprint.userAgent,
                cookies,
                proxy: proxyUrl,
                needsAdvancedEvasion: true,
                onChallengePage: false,
            };
        } catch (error) {
            console.log(`Error during cookie collection for session ${sessionId}: ${error}`);

            // Release the browser without letting a close failure hide the
            // navigation error that is reported to the caller.
            await cleanupSession({ browser });
            browser = null;

            return {
                sessionId,
                error: error.message,
                needsAdvancedEvasion: true,
                connectionFailed: true,
            };
        }
    } catch (error) {
        console.error(`Error creating enhanced session: ${error.message}`);

        // A browser may already be running if setup failed after launch.
        if (browser) {
            await cleanupSession({ browser });
        }

        return {
            sessionId,
            error: error.message,
            needsAdvancedEvasion: true,
            setupFailed: true,
        };
    }
}
/**
 * Capture a JPEG screenshot of the page and store it in the Apify key-value
 * store.
 *
 * @param {Object} page Page object exposing `screenshot()`
 * @param {string} filename Prefix for the record key (a timestamp is appended)
 * @returns {Promise<string|null>} API URL of the stored record, or `null` when
 *   capturing or storing fails (the failure is logged, never thrown)
 */
const takeScreenshot = async (page, filename) => {
    const captureOptions = {
        type: 'jpeg',
        quality: 50,
        fullPage: false,
    };

    try {
        const imageBuffer = await page.screenshot(captureOptions);

        // Timestamp suffix keeps successive captures from overwriting each other.
        const recordKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(recordKey, imageBuffer, { contentType: 'image/jpeg' });

        const { defaultKeyValueStoreId } = Actor.getEnv();
        const recordUrl = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${recordKey}`;
        console.log(`📸 Screenshot saved: ${recordUrl}`);
        return recordUrl;
    } catch (captureError) {
        console.error(`❌ Failed to take screenshot: ${captureError.message}`);
        return null;
    }
};
/**
 * Resolve after `ms` milliseconds.
 * Used instead of `page.waitForTimeout`, which is not available in the
 * Puppeteer major version declared in package.json.
 */
function waitMilliseconds(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

/** Report whether the page title or DOM looks like a challenge interstitial. */
function pageShowsChallenge(page) {
    return page.evaluate(() => {
        return document.title.includes('Challenge')
            || document.querySelector('form[id*="challenge"]') !== null;
    });
}

/** True when a resolved direct response still reports an HTTP failure. */
function directResponseFailed(result) {
    return result?.error != null || Number(result?.statusCode) >= 400;
}

/** Load a few common third-party resources in a separate browser context. */
async function loadBackgroundResources(browser) {
    const resources = [
        'https://www.google.com/favicon.ico',
        'https://fonts.googleapis.com/css?family=Roboto',
        'https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js',
    ];

    // Newer Puppeteer releases expose createBrowserContext(); older ones only
    // have createIncognitoBrowserContext(). Support both.
    const context = typeof browser.createBrowserContext === 'function'
        ? await browser.createBrowserContext()
        : await browser.createIncognitoBrowserContext();

    try {
        const resourcePage = await context.newPage();

        for (const resource of resources) {
            // Best effort: a resource that fails to load is deliberately ignored.
            await resourcePage.goto(resource, { waitUntil: 'domcontentloaded' }).catch(() => {});
            await waitMilliseconds(1000);
        }

        await resourcePage.close();
    } finally {
        // Always release the context, even when a step above throws.
        await context.close();
    }
}

/**
 * Enhanced page navigation with challenge bypass.
 *
 * Tries, in order: a CloudScraper pre-fetch (when enabled through
 * `input.useCloudScraper`), an evasive direct request after a 403 (when the
 * session needs advanced evasion), and finally a full browser navigation with
 * a challenge-bypass attempt on a 403 response.
 *
 * @param {Object} options Navigation options
 * @param {Object} options.session Session returned by the session factory
 * @param {string} options.url Target URL
 * @param {Object} [options.input={}] Actor input (`useCloudScraper` is read)
 * @param {boolean} [options.attemptDirectRequest=true] Allow direct requests
 * @param {Function} [options.parseJobsFunction] Receives a cheerio root and
 *   returns the parsed jobs; when omitted the raw HTML is returned as `content`
 * @returns {Promise<Object>} Result with `success` plus either `jobs`/`content`
 *   and `statusCode`, or `error` and a hint flag (`needsBrowserReset`,
 *   `needsSessionRotation`, `needsRetry`). Never throws.
 */
async function enhancedPageNavigation(options = {}) {
    const {
        session,
        url,
        input = {},
        attemptDirectRequest = true,
        parseJobsFunction,
    } = options;

    if (!session || !url) {
        return { success: false, error: 'Missing session or URL' };
    }

    const { page, browser, userAgent, cookies, proxy, needsAdvancedEvasion } = session;
    const hasParser = typeof parseJobsFunction === 'function';

    // Shape a successful result: parsed jobs when a parser exists, raw HTML otherwise.
    const buildSuccess = (method, html, statusCode) => {
        if (hasParser) {
            const jobs = parseJobsFunction(cheerio.load(html));
            return { success: true, method, jobs, statusCode };
        }
        return { success: true, method, content: html, statusCode };
    };

    try {
        // Try direct request methods first if enabled
        if (attemptDirectRequest && input.useCloudScraper) {
            console.log(`Attempting CloudScraper pre-fetch for ${url}...`);

            try {
                const result = await cloudscraper.get(url);

                // A client that resolves with an error status instead of
                // throwing must not be reported as a successful fetch.
                if (directResponseFailed(result)) {
                    throw Object.assign(
                        new Error(result.error || `Request failed with status ${result.statusCode}`),
                        { statusCode: result.statusCode },
                    );
                }

                console.log(`CloudScraper successful with status ${result.statusCode}`);
                return buildSuccess('cloudscraper', result.body, result.statusCode);
            } catch (error) {
                const statusCode = error.statusCode || (error.response && error.response.statusCode);
                console.log(`CloudScraper got status ${statusCode}`);

                // If CloudScraper fails with 403, try evasive direct request
                if (needsAdvancedEvasion && statusCode === 403) {
                    console.log(`Using user agent: ${userAgent}`);

                    try {
                        const evasiveResult = await makeEvasiveRequest(url, {
                            proxy,
                            cookies: cookies ? cookies.map((c) => `${c.name}=${c.value}`).join('; ') : '',
                            fingerprint: fingerprintGenerator.getFingerprint(),
                        });

                        if (!evasiveResult.needsBrowser && evasiveResult.success) {
                            console.log(`Evasive request successful with status ${evasiveResult.status}`);
                            return buildSuccess('evasive-request', evasiveResult.data, evasiveResult.status);
                        }
                    } catch (directError) {
                        console.log(`Evasive request failed: ${directError.message}`);
                    }
                }
            }
        }

        // If direct methods failed or not enabled, use browser
        if (!page || !browser) {
            return {
                success: false,
                error: 'Browser or page not available',
                needsBrowserReset: true,
            };
        }

        // Add random delay (15-44 s) to prevent patterns
        const delaySeconds = Math.floor(Math.random() * 30) + 15;
        console.log(`Waiting ${delaySeconds} seconds before the next request...`);
        await waitMilliseconds(delaySeconds * 1000);

        // Re-apply header randomization before each navigation
        await applyHeaderFingerprinting(page, {
            userAgent,
            browser: session.fingerprintConfig?.browserProfile || 'safari',
            randomizeOrder: true,
        });

        // Visit the page with full browser
        const response = await page.goto(url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });

        // goto() may resolve with null (e.g. same-document navigation).
        const responseStatus = response?.status() ?? null;

        if (responseStatus === 403) {
            // Debug screenshot is best effort: failing to write it must not
            // prevent the bypass attempt below.
            try {
                await page.screenshot({
                    path: path.join(SCREENSHOTS_DIR, `error_${Date.now()}.png`),
                });
                console.log(`Error screenshot saved for status ${responseStatus}`);
            } catch (screenshotError) {
                console.log(`Could not save error screenshot: ${screenshotError.message}`);
            }

            // Try to bypass the challenge page
            await bypassChallenge(page, { userAgent });

            // Check if we're still on a challenge page
            const isChallengePage = await pageShowsChallenge(page);

            if (isChallengePage) {
                console.log('Detected challenge page, about to take screenshot...');
                const screenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_enhanced_nav');
                if (screenshotUrl) {
                    console.log(`Cloudflare challenge screenshot: ${screenshotUrl}`);
                } else {
                    console.log('Screenshot failed or not available.');
                }

                // Try to extract data even from challenge page if possible
                if (hasParser) {
                    const challengeHtml = await page.content();
                    const jobs = parseJobsFunction(cheerio.load(challengeHtml));

                    if (jobs && jobs.length > 0) {
                        console.log(`Found ${jobs.length} jobs even on challenge page`);
                        return {
                            success: true,
                            method: 'browser-challenge-page',
                            jobs,
                            statusCode: 403,
                            isChallengePage: true,
                        };
                    }
                }

                // Simulate human-like behavior for potential bypass
                console.log('Simulating additional resource loading for natural network patterns...');
                try {
                    await loadBackgroundResources(browser);
                    console.log('Additional resources loaded for natural network pattern');
                } catch (resourceError) {
                    console.log(`Error loading additional resources: ${resourceError.message}`);
                }

                if (hasParser) {
                    console.log('Fallback parser: about to take screenshot before returning result...');
                    const fallbackScreenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_fallback');
                    if (fallbackScreenshotUrl) {
                        console.log(`Fallback parser screenshot: ${fallbackScreenshotUrl}`);
                    } else {
                        console.log('Fallback parser screenshot failed or not available.');
                    }
                }

                return {
                    success: false,
                    error: `Request blocked - received 403 status code.`,
                    isChallengePage: true,
                    statusCode: 403,
                    needsSessionRotation: true,
                };
            }

            // If we got past the challenge, continue processing
            console.log('Successfully bypassed challenge page after retry!');
        }

        // Process successful page response
        const pageContent = await page.content();

        if (hasParser) {
            const jobs = parseJobsFunction(cheerio.load(pageContent));

            // Screenshot taken on every parsed browser result for debugging.
            console.log('Fallback parser: about to take screenshot before returning result...');
            const fallbackScreenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_fallback');
            if (fallbackScreenshotUrl) {
                console.log(`Fallback parser screenshot: ${fallbackScreenshotUrl}`);
            } else {
                console.log('Fallback parser screenshot failed or not available.');
            }

            return {
                success: true,
                method: 'browser',
                jobs,
                statusCode: responseStatus,
            };
        }

        return {
            success: true,
            method: 'browser',
            content: pageContent,
            statusCode: responseStatus,
        };
    } catch (error) {
        console.error(`Error navigating to ${url}: ${error.message}`);
        return {
            success: false,
            error: error.message,
            needsRetry: true,
        };
    }
}
/**
 * Release the browser held by a session, if there is one.
 * Errors raised while closing are logged and not propagated.
 *
 * @param {Object} session Session to clean up (may be null/undefined)
 * @returns {Promise<void>}
 */
async function cleanupSession(session) {
    // Nothing to release when the session never got a browser.
    if (!session?.browser) {
        return;
    }

    try {
        await session.browser.close();
    } catch (closeError) {
        console.log(`Error closing browser: ${closeError.message}`);
    }
}
523export {524 createEnhancedSession,525 enhancedPageNavigation,526 cleanupSession,527};
src/utils/cloudscraper-replacement.js
/**
 * Cloudscraper replacement using axios.
 * Handles requests with an API similar to the original cloudscraper.
 */

6import axios from 'axios';7import { HttpsProxyAgent } from 'https-proxy-agent';8
/**
 * Turn raw `Set-Cookie` header values into cookie descriptors.
 * Only the leading `name=value` pair of each header is used; attributes such
 * as `Path` or `Expires` are not interpreted.
 *
 * @param {string[]|string|undefined} setCookie Raw header value(s)
 * @param {string} url Request URL, used for the cookie domain
 * @returns {{key: string, value: string, domain: string, path: string}[]}
 */
function parseSetCookieHeaders(setCookie, url) {
    if (!setCookie) {
        return [];
    }

    const rawCookies = Array.isArray(setCookie) ? setCookie : [setCookie];
    const domain = new URL(url).hostname;

    return rawCookies
        .map((raw) => {
            const [pair] = String(raw).split(';');
            const separatorIndex = pair.indexOf('=');
            if (separatorIndex === -1) {
                return null;
            }
            return {
                key: pair.slice(0, separatorIndex).trim(),
                value: pair.slice(separatorIndex + 1).trim(),
                domain,
                path: '/',
            };
        })
        .filter((cookie) => cookie !== null && cookie.key !== '');
}

/**
 * Split cloudscraper-style options into a URL, a request body and an axios config.
 *
 * @param {string|Object} options URL or options object
 * @returns {{url: string, data: *, config: Object}}
 */
function toAxiosRequest(options) {
    if (typeof options === 'string') {
        return { url: options, data: {}, config: {} };
    }

    // Options that only exist in the cloudscraper API are removed here so
    // they do not leak into the axios config.
    const { url, form, body, jar, proxy, ...config } = options;

    // Handle proxy
    if (proxy) {
        config.httpsAgent = new HttpsProxyAgent(proxy);
        // The agent does the tunnelling; axios's own proxy handling expects
        // an object and must be switched off when a custom agent is supplied.
        config.proxy = false;
    }

    // Handle jar (cookies)
    if (jar === true) {
        config.withCredentials = true;
    }

    // `form` means a URL-encoded body in the cloudscraper API.
    let data = form || body || {};
    if (form && typeof form === 'object' && !(form instanceof URLSearchParams)) {
        data = new URLSearchParams(form);
    }

    return { url, data, config };
}

/**
 * Shared implementation behind `get` and `post`.
 *
 * @param {'get'|'post'} method HTTP method
 * @param {string|Object} options URL or options object
 * @param {Function} [callback] Legacy `(error, response, body)` callback
 * @returns {Promise<Object>} Cloudscraper-style result; HTTP and network
 *   failures are reported through `statusCode`/`error`, not by rejecting
 */
async function sendRequest(method, options, callback) {
    const { url, data, config } = toAxiosRequest(options);

    let result;
    let callbackResponse;

    try {
        const response = method === 'post'
            ? await axios.post(url, data, config)
            : await axios.get(url, config);

        // Format response to match cloudscraper's expected format
        result = {
            body: response.data,
            statusCode: response.status,
            request: {
                uri: { href: url },
                jar: {
                    getCookies: () => parseSetCookieHeaders(response.headers['set-cookie'], url),
                },
            },
            headers: response.headers,
        };

        // Legacy callers read `statusCode`/`body`; keep the axios fields too.
        callbackResponse = { ...response, statusCode: response.status, body: response.data };
    } catch (error) {
        const label = method === 'post' ? 'POST request' : 'Request';
        console.log(`${label} to ${url} failed: ${error.message}`);

        // Create a synthetic response for errors
        const statusCode = error.response?.status || 500;
        const responseData = error.response?.data || `<html><body><h1>Error ${statusCode}</h1></body></html>`;

        // Format as cloudscraper response
        result = {
            body: responseData,
            statusCode,
            request: {
                uri: { href: url },
                jar: {
                    getCookies: () => [],
                },
            },
            headers: error.response?.headers || {},
            error: error.message,
        };
        callbackResponse = result;
    }

    // Invoked outside the try block so an exception thrown by the callback is
    // not mistaken for a request failure (which would call it a second time).
    if (callback) {
        callback(null, callbackResponse, result.body);
    }

    return result;
}

/**
 * Make a GET request with axios
 * @param {string|Object} options URL or options object
 * @param {Function} callback Optional callback for legacy API compatibility
 * @returns {Promise<Object>} Response data
 */
export async function get(options, callback) {
    return sendRequest('get', options, callback);
}

/**
 * Make a request with axios (GET, same as `get`)
 * @param {Object} options Request options
 * @param {Function} callback Optional callback
 * @returns {Promise<Object>} Response data
 */
const cloudscraperReplacement = async function (options, callback) {
    return get(options, callback);
};

// Add method handlers for compatibility
cloudscraperReplacement.get = get;

/**
 * Make a POST request with axios
 * @param {string|Object} options URL or options object (`form` or `body` is sent)
 * @param {Function} callback Optional callback for legacy API compatibility
 * @returns {Promise<Object>} Response data
 */
cloudscraperReplacement.post = async function (options, callback) {
    return sendRequest('post', options, callback);
};

export default cloudscraperReplacement;