Upwork Job Scraper avatar
Upwork Job Scraper

Under maintenance

Pricing

$29.00/month + usage

Go to Store
Upwork Job Scraper

Upwork Job Scraper

Under maintenance

Developed by

Runtime

Runtime

Maintained by Community

Upwork Job Scraper is an Apify actor that extracts job listings from Upwork based on keywords. It outputs structured data (title, budget, client info) in JSON/CSV for easy analysis.

5.0 (1)

Pricing

$29.00/month + usage

1

Total users

19

Monthly users

12

Runs succeeded

94%

Last modified

4 hours ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

input.json

{
"searchQuery": "javascript",
"page": 1,
"useApifyProxy": false,
"maxConcurrency": 1,
"maxRetries": 3,
"useEnhancedCrawler": true,
"minDelayBetweenRequests": 5,
"maxDelayBetweenRequests": 10,
"sessionRotationCount": 2,
"useCloudScraper": true,
"useFallbackHtmlParser": true
}

package.json

{
"name": "crawlee-puppeteer-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"2captcha": "^3.0.7",
"apify": "^3.1.4",
"axios": "^1.6.2",
"crawlee": "^3.5.4",
"fingerprint-generator": "^2.1.66",
"fingerprint-injector": "^2.1.66",
"https-proxy-agent": "^7.0.6",
"node-fetch": "^3.3.2",
"proxy-chain": "^2.3.0",
"puppeteer": "^22.8.2",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-recaptcha": "^3.6.8",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-extra-plugin-user-data-dir": "^2.4.1",
"puppeteer-extra-plugin-user-preferences": "^2.4.1",
"random-useragent": "^0.5.0",
"uuid": "^9.0.1"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
"pull": "apify pull",
"push": "apify push",
"commit": "git add . && git commit -m \"Update\" && git push"
},
"author": "It's not you it's me",
"license": "ISC"
}

start_xvfb_and_run_cmd.sh

Download

.actor/Dockerfile

# Dockerfile
FROM apify/actor-node-puppeteer-chrome:20
RUN npm ls crawlee apify puppeteer playwright
COPY --chown=myuser package*.json ./
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& npm install puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-recaptcha random-useragent \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
COPY --chown=myuser . ./
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/README.md

1

.actor/actor.json

{
"actorSpecification": 1,
"name": "upwork-job-scraper",
"title": "Project Puppeteer Crawler JavaScript",
"description": "Crawlee and Puppeteer project in JavaScript.",
"version": "0.10",
"meta": {
"templateId": "js-crawlee-puppeteer-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Upwork Job Search",
"type": "object",
"schemaVersion": 1,
"properties": {
"searchQuery": {
"title": "Search Query",
"type": "string",
"description": "Keywords to search for on Upwork (e.g., job title, skills)",
"editor": "textfield",
"default": ""
},
"page": {
"title": "Page Number",
"type": "integer",
"description": "The page number to scrape on Upwork",
"editor": "number",
"default": 1
},
"useEnhancedCrawler": {
"title": "Use Enhanced Anti-Detection System",
"type": "boolean",
"description": "Use the advanced Upwork challenge bypass system (recommended for 403 errors)",
"editor": "checkbox",
"default": true
},
"tlsFingerprintEvasion": {
"title": "Enable TLS Fingerprint Evasion",
"type": "boolean",
"description": "Use advanced TLS fingerprinting evasion to bypass Upwork's security (requires Enhanced Crawler)",
"editor": "checkbox",
"default": true
},
"customHeaderOrder": {
"title": "Randomize HTTP Header Order",
"type": "boolean",
"description": "Randomize HTTP header order to evade fingerprinting detection (requires Enhanced Crawler)",
"editor": "checkbox",
"default": true
},
"advancedFingerprinting": {
"title": "Use Advanced Fingerprint Evasion",
"type": "boolean",
"description": "Apply comprehensive browser fingerprint masking techniques (Canvas, WebGL, Audio, etc.)",
"editor": "checkbox",
"default": true
},
"fingerprintConsistency": {
"title": "Maintain Consistent Fingerprints",
"type": "boolean",
"description": "Keep browser fingerprints consistent within sessions (recommended)",
"editor": "checkbox",
"default": true
},
"fingerprintBrowserProfile": {
"title": "Browser Fingerprint Profile",
"type": "string",
"description": "Browser profile to emulate for fingerprint generation",
"editor": "select",
"default": "auto-rotate",
"enum": ["auto-rotate", "chrome", "safari", "firefox"],
"enumTitles": ["Auto Rotate", "Chrome", "Safari", "Firefox"]
},
"useApifyProxy": {
"title": "Use Apify Proxy",
"type": "boolean",
"description": "Whether to use Apify Proxy",
"editor": "checkbox",
"default": true,
"prefill": true
},
"maxConcurrency": {
"title": "Max Concurrency",
"type": "integer",
"description": "Maximum number of concurrent requests (lower is safer)",
"editor": "number",
"default": 1,
"minimum": 1,
"maximum": 10
},
"maxRetries": {
"title": "Max Retries",
"type": "integer",
"description": "Maximum number of retries per request",
"editor": "number",
"default": 3,
"minimum": 1,
"maximum": 10
},
"upworkUsername": {
"title": "Upwork Username",
"type": "string",
"description": "Optional: Your Upwork username for authenticated access (recommended to bypass 403 errors)",
"editor": "textfield"
},
"upworkPassword": {
"title": "Upwork Password",
"type": "string",
"description": "Optional: Your Upwork password for authenticated access",
"editor": "textfield",
"isSecret": true
},
"captchaApiKey": {
"title": "2Captcha API Key",
"type": "string",
"description": "Optional: Your 2Captcha API key to solve CAPTCHAs automatically (https://2captcha.com/)",
"editor": "textfield",
"isSecret": true
},
"sessionRotationCount": {
"title": "Session Rotation Count",
"type": "integer",
"description": "Number of browser sessions to rotate through (helps avoid detection)",
"editor": "number",
"default": 3,
"minimum": 1,
"maximum": 10
},
"minDelayBetweenRequests": {
"title": "Min Delay Between Requests (seconds)",
"type": "integer",
"description": "Minimum delay in seconds between requests (higher values reduce blocking)",
"editor": "number",
"default": 15,
"minimum": 5,
"maximum": 120
},
"maxDelayBetweenRequests": {
"title": "Max Delay Between Requests (seconds)",
"type": "integer",
"description": "Maximum delay in seconds between requests (higher values reduce blocking)",
"editor": "number",
"default": 45,
"minimum": 10,
"maximum": 180
},
"sessionCooldownMinutes": {
"title": "Session Cooldown Minutes",
"type": "integer",
"description": "Time in minutes to rest a session after multiple uses",
"editor": "number",
"default": 30,
"minimum": 5,
"maximum": 120
},
"proxyRotationEnabled": {
"title": "Enable Proxy Rotation",
"type": "boolean",
"description": "Whether to rotate proxies during the session (helps prevent IP-based blocking)",
"editor": "checkbox",
"default": true
},
"proxyRotationRequests": {
"title": "Requests Per Proxy",
"type": "integer",
"description": "Number of requests before rotating to a new proxy",
"editor": "number",
"default": 3,
"minimum": 1,
"maximum": 10
},
"simulateBrowserHistory": {
"title": "Simulate Browser History",
"type": "boolean",
"description": "Whether to simulate browser history and cache for a more authentic profile",
"editor": "checkbox",
"default": true
},
"randomizeTimezone": {
"title": "Randomize Browser Timezone",
"type": "boolean",
"description": "Whether to randomize timezone and locale settings per session",
"editor": "checkbox",
"default": true
},
"useNewHeadless": {
"title": "Use New Headless Mode",
"type": "boolean",
"description": "Use Chrome's new headless mode which is less detectable than the old mode",
"editor": "checkbox",
"default": true
},
"useRealProfiles": {
"title": "Use Real Browser Profiles",
"type": "boolean",
"description": "Use real Chrome profiles with history and extensions (better anti-detection)",
"editor": "checkbox",
"default": true
},
"useCloudScraper": {
"title": "Use CloudScraper",
"type": "boolean",
"description": "Use CloudScraper to bypass Cloudflare protection (fallback method)",
"editor": "checkbox",
"default": true
},
"simulateExtensions": {
"title": "Simulate Browser Extensions",
"type": "boolean",
"description": "Simulate common browser extensions to appear more like a real user",
"editor": "checkbox",
"default": true
},
"useFallbackHtmlParser": {
"title": "Use Direct HTML Parser Fallback",
"type": "boolean",
"description": "When browser approach fails, try to extract data directly from HTML (last resort)",
"editor": "checkbox",
"default": true
},
"disableWebSecurity": {
"title": "Disable Web Security",
"type": "boolean",
"description": "Disable browser web security features (CORS, etc.) to bypass some protections",
"editor": "checkbox",
"default": false
},
"bypassCSP": {
"title": "Bypass Content Security Policy",
"type": "boolean",
"description": "Bypass website Content Security Policy restrictions",
"editor": "checkbox",
"default": false
},
"proxyConfiguration": {
"title": "Proxy configuration",
"type": "object",
"description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
"prefill": {
"useApifyProxy": true,
"apifyProxyGroups": []
},
"default": {},
"editor": "proxy"
}
}
}

network_analysis/README.md

1# Upwork Scraper Network Traffic Analysis
2
3This directory contains tools to analyze and improve network traffic patterns for the Upwork job scraper, helping to bypass 403 errors and anti-bot detection.
4
5## Tools Overview
6
71. **capture_traffic.sh** - Script for capturing and analyzing network traffic using TCPDump
82. **apply_findings.js** - Script to modify the scraper code based on network analysis findings
9
10## Prerequisites
11
12- TCPDump (`brew install tcpdump` on macOS)
13- Node.js 14+ for the apply_findings.js script
14- Fingerprinting packages (`npm install fingerprint-generator fingerprint-injector`)
15
16## Getting Started
17
18### Step 1: Capture Traffic
19
20The capture_traffic.sh script allows you to capture and compare network traffic from:
21- A successful manual browser session
22- A failing automated scraper session
23
24```bash
25# Make the script executable
26chmod +x capture_traffic.sh
27
28# Capture a successful manual browser session (use your normal browser)
29./capture_traffic.sh --manual
30
31# Capture a failing automated session (run the scraper in another terminal)
32./capture_traffic.sh --auto
33
34# Compare the two captures to identify differences
35./capture_traffic.sh --compare
36```
37
38### Step 2: Apply Fixes
39
40After analyzing the network traffic differences, you can apply fixes to the scraper:
41
42```bash
43# Install required dependencies
44npm install fingerprint-generator fingerprint-injector
45
46# Run the script to apply fixes based on traffic analysis
47node apply_findings.js
48```
49
50## What to Look For
51
52When comparing successful vs. failing traffic:
53
541. **TLS Fingerprinting** - Look for differences in TLS handshakes, cipher suites, and extensions
552. **HTTP Headers** - Check order, values, and presence/absence of specific headers
563. **Browser Fingerprinting** - JavaScript challenges, canvas checks, WebGL rendering
574. **Request Timing** - Natural patterns vs. automated timing
58
59## Technical Details
60
61### Browser Fingerprinting
62
63The fingerprint-generator package creates realistic browser fingerprints that match real browser behavior:
64
65```javascript
66const fingerprint = fingerprintGenerator.getFingerprint({
67 browserName: 'chrome',
68 browserVersion: 123,
69 operatingSystem: 'macos',
70 operatingSystemVersion: '10.15.7',
71 deviceCategory: 'desktop',
72 locale: 'en-US'
73});
74```
75
76Then fingerprint-injector applies this fingerprint to Puppeteer:
77
78```javascript
79await fingerprintInjector.attachFingerprintToPuppeteer(page, fingerprint);
80```
81
82### HTTP Headers
83
84The order and values of HTTP headers can be used to identify bots. The analysis will help identify the correct header ordering used by real browsers:
85
86```
87Host: www.upwork.com
88Connection: keep-alive
89sec-ch-ua: "Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"
90sec-ch-ua-mobile: ?0
91sec-ch-ua-platform: "macOS"
92Upgrade-Insecure-Requests: 1
93User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36
94Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
95Sec-Fetch-Site: none
96Sec-Fetch-Mode: navigate
97Sec-Fetch-User: ?1
98Sec-Fetch-Dest: document
99Accept-Encoding: gzip, deflate, br
100Accept-Language: en-US,en;q=0.9
101```
102
103### JavaScript Fingerprinting Protection
104
105The analysis will also help identify and mitigate JavaScript-based fingerprinting techniques used by Upwork:
106
107- Canvas fingerprinting - Adding subtle noise to canvas rendering
108- WebGL fingerprinting - Spoofing vendor and renderer information
109- Audio fingerprinting - Adding minor variations to audio data
110- Navigator property checks - Fixing browser detection properties
111- DOM property access patterns - Matching real browser behavior
112
113## Troubleshooting
114
115If you're still getting 403 errors after applying fixes:
116
1171. Try different network interfaces or mobile tethering
1182. Use higher quality proxies (mobile/4G proxies work better)
1193. Increase the capture time to get more data
1204. Try different browsers for the manual capture (Safari, Firefox)
1215. Use tcpdump's deeper analysis with `-v` or `-vv` flags
122
123## Advanced Usage
124
125For more detailed packet analysis:
126
127```bash
128# Analyze SSL/TLS handshake in detail
129sudo tcpdump -i en0 -nn -s0 -vvv "host upwork.com and tcp port 443" -w detailed_capture.pcap
130
131# Extract all HTTP headers
132tcpdump -A -s 0 'tcp port 443 and (((ip[2:2] - ((ip[0]&0xf)<<2)) - ((tcp[12]&0xf0)>>2)) != 0)' -r capture.pcap
133```
134
135## Additional Techniques to Try
136
137If the fingerprinting techniques don't fully resolve the 403 errors, consider:
138
1391. **Mobile User Agents** - Try using mobile user agents which often face less scrutiny
1402. **4G/5G Mobile Proxies** - These have cleaner IP reputation than data center proxies
1413. **Modify request timing** - Add more natural, human-like timing between navigation steps
1424. **Implement Chrome Extensions API** - More fully simulate popular browser extensions
1435. **HTTP/3 Support** - Implement HTTP/3 (QUIC) protocol support to bypass some fingerprinting

network_analysis/analyze_and_fix.sh

Download

network_analysis/apply_findings.js

1/**
2 * This script applies network fingerprinting fixes to the Upwork scraper
3 * based on TCPDump analysis findings
4 */
5
6import fs from 'fs';
7import path from 'path';
8import { fileURLToPath } from 'url';
9
// Resolve everything relative to this script so it can be launched from any working directory.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const MAIN_JS_PATH = path.join(__dirname, '..', 'src', 'main.js');

// Abort early when the scraper entry point is missing - there is nothing to patch.
const mainJsExists = fs.existsSync(MAIN_JS_PATH);
if (!mainJsExists) {
    console.error(`Error: ${MAIN_JS_PATH} not found`);
    process.exit(1);
}

// Current source text of main.js; kept untouched so it can be compared and backed up later.
let mainJsContent = fs.readFileSync(MAIN_JS_PATH, { encoding: 'utf8' });
21
/**
 * Patches the source text of main.js so it uses fingerprint-generator and
 * fingerprint-injector.
 *
 * Three patches are applied by plain string substitution, each anchored on a
 * literal marker expected in main.js (a missing marker makes that substitution
 * a no-op):
 *   1. module imports plus generator/injector initialisation,
 *   2. a `generateFingerprint` helper wired into session creation,
 *   3. fingerprint injection right after the extra HTTP headers are set.
 *
 * Every patch is guarded by a check for text that the patch itself inserts, so
 * running the function on already-patched content returns it unchanged.
 *
 * @param {string} mainJsContent Source text of main.js.
 * @returns {string} The patched source text.
 */
function applyFingerprintFixes(mainJsContent) {
    console.log('Applying fingerprinting fixes...');

    // Work on a local copy instead of reassigning the parameter.
    // All replacements use a function replacer so "$" sequences in the
    // inserted code are never interpreted as replacement patterns.
    let content = mainJsContent;

    // Patch 1: imports and generator/injector initialisation
    if (!content.includes('fingerprint-generator') || !content.includes('fingerprint-injector')) {
        console.log('Adding fingerprinting modules...');

        // Add imports
        const urlImport = 'import { fileURLToPath } from \'url\';';
        content = content.replace(
            urlImport,
            () => `${urlImport}\nimport { FingerprintGenerator } from 'fingerprint-generator';\nimport { FingerprintInjector } from 'fingerprint-injector';`,
        );

        // NOTE(review): the option names below are passed through as written in the
        // original script - confirm them against the fingerprint-generator documentation.
        const fingerprintGeneratorCode = `
// Initialize fingerprint generator for more accurate browser emulation
const fingerprintGenerator = new FingerprintGenerator({
    browsers: [
        { name: 'chrome', minVersion: 100, maxVersion: 123 },
        { name: 'firefox', minVersion: 100, maxVersion: 123 },
        { name: 'safari', minVersion: 15, maxVersion: 17 }
    ],
    devices: ['desktop'],
    operatingSystems: ['macos', 'windows'],
    locales: ['en-US', 'en-GB', 'de-DE'],
    // Use consistent fingerprints per session for less detection
    cache: true
});

// Fingerprint injector for applying fingerprints to browser
const fingerprintInjector = new FingerprintInjector();`;

        // Insert the initialisation just before the session generation section
        const sessionsAnchor = '// Generate new sessions if needed';
        content = content.replace(
            sessionsAnchor,
            () => `${fingerprintGeneratorCode}\n\n${sessionsAnchor}`,
        );

        console.log('Added fingerprint generator and injector');
    }

    // Patch 2: fingerprint generation during session initialisation.
    // The second check matches text this patch inserts; the legacy check alone
    // never matched the inserted code, so re-runs duplicated the helper.
    if (!content.includes('generateFingerprint()') && !content.includes('const generateFingerprint')) {
        console.log('Adding fingerprint generation to sessions...');

        const generateFingerprintCode = `
// Generate consistent fingerprint for a session
const generateFingerprint = (sessionId, profile) => {
    const userAgent = profile?.userAgent ?? '';
    const lowerUserAgent = userAgent.toLowerCase();

    // Chrome user agents also contain "Safari", so Chrome must be tested before Safari
    const browserName = lowerUserAgent.includes('firefox') ? 'firefox'
        : lowerUserAgent.includes('chrome') ? 'chrome'
        : lowerUserAgent.includes('safari') ? 'safari'
        : 'chrome';

    const osName = profile?.platform === 'MacIntel' ? 'macos' : 'windows';

    const locale = profile?.locale || 'en-US';

    // Major version taken from the user agent, or a random plausible one when it cannot be parsed
    const versionMatch = userAgent.match(/Chrome\\/(\\d+)/)
        || userAgent.match(/Firefox\\/(\\d+)/)
        || userAgent.match(/Version\\/(\\d+)/);
    const browserVersion = versionMatch
        ? parseInt(versionMatch[1], 10)
        : Math.floor(Math.random() * 20) + 100;

    return fingerprintGenerator.getFingerprint({
        browserName,
        browserVersion,
        operatingSystem: osName,
        operatingSystemVersion: osName === 'macos' ? '10.15.7' : '10.0',
        deviceCategory: 'desktop',
        locale
    });
};`;

        // Insert the helper just before the browsing-history section
        const historyAnchor = '// Generate fake browsing history for a more authentic browser profile';
        content = content.replace(
            historyAnchor,
            () => `${generateFingerprintCode}\n\n${historyAnchor}`,
        );

        // NOTE(review): this targets the FIRST "return {" in the file and assumes it is
        // the session factory with `i` and `profile` in scope - verify against main.js.
        content = content.replace(
            'return {',
            () => 'const fingerprint = generateFingerprint(`session_${Date.now()}_${i}`, profile);\n    return {',
        );

        content = content.replace(
            'id: `session_${Date.now()}_${i}`,',
            () => 'id: `session_${Date.now()}_${i}`,\n        fingerprint,',
        );

        console.log('Added fingerprint generation to sessions');
    }

    // Patch 3: fingerprint injection in preNavigationHooks.
    // As above, the second check matches text the patch actually inserts.
    if (!content.includes('injectFingerprint') && !content.includes('attachFingerprintToPuppeteer')) {
        console.log('Adding fingerprint injection to navigation...');

        // Only patch when the expected preNavigationHooks section is present
        const navHooksRegex = /preNavigationHooks: \[\s*\/\/ Hook pour ajouter des délais aléatoires entre requêtes et configurer le navigateur\s*async \(\{ page, request, session \}\) => \{([\s\S]*?)\},\s*\],/;
        const headersAnchor = 'await page.setExtraHTTPHeaders(headers);';

        if (!navHooksRegex.test(content)) {
            console.log('Warning: Could not find preNavigationHooks section');
        } else if (!content.includes(headersAnchor)) {
            // Previously this case was reported as a success although nothing was inserted
            console.log('Warning: Could not find setExtraHTTPHeaders call to attach fingerprint injection to');
        } else {
            // The inserted code presumes `sessionIndex`, `sessions` and `sessionId`
            // exist in the hook's scope - TODO confirm against main.js.
            const fingerPrintInjectionCode = `
    // Apply fingerprint from session
    if (sessionIndex !== -1 && sessions[sessionIndex].fingerprint) {
        console.log(\`Injecting fingerprint for session \${sessionId}...\`);
        try {
            await fingerprintInjector.attachFingerprintToPuppeteer(page, sessions[sessionIndex].fingerprint);
            console.log('Fingerprint successfully injected');
        } catch (error) {
            console.error('Error injecting fingerprint:', error.message);
        }
    }`;

            // Insert after headers are set
            content = content.replace(
                headersAnchor,
                () => `${headersAnchor}\n${fingerPrintInjectionCode}`,
            );

            console.log('Added fingerprint injection to navigation hooks');
        }
    }

    return content;
}
149
/**
 * Replaces the `const headerOrder = [...];` declaration in main.js with an
 * ordering that keeps the primary request headers in a fixed order and
 * shuffles only the secondary ones.
 *
 * The previous generated code called `.sort()` with a random comparator. Such a
 * comparator is inconsistent, so the resulting order is implementation-defined,
 * and it could also move the primary headers its own comment promised to keep.
 *
 * The patch is skipped when the marker comment it inserts is already present,
 * or when no `headerOrder` declaration is found.
 *
 * @param {string} mainJsContent Source text of main.js.
 * @returns {string} The patched source text (unchanged when skipped).
 */
function enhanceHeaderOrdering(mainJsContent) {
    console.log('Enhancing HTTP header ordering...');

    // The generated code below contains this marker, which makes re-runs a no-op
    if (mainJsContent.includes('naturalistic header ordering')) {
        console.log('HTTP header enhancements already exist, skipping...');
        return mainJsContent;
    }

    // Find the headerOrder array in the code
    const headerOrderRegex = /const headerOrder = \[([\s\S]*?)\];/;

    if (!headerOrderRegex.test(mainJsContent)) {
        console.log('Warning: Could not find header ordering code to enhance');
        return mainJsContent;
    }

    const improvedHeaderOrder = `const headerOrder = [
    // More naturalistic header ordering based on TCPDump analysis
    // Primary headers always come first in real browsers, so their order is fixed
    'Host',
    'Connection',
    'Cache-Control',
    'sec-ch-ua',
    'sec-ch-ua-mobile',
    'sec-ch-ua-platform',
    'Upgrade-Insecure-Requests',
    'User-Agent',
    'Accept',
    'Accept-Encoding',
    'Accept-Language',
    // Secondary headers follow the primary ones in a shuffled order
    ...(() => {
        const secondaryHeaders = [
            'Sec-Fetch-Site',
            'Sec-Fetch-Mode',
            'Sec-Fetch-User',
            'Sec-Fetch-Dest',
            'Referer',
            'Cookie',
            'DNT'
        ];
        // Fisher-Yates shuffle: well-defined, unlike sorting with a random comparator
        for (let i = secondaryHeaders.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [secondaryHeaders[i], secondaryHeaders[j]] = [secondaryHeaders[j], secondaryHeaders[i]];
        }
        return secondaryHeaders;
    })()
];`;

    // Function replacer: the inserted code is taken literally, never as a "$" pattern
    const patched = mainJsContent.replace(headerOrderRegex, () => improvedHeaderOrder);

    console.log('Enhanced header ordering pattern');

    return patched;
}
205
/**
 * Replaces the first `await page.evaluateOnNewDocument(() => { ... });` call in
 * main.js with a block that masks common browser fingerprinting surfaces
 * (navigator properties, window.chrome, permissions, WebGL, canvas, audio).
 *
 * Fixes over the previous version:
 *   - the "already applied" check is case-insensitive, so it matches the
 *     capitalised marker comment that the patch inserts (re-runs used to
 *     re-apply the patch and corrupt the file);
 *   - the generated permissions wrapper calls the original `query` with its
 *     receiver, since an unbound call throws "Illegal invocation";
 *   - canvas wrappers are installed once on the prototypes instead of inside
 *     `getContext`, where each call stacked another wrapper;
 *   - zero-sized canvases are skipped, because `getImageData` throws on them.
 *
 * NOTE(review): the regex stops at the first `});` after the call starts, so an
 * existing callback containing a nested `});` would be replaced only partially -
 * confirm the shape of the call in main.js before running.
 *
 * @param {string} mainJsContent Source text of main.js.
 * @returns {string} The patched source text (unchanged when skipped).
 */
function enhanceBrowserFingerprinting(mainJsContent) {
    console.log('Enhancing browser fingerprinting protection...');

    // Case-insensitive so the inserted "Enhanced fingerprinting protection" marker is recognised
    if (/enhanced fingerprinting protection/i.test(mainJsContent)) {
        console.log('Enhanced fingerprinting protection already exists, skipping...');
        return mainJsContent;
    }

    // Find the page.evaluateOnNewDocument section for fingerprinting
    const fingerprintRegex = /await page\.evaluateOnNewDocument\(\(\) => \{([\s\S]*?)\}\);/;

    if (!fingerprintRegex.test(mainJsContent)) {
        console.log('Warning: Could not find fingerprinting code to enhance');
        return mainJsContent;
    }

    const enhancedFingerprinting = `await page.evaluateOnNewDocument(() => {
    // Enhanced fingerprinting protection based on TCPDump analysis

    // Hide automation fingerprints
    delete Object.getPrototypeOf(navigator).webdriver;

    // Override platform info with more accurate values
    Object.defineProperty(navigator, 'platform', {
        get: () => 'MacIntel',
    });

    // Override hardware concurrency
    Object.defineProperty(navigator, 'hardwareConcurrency', {
        get: () => 8,
    });

    // Override device memory
    Object.defineProperty(navigator, 'deviceMemory', {
        get: () => 8,
    });

    // Improve Chrome properties emulation
    if (typeof window !== 'undefined') {
        window.chrome = {
            app: {
                isInstalled: false,
                InstallState: { DISABLED: 'disabled', INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
                RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' }
            },
            runtime: {
                OnInstalledReason: { CHROME_UPDATE: 'chrome_update', INSTALL: 'install', SHARED_MODULE_UPDATE: 'shared_module_update', UPDATE: 'update' },
                OnRestartRequiredReason: { APP_UPDATE: 'app_update', OS_UPDATE: 'os_update', PERIODIC: 'periodic' },
                PlatformArch: { ARM: 'arm', ARM64: 'arm64', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
                PlatformNaclArch: { ARM: 'arm', MIPS: 'mips', MIPS64: 'mips64', X86_32: 'x86-32', X86_64: 'x86-64' },
                PlatformOs: { ANDROID: 'android', CROS: 'cros', LINUX: 'linux', MAC: 'mac', OPENBSD: 'openbsd', WIN: 'win' },
                RequestUpdateCheckStatus: { NO_UPDATE: 'no_update', THROTTLED: 'throttled', UPDATE_AVAILABLE: 'update_available' }
            }
        };
    }

    // Fix permissions behavior
    const permissions = window.navigator.permissions;
    const originalQuery = permissions?.query;
    if (originalQuery) {
        permissions.query = (parameters) => {
            if (parameters.name === 'notifications') {
                return Promise.resolve({ state: Notification.permission });
            }
            if (parameters.name === 'clipboard-read' || parameters.name === 'clipboard-write') {
                return Promise.resolve({ state: 'prompt' });
            }
            // Keep the Permissions object as receiver; an unbound call throws "Illegal invocation"
            return originalQuery.call(permissions, parameters);
        };
    }

    // Fix WebGL fingerprinting - more accurate values from real browsers
    if (window.WebGLRenderingContext) {
        const getParameter = WebGLRenderingContext.prototype.getParameter;
        WebGLRenderingContext.prototype.getParameter = function (parameter) {
            // UNMASKED_VENDOR_WEBGL
            if (parameter === 37445) {
                return 'Intel Inc.';
            }
            // UNMASKED_RENDERER_WEBGL
            if (parameter === 37446) {
                return 'Intel Iris Pro OpenGL Engine';
            }
            return getParameter.call(this, parameter);
        };
    }

    // Fix canvas fingerprinting - add subtle noise.
    // Both wrappers are installed exactly once on the prototypes; installing them
    // inside getContext() would stack another wrapper on every call.
    if (window.CanvasRenderingContext2D) {
        const originalFillText = CanvasRenderingContext2D.prototype.fillText;
        CanvasRenderingContext2D.prototype.fillText = function (text, ...rest) {
            let renderedText = text;
            // Occasionally append zero-width spaces to some characters
            if (Math.random() < 0.2 && typeof text === 'string' && text) {
                renderedText = text.split('').map((c) => (Math.random() < 0.05 ? c + String.fromCharCode(8203) : c)).join('');
            }
            return originalFillText.call(this, renderedText, ...rest);
        };
    }

    const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
    HTMLCanvasElement.prototype.toDataURL = function (...args) {
        // Only add noise for likely fingerprinting scenarios (small canvases)
        const looksLikeFingerprinting = (this.width === 16 && this.height === 16)
            || (this.width <= 2 && this.height <= 2)
            || (this.width <= 200 && this.height <= 50);

        // getImageData throws on zero-sized canvases, so skip them
        if (looksLikeFingerprinting && this.width > 0 && this.height > 0) {
            const context = this.getContext('2d');
            if (context) {
                // Add minimal noise that's invisible to humans
                const imageData = context.getImageData(0, 0, this.width, this.height);
                const data = imageData.data;

                // Touch roughly 1% of the pixels (at least one) with +/-1 variations
                const pixelsToModify = Math.max(1, Math.floor(this.width * this.height * 0.01));
                for (let i = 0; i < pixelsToModify; i++) {
                    const pixelIndex = Math.floor(Math.random() * data.length / 4) * 4;
                    data[pixelIndex] = Math.min(255, Math.max(0, data[pixelIndex] + (Math.random() < 0.5 ? -1 : 1)));
                    data[pixelIndex + 1] = Math.min(255, Math.max(0, data[pixelIndex + 1] + (Math.random() < 0.5 ? -1 : 1)));
                    data[pixelIndex + 2] = Math.min(255, Math.max(0, data[pixelIndex + 2] + (Math.random() < 0.5 ? -1 : 1)));
                }

                context.putImageData(imageData, 0, 0);
            }
        }
        return originalToDataURL.apply(this, args);
    };

    // Fix audio fingerprinting
    const audioContext = window.AudioContext || window.webkitAudioContext;
    if (audioContext) {
        const originalGetChannelData = AudioBuffer.prototype.getChannelData;
        AudioBuffer.prototype.getChannelData = function (...args) {
            const channelData = originalGetChannelData.apply(this, args);
            // Only add noise for very short buffers (likely fingerprinting)
            if (this.length < 100) {
                const noise = 0.0001; // Very subtle noise
                // Only modify a few random samples
                const samplesToModify = Math.max(1, Math.floor(channelData.length * 0.001));
                for (let i = 0; i < samplesToModify; i++) {
                    const index = Math.floor(Math.random() * channelData.length);
                    channelData[index] += (Math.random() * 2 - 1) * noise;
                }
            }
            return channelData;
        };
    }
});`;

    // Function replacer: the inserted code is taken literally, never as a "$" pattern
    const patched = mainJsContent.replace(fingerprintRegex, () => enhancedFingerprinting);

    console.log('Enhanced browser fingerprinting protection');

    return patched;
}
370
// Run every patch in order, feeding each one the output of the previous step
const patchSteps = [applyFingerprintFixes, enhanceHeaderOrdering, enhanceBrowserFingerprinting];
let updatedContent = mainJsContent;
for (const applyPatch of patchSteps) {
    updatedContent = applyPatch(updatedContent);
}

// Touch the file system only when at least one patch changed the source
if (updatedContent === mainJsContent) {
    console.log('No changes were made to the file');
} else {
    // Keep a timestamped copy of the untouched file next to main.js
    const backupPath = `${MAIN_JS_PATH}.bak.${Date.now()}`;
    fs.writeFileSync(backupPath, mainJsContent);
    console.log(`Original file backed up to: ${backupPath}`);

    // Overwrite main.js with the patched source
    fs.writeFileSync(MAIN_JS_PATH, updatedContent);
    console.log(`Successfully updated ${MAIN_JS_PATH} with fingerprinting fixes`);
}

// Closing hints for the operator
const nextSteps = [
    '\nNext steps:',
    '1. Run the capture_traffic.sh script to collect network data',
    '2. Use the data to further refine the fingerprinting in this script',
    '3. Re-run this script to apply improved fingerprinting settings',
];
for (const step of nextSteps) {
    console.log(step);
}

network_analysis/capture_traffic.sh

Download

src/enhanced-crawler.js

1/**
2 * Enhanced Upwork Crawler
3 * Uses advanced challenge bypass techniques
4 */
5
6import { Actor } from 'apify';
7import { Dataset } from 'crawlee';
8import fs from 'fs/promises';
9import path from 'path';
10import { fileURLToPath } from 'url';
11import { parseJobsFromHTML } from './main.js';
12import {
13 createEnhancedSession,
14 enhancedPageNavigation,
15 cleanupSession
16} from './upwork-challenge-integrator.js';
17import {
18 applySessionFingerprinting,
19 createFingerprintRotationStrategy
20} from './fingerprint-integration.js';
21
22const __dirname = path.dirname(fileURLToPath(import.meta.url));
23
24// Store active sessions
25const activeSessions = {};
26const sessionStats = {};
27
/**
 * Create the identifiers for a rotating pool of sessions and register a
 * zeroed statistics record for each one in the module-level `sessionStats`.
 *
 * Only `sessionCount` is read from the options; any other option passed in
 * is ignored by this function.
 *
 * @param {Object} options Configuration options
 * @param {number} [options.sessionCount=3] Number of sessions to create
 * @returns {Promise<string[]>} Session IDs in creation order
 */
async function initializeSessionPool(options = {}) {
    const { sessionCount = 3 } = options;

    console.log(`Creating ${sessionCount} new sessions for rotation`);

    // Fresh counters for a session that has not handled any request yet
    const createEmptyStats = () => ({
        requestCount: 0,
        successCount: 0,
        failureCount: 0,
        lastUsed: null,
        createdAt: Date.now(),
    });

    const pool = [];
    let position = 1;
    while (position <= sessionCount) {
        // IDs combine the creation timestamp with the 1-based position in the pool
        const id = `session_${Date.now()}_${position}`;
        pool.push(id);
        sessionStats[id] = createEmptyStats();
        position += 1;
    }

    return pool;
}
61
/**
 * Pick the session that should serve the next request.
 *
 * Candidates are ranked by, in order:
 *   1. never-used sessions first,
 *   2. sessions outside their cooldown window before sessions inside it,
 *   3. sessions below `maxRequests` before sessions at or above it,
 *   4. least recently used first.
 * Sessions without a record in `sessionStats` count as never used.
 *
 * @param {Array} sessionIds Array of session IDs (not mutated)
 * @param {Object} options Configuration options
 * @param {number} [options.cooldownMinutes=30] Cooldown window after a use
 * @param {number} [options.maxRequests=3] Request budget per session
 * @returns {string} Next session ID to use (undefined for an empty list)
 */
function getNextSession(sessionIds, options = {}) {
    const { cooldownMinutes = 30, maxRequests = 3 } = options;

    const currentTime = Date.now();
    const cooldownWindow = cooldownMinutes * 60 * 1000;

    // Reduce a session's stats to the facts the ranking needs.
    const rankingFacts = (id) => {
        const stats = sessionStats[id] || { requestCount: 0, lastUsed: null };
        return {
            neverUsed: stats.lastUsed === null,
            coolingDown: Boolean(stats.lastUsed && (currentTime - stats.lastUsed < cooldownWindow)),
            underBudget: stats.requestCount < maxRequests,
            overBudget: stats.requestCount >= maxRequests,
            lastUsed: stats.lastUsed || 0,
        };
    };

    const byPriority = (leftId, rightId) => {
        const left = rankingFacts(leftId);
        const right = rankingFacts(rightId);

        if (left.neverUsed !== right.neverUsed) {
            return left.neverUsed ? -1 : 1;
        }
        if (left.coolingDown !== right.coolingDown) {
            return left.coolingDown ? 1 : -1;
        }
        if (left.underBudget && right.overBudget) {
            return -1;
        }
        if (left.overBudget && right.underBudget) {
            return 1;
        }
        // Tie on everything else: oldest use wins.
        return left.lastUsed - right.lastUsed;
    };

    // Sort a copy so the caller's pool keeps its order.
    const [best] = [...sessionIds].sort(byPriority);
    return best;
}
108
/**
 * Record the outcome of one request against a session.
 *
 * Creates the session's record in `sessionStats` on first use, bumps the
 * request counter, stamps `lastUsed` with the current time and increments
 * either the success or the failure counter depending on `result.success`.
 *
 * @param {string} sessionId Session ID
 * @param {Object} result Result of the request
 */
function updateSessionStats(sessionId, result = {}) {
    const record = sessionStats[sessionId] || (sessionStats[sessionId] = {
        requestCount: 0,
        successCount: 0,
        failureCount: 0,
        lastUsed: null,
        createdAt: Date.now(),
    });

    record.requestCount += 1;
    record.lastUsed = Date.now();

    // Anything that is not an explicit success is counted as a failure.
    const outcomeCounter = result.success ? 'successCount' : 'failureCount';
    record[outcomeCounter] += 1;
}
134
// Grab a JPEG screenshot of the current viewport, store it through
// Actor.setValue under "<filename>-<timestamp>.jpeg" and return the API URL
// of the stored record. Any failure is logged and reported as null.
const takeScreenshot = async (page, filename) => {
    const captureOptions = { type: "jpeg", quality: 50, fullPage: false };

    try {
        const imageBuffer = await page.screenshot(captureOptions);

        // Timestamp suffix keeps repeated captures from overwriting each other.
        const recordKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(recordKey, imageBuffer, { contentType: "image/jpeg" });

        const { defaultKeyValueStoreId } = Actor.getEnv();
        const recordUrl = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${recordKey}`;
        console.log(`📸 Screenshot saved: ${recordUrl}`);
        return recordUrl;
    } catch (error) {
        console.error(`❌ Failed to take screenshot: ${error.message}`);
        return null;
    }
};
155
/**
 * Resolve after the given number of seconds.
 * @param {number} seconds Delay length in seconds
 * @returns {Promise<void>}
 */
function sleepSeconds(seconds) {
    return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}

/**
 * Uniform random integer in the inclusive range [min, max].
 * Reversed bounds are swapped instead of producing a negative range.
 * @param {number} min Lower bound
 * @param {number} max Upper bound
 * @returns {number}
 */
function randomIntInclusive(min, max) {
    const low = Math.min(min, max);
    const high = Math.max(min, max);
    return Math.floor(Math.random() * (high - low + 1)) + low;
}

/**
 * Build the Upwork job-search URL for a query and a 1-based page number.
 * @param {string} searchQuery Search keywords (blank means "all jobs")
 * @param {number} page Page number; page 1 carries no `page` parameter
 * @returns {string}
 */
function buildSearchUrl(searchQuery, page) {
    if (searchQuery && searchQuery.trim() !== '') {
        return `https://www.upwork.com/nx/search/jobs/?q=${encodeURIComponent(searchQuery)}${page > 1 ? `&page=${page}` : ''}`;
    }
    return `https://www.upwork.com/nx/search/jobs/${page > 1 ? `?page=${page}` : ''}`;
}

/**
 * Store a screenshot and the HTML of a challenge page for later inspection.
 * Best effort: a failed capture is logged and never interrupts the crawl.
 * @param {Object} page Browser page currently showing the challenge
 * @returns {Promise<void>}
 */
async function captureChallengeArtifacts(page) {
    try {
        const screenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_enhanced');
        const html = await page.content();
        // An explicit content type stores the markup as HTML rather than as a JSON string.
        await Actor.setValue(`cloudflare_challenge_enhanced_${Date.now()}.html`, html, {
            contentType: 'text/html; charset=utf-8',
        });
        if (screenshotUrl) {
            console.log(`Cloudflare challenge screenshot: ${screenshotUrl}`);
        }
    } catch (error) {
        console.error(`Failed to capture challenge page artifacts: ${error.message}`);
    }
}

/**
 * Clean up every session still registered in `activeSessions` and empty the
 * registry. A failing cleanup is logged so the remaining sessions still close.
 * @returns {Promise<void>}
 */
async function closeAllSessions() {
    for (const sessionId of Object.keys(activeSessions)) {
        try {
            await cleanupSession(activeSessions[sessionId]);
        } catch (error) {
            console.error(`Failed to clean up session ${sessionId}: ${error.message}`);
        }
        delete activeSessions[sessionId];
    }
}

/**
 * Crawl one search-result page with the next session from the pool.
 *
 * @param {Object} context
 * @param {number} context.page 1-based page number to fetch
 * @param {string[]} context.sessionIds Session pool (mutated when a broken session is replaced)
 * @param {string} context.searchQuery Search keywords
 * @param {Object} context.input Actor input
 * @param {Object|null} context.proxyConfiguration Proxy configuration, if any
 * @param {Array} context.browserProfiles Browser profiles forwarded to session creation
 * @returns {Promise<boolean>} true when the same page should be attempted again
 */
async function crawlSearchPage({ page, sessionIds, searchQuery, input, proxyConfiguration, browserProfiles }) {
    const sessionId = getNextSession(sessionIds, {
        cooldownMinutes: input.sessionCooldownMinutes || 30,
        maxRequests: input.proxyRotationRequests || 3,
    });

    console.log(`Using session ${sessionId} for this crawl`);

    // Get or create session
    let session = activeSessions[sessionId];
    if (!session) {
        let proxyUrl = null;
        if (proxyConfiguration) {
            proxyUrl = await proxyConfiguration.newUrl(sessionId);
        }

        session = await createEnhancedSession({
            sessionId,
            proxyUrl,
            headless: !input.headful,
            input,
            browserProfiles,
        });

        // Fingerprinting is best effort; the session stays usable if it fails.
        if (session.page && !session.error) {
            try {
                const fingerprintConfig = await applySessionFingerprinting(session.page, sessionId);
                console.log(`Applied consistent fingerprinting for session ${sessionId} using profile: ${fingerprintConfig.browserProfile}`);
                session.fingerprintConfig = fingerprintConfig;
            } catch (err) {
                console.log(`Error applying fingerprinting: ${err.message}`);
            }
        }

        activeSessions[sessionId] = session;
    }

    if (session.error || session.connectionFailed || session.setupFailed) {
        console.log(`Session ${sessionId} has issues: ${session.error || 'Failed setup'}`);

        await cleanupSession(session);
        delete activeSessions[sessionId];

        // Retire the broken ID: it has no usage record, so it would otherwise
        // keep winning the "never used" priority and be picked again.
        const brokenIndex = sessionIds.indexOf(sessionId);
        if (brokenIndex !== -1) {
            sessionIds.splice(brokenIndex, 1);
        }

        const newSessionId = `session_${Date.now()}_replacement`;
        sessionIds.push(newSessionId);
        console.log(`Created replacement session ${newSessionId}`);

        // Nothing was fetched, so ask the caller to retry this page.
        return true;
    }

    const url = buildSearchUrl(searchQuery, page);

    const result = await enhancedPageNavigation({
        session,
        url,
        input,
        parseJobsFunction: parseJobsFromHTML,
    });

    updateSessionStats(sessionId, result);

    // Capture evidence first: every branch below may close the browser.
    if (result.isChallengePage && session.page) {
        await captureChallengeArtifacts(session.page);
    }

    // Guarantees the session is cleaned up at most once in this iteration.
    let sessionReleased = false;
    const releaseSession = async () => {
        if (sessionReleased) {
            return;
        }
        sessionReleased = true;
        await cleanupSession(session);
        delete activeSessions[sessionId];
    };

    let retryRequested = false;

    if (result.success && result.jobs && result.jobs.length > 0) {
        console.log(`Successfully extracted ${result.jobs.length} jobs from page ${page} using ${result.method}`);

        await Dataset.pushData(result.jobs);

        // Polite pause, uniformly distributed between the configured bounds.
        const delaySeconds = randomIntInclusive(
            input.minDelayBetweenRequests || 15,
            input.maxDelayBetweenRequests || 45,
        );

        console.log(`Waiting ${delaySeconds} seconds before the next request...`);
        await sleepSeconds(delaySeconds);
    } else {
        console.log(`Failed to extract jobs from page ${page}: ${result.error || 'Unknown error'}`);

        if (result.needsSessionRotation || result.needsBrowserReset) {
            await releaseSession();
        }

        if (result.needsSessionRotation) {
            const delaySeconds = randomIntInclusive(30, 59);
            console.log(`Session rotation needed. Waiting ${delaySeconds} seconds before retrying...`);
            await sleepSeconds(delaySeconds);
            retryRequested = true;
        }

        // Extra back-off after any failure.
        await sleepSeconds(10);
    }

    // Check if we need to rotate proxy
    if (proxyConfiguration && input.proxyRotationEnabled &&
        sessionStats[sessionId].requestCount >= (input.proxyRotationRequests || 3)) {
        console.log(`Rotating proxy for session ${sessionId}`);

        if (!sessionReleased && session.browser) {
            sessionReleased = true;
            await cleanupSession(session);
        }

        // NOTE(review): the URL is requested for the same session ID, which
        // presumably yields the same proxy, and the logged URL may embed proxy
        // credentials. Confirm both before relying on this rotation.
        const proxyUrl = await proxyConfiguration.newUrl(sessionId);
        console.log(`Rotating proxy for session ${sessionId} to: ${proxyUrl}`);

        // Dropping the session forces a fresh one on its next use.
        delete activeSessions[sessionId];
    }

    return retryRequested;
}

/**
 * Main crawler function.
 *
 * Walks search-result pages 1..maxPages, rotating through a pool of browser
 * sessions, and pushes every extracted job to the default dataset. A page is
 * retried (with a fresh session) when navigation reports
 * `needsSessionRotation` or when the chosen session turns out to be broken,
 * at most `input.maxRetries` times per page. Errors are logged rather than
 * thrown, and all open sessions are cleaned up before the function returns.
 *
 * Keys read from `input`: sessionRotationCount, sessionCooldownMinutes,
 * proxyRotationRequests, proxyRotationEnabled, headful,
 * minDelayBetweenRequests, maxDelayBetweenRequests, maxRetries.
 *
 * @param {Object} options Crawler options
 * @param {string} [options.searchQuery='javascript'] Search keywords
 * @param {number} [options.maxPages=5] Number of result pages to crawl
 * @param {Object} [options.input={}] Actor input
 * @param {Object|null} [options.proxyConfiguration=null] Proxy configuration exposing `newUrl(sessionId)`
 * @param {Array} [options.browserProfiles=[]] Browser profiles forwarded to session creation
 * @returns {Promise<void>}
 */
async function crawlUpworkJobs(options = {}) {
    const {
        searchQuery = 'javascript',
        maxPages = 5,
        input = {},
        proxyConfiguration = null,
        browserProfiles = [],
    } = options;

    // Upper bound on repeated attempts for a single page; prevents an endless
    // loop when every attempt asks for another rotation.
    const maxRetries = Number.isInteger(input.maxRetries) && input.maxRetries >= 0
        ? input.maxRetries
        : 3;

    try {
        const sessionIds = await initializeSessionPool({
            sessionCount: input.sessionRotationCount || 3,
            browserProfiles,
            input,
            proxyConfiguration,
        });

        let page = 1;
        let retries = 0;

        while (page <= maxPages) {
            const retryRequested = await crawlSearchPage({
                page,
                sessionIds,
                searchQuery,
                input,
                proxyConfiguration,
                browserProfiles,
            });

            if (retryRequested && retries < maxRetries) {
                retries += 1;
                console.log(`Retrying page ${page} (retry ${retries} of ${maxRetries})`);
                continue;
            }

            if (retryRequested) {
                console.log(`Giving up on page ${page} after ${maxRetries} retries`);
            }

            retries = 0;
            page += 1;
        }

        console.log('Crawl complete, cleaning up sessions');
    } catch (error) {
        console.error('Error in crawler:', error);
    } finally {
        // Runs on success and on error alike.
        await closeAllSessions();
    }
}
339
340export {
341 crawlUpworkJobs,
342 initializeSessionPool,
343 getNextSession,
344};

src/fingerprint-enhancement.js

1/**
2 * Advanced Browser Fingerprint Enhancement
3 * Specialized techniques to defeat Upwork's fingerprinting detection
4 */
5
6import { Page } from 'puppeteer';
7
/**
 * Apply advanced fingerprint protection to a page.
 *
 * Registers a series of scripts through `page.evaluateOnNewDocument` that
 * patch browser APIs commonly used for fingerprinting (canvas, WebGL, audio,
 * fonts, storage, client rects, hardware hints, navigator properties), and
 * configures the timezone and user agent on the page itself.
 *
 * @param {Page} page Puppeteer page instance
 * @param {Object} options Configuration options
 * @param {string} [options.deviceProfile='modern_desktop'] Profile used for navigator overrides
 * @param {boolean} [options.webglNoise=true] Patch WebGL `getParameter`
 * @param {boolean} [options.audioNoise=true] Add noise to `AudioBuffer.getChannelData`
 * @param {boolean} [options.fontConsistency=true] Report a fixed set of fonts as available
 * @param {boolean} [options.hideTempStorage=true] Shadow local/session storage with in-memory stores
 * @param {boolean} [options.consistentTimezone=true] Emulate `options.timezone`
 * @param {string} [options.userAgent] User agent to set; left untouched when omitted
 * @param {string} [options.timezone='America/New_York'] IANA timezone ID
 * @param {number} [options.hardwareConcurrency=8] Value reported by `navigator.hardwareConcurrency`
 * @param {number} [options.deviceMemory=8] Value reported by `navigator.deviceMemory`
 * @returns {Promise<void>}
 */
export async function applyAdvancedFingerprinting(page, options = {}) {
    const {
        deviceProfile = 'modern_desktop',
        webglNoise = true,
        audioNoise = true,
        fontConsistency = true,
        hideTempStorage = true,
        consistentTimezone = true,
        userAgent,
        timezone = 'America/New_York',
        hardwareConcurrency = 8,
        deviceMemory = 8,
    } = options;

    console.log('Applying enhanced browser fingerprint protection...');

    // 1. Canvas fingerprinting protection
    await page.evaluateOnNewDocument(() => {
        const nativeGetContext = HTMLCanvasElement.prototype.getContext;
        const nativeToDataURL = HTMLCanvasElement.prototype.toDataURL;
        const nativeGetImageData = CanvasRenderingContext2D.prototype.getImageData;

        // Remember which kind of context each canvas was given so that
        // toDataURL only touches 2D canvases.
        HTMLCanvasElement.prototype.getContext = function (contextType, contextAttributes) {
            const context = nativeGetContext.call(this, contextType, contextAttributes);
            if (context && ['2d', 'webgl', 'webgl2'].includes(contextType) && !this.__modified) {
                this.__modified = true;
                this.__contextType = contextType;
            }
            return context;
        };

        HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
            const isNoisable = this.__modified && this.__contextType === '2d'
                && this.width > 0 && this.height > 0;

            if (isNoisable) {
                try {
                    const context = nativeGetContext.call(this, '2d');
                    const imageData = nativeGetImageData.call(context, 0, 0, this.width, this.height);

                    // Nudge the alpha channel of roughly 0.5% of the pixels by -1, 0 or +1.
                    for (let i = 0; i < imageData.data.length; i += 4) {
                        if (Math.random() < 0.005) {
                            const offset = Math.floor(Math.random() * 3) - 1;
                            imageData.data[i + 3] = Math.max(0, Math.min(255, imageData.data[i + 3] + offset));
                        }
                    }

                    context.putImageData(imageData, 0, 0);
                } catch (error) {
                    // Pixels could not be read; export the canvas unmodified
                    // and let the native call report any error itself.
                }
            }

            return nativeToDataURL.call(this, type, quality);
        };
    });

    // 2. WebGL fingerprinting protection
    if (webglNoise) {
        await page.evaluateOnNewDocument(() => {
            const UNMASKED_VENDOR_WEBGL = 37445;
            const UNMASKED_RENDERER_WEBGL = 37446;
            // Parameters whose array values receive a tiny jitter.
            const jitteredParameters = [2982, 2983, 35978];

            const patchGetParameter = (ContextClass) => {
                const nativeGetParameter = ContextClass.prototype.getParameter;

                ContextClass.prototype.getParameter = function (parameter) {
                    if (parameter === UNMASKED_RENDERER_WEBGL) {
                        return 'Intel Iris OpenGL Engine';
                    }
                    if (parameter === UNMASKED_VENDOR_WEBGL) {
                        return 'Google Inc. (Intel)';
                    }

                    const nativeValue = nativeGetParameter.call(this, parameter);

                    if (jitteredParameters.includes(parameter) && nativeValue && nativeValue.length) {
                        const jittered = new Float32Array(nativeValue);
                        for (let i = 0; i < jittered.length; i++) {
                            if (Math.random() < 0.1) {
                                jittered[i] += (Math.random() * 2 - 1) * 0.0000001;
                            }
                        }
                        return jittered;
                    }

                    return nativeValue;
                };
            };

            // Patch both context flavours so WebGL and WebGL2 report the same values.
            if (typeof WebGLRenderingContext !== 'undefined') {
                patchGetParameter(WebGLRenderingContext);
            }
            if (typeof WebGL2RenderingContext !== 'undefined') {
                patchGetParameter(WebGL2RenderingContext);
            }
        });
    }

    // 3. Audio fingerprinting protection
    if (audioNoise) {
        await page.evaluateOnNewDocument(() => {
            const nativeGetChannelData = AudioBuffer.prototype.getChannelData;

            AudioBuffer.prototype.getChannelData = function (channel) {
                const nativeData = nativeGetChannelData.call(this, channel);

                // Work on a copy so the buffer's own samples stay untouched.
                const audioData = new Float32Array(nativeData);

                for (let i = 0; i < audioData.length; i++) {
                    if (Math.random() < 0.001) {
                        audioData[i] += (Math.random() * 2 - 1) * 0.0001;
                    }
                }

                return audioData;
            };
        });
    }

    // 4. Font consistency
    if (fontConsistency) {
        await page.evaluateOnNewDocument(() => {
            // Fonts that are always reported as available.
            const commonFonts = [
                'Arial', 'Arial Black', 'Arial Narrow', 'Book Antiqua', 'Bookman Old Style',
                'Calibri', 'Cambria', 'Century Gothic', 'Comic Sans MS', 'Consolas',
                'Courier', 'Courier New', 'Georgia', 'Helvetica', 'Impact', 'Lucida Console',
                'Lucida Sans Unicode', 'Microsoft Sans Serif', 'Palatino Linotype', 'Segoe UI',
                'Tahoma', 'Times', 'Times New Roman', 'Trebuchet MS', 'Verdana'
            ];

            // Matches the size token of a CSS font shorthand and captures the
            // family list that follows it, e.g. "bold 12px/1.2 'Times New Roman'".
            const sizeThenFamilies = /(?:\d*\.)?\d+(?:px|pt|pc|em|rem|ex|ch|vh|vw|vmin|vmax|cm|mm|in|q|%)(?:\/\S+)?\s+(.+)$/i;

            const extractFamilies = (font) => {
                const shorthand = String(font);
                const match = sizeThenFamilies.exec(shorthand);
                // Without a numeric size, fall back to the last token.
                const familyList = match ? match[1] : shorthand.split(' ').pop();
                return familyList
                    .split(',')
                    .map((name) => name.trim().replace(/^['"]|['"]$/g, ''))
                    .filter(Boolean);
            };

            if (document.fonts && document.fonts.check) {
                const nativeCheck = document.fonts.check;
                document.fonts.check = function (font, ...rest) {
                    const families = extractFamilies(font);

                    if (families.length > 0 && families.every((name) => commonFonts.includes(name))) {
                        return true;
                    }

                    return nativeCheck.call(this, font, ...rest);
                };
            }
        });
    }

    // 5. Session/localStorage fingerprinting protection
    if (hideTempStorage) {
        await page.evaluateOnNewDocument(() => {
            // Wrap a native Storage in a proxy that keeps all items in memory.
            const buildStorageShim = (nativeStorage) => {
                const items = new Map();
                const methods = {
                    key: (index) => [...items.keys()][index] ?? null,
                    getItem: (key) => (items.has(String(key)) ? items.get(String(key)) : null),
                    setItem: (key, value) => { items.set(String(key), String(value)); },
                    removeItem: (key) => { items.delete(String(key)); },
                    clear: () => { items.clear(); },
                };

                return new Proxy(nativeStorage, {
                    get: (target, prop) => {
                        if (prop === 'length') {
                            return items.size;
                        }
                        if (Object.prototype.hasOwnProperty.call(methods, prop)) {
                            return methods[prop];
                        }
                        if (typeof prop === 'string' && items.has(prop)) {
                            return items.get(prop);
                        }
                        const nativeValue = target[prop];
                        return typeof nativeValue === 'function' ? nativeValue.bind(target) : nativeValue;
                    },
                    set: (target, prop, value) => {
                        items.set(String(prop), String(value));
                        return true;
                    }
                });
            };

            for (const storageType of ['localStorage', 'sessionStorage']) {
                // Read the native object BEFORE redefining the property:
                // reading it inside the new getter would call the getter again.
                let nativeStorage;
                try {
                    nativeStorage = window[storageType];
                } catch (error) {
                    // Reading the storage threw; leave this one untouched.
                    continue;
                }
                if (!nativeStorage) {
                    continue;
                }

                // One shim per document, so written values can be read back.
                const shim = buildStorageShim(nativeStorage);
                Object.defineProperty(window, storageType, {
                    get: () => shim
                });
            }
        });
    }

    // 6. Consistent timezone emulation
    if (consistentTimezone) {
        // Puppeteer's own emulation is applied to Date and Intl together and
        // accepts any valid timezone ID; invalid IDs reject.
        try {
            await page.emulateTimezone(timezone);
        } catch (error) {
            console.log(`Could not emulate timezone "${timezone}": ${error.message}`);
        }
    }

    // 7. Fix client rects fingerprinting
    await page.evaluateOnNewDocument(() => {
        // Add tiny variations to the position of a rect.
        const variateRect = (rect) => {
            if (!rect || typeof rect !== 'object') return rect;

            // Very small variations (< 0.05 px) that shouldn't affect layout
            const variation = () => (Math.random() * 0.1) - 0.05;

            return {
                top: rect.top + variation(),
                right: rect.right + variation(),
                bottom: rect.bottom + variation(),
                left: rect.left + variation(),
                width: rect.width,
                height: rect.height,
                x: rect.x + variation(),
                y: rect.y + variation()
            };
        };

        const nativeGetClientRects = Element.prototype.getClientRects;
        Element.prototype.getClientRects = function () {
            const nativeRects = nativeGetClientRects.call(this);

            // Array-like object holding the modified rects.
            const modifiedRects = {};
            for (let i = 0; i < nativeRects.length; i++) {
                modifiedRects[i] = variateRect(nativeRects[i]);
            }
            modifiedRects.length = nativeRects.length;

            return modifiedRects;
        };

        const nativeGetBoundingClientRect = Element.prototype.getBoundingClientRect;
        Element.prototype.getBoundingClientRect = function () {
            return variateRect(nativeGetBoundingClientRect.call(this));
        };
    });

    // 8. Hardware concurrency and device memory spoofing
    await page.evaluateOnNewDocument((cores, memoryGb) => {
        Object.defineProperty(navigator, 'hardwareConcurrency', {
            get: () => cores
        });

        if ('deviceMemory' in navigator) {
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => memoryGb
            });
        }
    }, hardwareConcurrency, deviceMemory);

    // 9. User agent consistency
    if (userAgent) {
        await page.setUserAgent(userAgent);
    }

    // 10. Navigator property consistency
    // NOTE(review): the 'modern_desktop' profile installs Safari-style values
    // while other profiles install nothing - confirm this is intended.
    await page.evaluateOnNewDocument((device) => {
        if (device === 'modern_desktop') {
            const nav = navigator;
            Object.defineProperty(nav, 'appVersion', { get: () => '5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15' });
            Object.defineProperty(nav, 'platform', { get: () => 'MacIntel' });
            Object.defineProperty(nav, 'userAgentData', { get: () => null }); // Safari doesn't support this
        }
    }, deviceProfile);

    console.log('Enhanced fingerprint protection applied successfully');
}
320
/**
 * Select a browser-like TLS cipher-suite ordering and publish it through the
 * `NODE_TLS_CIPHER_SUITES` environment variable as a colon-separated list.
 *
 * NOTE(review): nothing visible in this module reads that variable back;
 * confirm which consumer is expected to pick it up.
 *
 * @param {Object} options Configuration options
 * @param {boolean} [options.randomize=true] Occasionally swap two ciphers inside a group
 * @param {string} [options.profile='chrome_mac'] One of 'chrome_mac', 'safari', 'firefox';
 *   unknown names fall back to 'chrome_mac'
 * @returns {string[]} The cipher suites in the order that was published
 */
export function applyTLSFingerprinting(options = {}) {
    const { randomize = true, profile = 'chrome_mac' } = options;

    // Cipher-suite orderings per browser family.
    const tlsProfiles = {
        chrome_mac: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'TLS_CHACHA20_POLY1305_SHA256',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES128-GCM-SHA256',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES256-GCM-SHA384',
            'ECDHE-ECDSA-CHACHA20-POLY1305',
            'ECDHE-RSA-CHACHA20-POLY1305',
        ],
        safari: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'TLS_CHACHA20_POLY1305_SHA256',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES128-GCM-SHA256',
        ],
        firefox: [
            'TLS_AES_128_GCM_SHA256',
            'TLS_CHACHA20_POLY1305_SHA256',
            'TLS_AES_256_GCM_SHA384',
            'ECDHE-ECDSA-AES128-GCM-SHA256',
            'ECDHE-RSA-AES128-GCM-SHA256',
            'ECDHE-ECDSA-CHACHA20-POLY1305',
            'ECDHE-RSA-CHACHA20-POLY1305',
            'ECDHE-ECDSA-AES256-GCM-SHA384',
            'ECDHE-RSA-AES256-GCM-SHA384',
        ],
    };

    let cipherSuites = tlsProfiles[profile] || tlsProfiles.chrome_mac;

    if (randomize) {
        // Half-open index ranges of the groups inside which a swap may happen:
        // the first three ciphers, then three pairs. Ciphers never leave their group.
        const groupRanges = [[0, 3], [3, 5], [5, 7], [7, 9]];
        const reordered = [];

        for (const [start, end] of groupRanges) {
            const group = cipherSuites.slice(start, end);

            // 30% chance per multi-cipher group to swap two random positions.
            const wantsSwap = group.length > 1 && Math.random() < 0.3;
            if (wantsSwap) {
                const first = Math.floor(Math.random() * group.length);
                const second = Math.floor(Math.random() * group.length);
                if (first !== second) {
                    const held = group[first];
                    group[first] = group[second];
                    group[second] = held;
                }
            }

            reordered.push(...group);
        }

        cipherSuites = reordered;
    }

    process.env.NODE_TLS_CIPHER_SUITES = cipherSuites.join(':');

    return cipherSuites;
}
415
/**
 * Build a browser-like set of HTTP request headers.
 *
 * The Safari profile uses a shorter `Accept` value and omits the `Sec-Fetch-*`
 * headers; the Firefox profile adds `TE: trailers`. Headers whose value is
 * `undefined` (for example `User-Agent` when none is given) are left out.
 * With `randomizeOrder`, `User-Agent` is moved to one of the first two
 * positions and `Accept` to one of the first three.
 *
 * @param {Object} options Configuration options
 * @param {string} [options.userAgent] Value for the `User-Agent` header
 * @param {string} [options.locale='en-US'] Primary language tag for `Accept-Language`
 * @param {string} [options.browser='safari'] 'safari', 'firefox', or anything else for the base set
 * @param {boolean} [options.randomizeOrder=true] Vary the position of the leading headers
 * @returns {Object} Configured headers
 */
export function generateConsistentHeaders(options = {}) {
    const {
        userAgent,
        locale = 'en-US',
        browser = 'safari',
        randomizeOrder = true
    } = options;

    const isSafari = browser === 'safari';
    const acceptValue = isSafari
        ? 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';

    // Ordered [name, value] pairs; order matters for the final key order.
    const pairs = [
        ['User-Agent', userAgent],
        ['Accept', acceptValue],
        ['Accept-Language', `${locale},en;q=0.9`],
        ['Accept-Encoding', 'gzip, deflate, br'],
        ['Connection', 'keep-alive'],
        ['Upgrade-Insecure-Requests', '1'],
    ];

    if (!isSafari) {
        pairs.push(
            ['Sec-Fetch-Dest', 'document'],
            ['Sec-Fetch-Mode', 'navigate'],
            ['Sec-Fetch-Site', 'none'],
            ['Sec-Fetch-User', '?1'],
        );
    }

    if (browser === 'firefox') {
        pairs.push(['TE', 'trailers']);
    }

    // Drop headers that were never given a value.
    const headers = Object.fromEntries(pairs.filter(([, value]) => value !== undefined));

    if (!randomizeOrder) {
        return headers;
    }

    const names = Object.keys(headers);

    // Move `name` to a random slot among the first `leadingSlots` positions.
    const moveNearFront = (name, leadingSlots) => {
        const currentIndex = names.indexOf(name);
        if (currentIndex === -1) {
            return;
        }
        names.splice(currentIndex, 1);
        const slot = Math.floor(Math.random() * Math.min(leadingSlots, names.length));
        names.splice(slot, 0, name);
    };

    moveNearFront('User-Agent', 2);
    moveNearFront('Accept', 3);

    return Object.fromEntries(names.map((name) => [name, headers[name]]));
}
498
/**
 * Generate a header set with `generateConsistentHeaders` and install it on
 * the page as extra HTTP headers.
 *
 * @param {Object} page Puppeteer page
 * @param {Object} options Configuration options, forwarded unchanged to `generateConsistentHeaders`
 * @returns {Promise<Object>} The headers that were installed
 */
export async function applyHeaderFingerprinting(page, options = {}) {
    const installedHeaders = generateConsistentHeaders(options);

    // From here on the page sends these headers with its requests.
    await page.setExtraHTTPHeaders(installedHeaders);

    return installedHeaders;
}

src/fingerprint-integration.js

1/**
2 * Fingerprint Integration Module
3 * Connects advanced fingerprinting techniques with the existing crawler system
4 */
5
6import { applyAdvancedFingerprinting, applyTLSFingerprinting, applyHeaderFingerprinting } from './fingerprint-enhancement.js';
7import randomUseragent from 'random-useragent';
8
/**
 * Pick a random user agent whose browser family matches the profile.
 * Falls back to an unfiltered random user agent when the profile is unknown
 * or the filtered pick comes back empty.
 * @param {string} browserProfile 'chrome', 'safari' or 'firefox'
 * @returns {string} User agent string
 */
function pickUserAgentForProfile(browserProfile) {
    const browserNames = new Map([
        ['chrome', 'Chrome'],
        ['safari', 'Safari'],
        ['firefox', 'Firefox'],
    ]);
    const wantedBrowser = browserNames.get(browserProfile);
    const matching = wantedBrowser
        ? randomUseragent.getRandom((ua) => ua.browserName === wantedBrowser)
        : null;
    return matching || randomUseragent.getRandom();
}

/**
 * Configures and applies all fingerprinting protections to a browser page:
 * TLS cipher ordering, in-page API protections and HTTP headers.
 *
 * @param {Page} page - Puppeteer page instance
 * @param {Object} options - Configuration options
 * @param {string} [options.userAgent] - User agent; defaults to a random one matching `browserProfile`
 * @param {string} [options.browserProfile='safari'] - 'safari', 'firefox', or anything else for Chrome
 * @param {boolean} [options.randomizeFingerprints=true] - Randomize cipher and header order
 * @param {string} [options.timezone='America/New_York'] - Timezone ID to emulate
 * @param {string} [options.locale='en-US'] - Locale used for `Accept-Language`
 * @param {number} [options.hardwareConcurrency] - Forwarded to the in-page protections when given
 * @param {number} [options.deviceMemory] - Forwarded to the in-page protections when given
 * @returns {Promise<Object>} Applied fingerprint settings: userAgent, browserProfile,
 *   timezone, locale, tlsCiphers and headers
 */
export async function configureFingerprinting(page, options = {}) {
    console.log('Setting up advanced fingerprinting protection...');

    const {
        userAgent: requestedUserAgent,
        browserProfile = 'safari',
        randomizeFingerprints = true,
        timezone = 'America/New_York',
        locale = 'en-US',
        hardwareConcurrency,
        deviceMemory,
    } = options;

    // Keep the user agent in line with the profile that drives the TLS and
    // header settings; also covers an explicit `null`.
    const userAgent = requestedUserAgent ?? pickUserAgentForProfile(browserProfile);

    // Track applied configurations
    const appliedConfig = {
        userAgent,
        browserProfile,
        timezone,
        locale
    };

    // Profile names understood by the TLS and in-page helpers.
    const isSafari = browserProfile === 'safari';
    const isFirefox = browserProfile === 'firefox';
    const tlsProfile = isSafari ? 'safari' : isFirefox ? 'firefox' : 'chrome_mac';
    const deviceProfile = isSafari ? 'safari' : isFirefox ? 'firefox' : 'modern_desktop';

    // 1. TLS fingerprinting (affects network requests)
    appliedConfig.tlsCiphers = applyTLSFingerprinting({
        randomize: randomizeFingerprints,
        profile: tlsProfile
    });

    // Hardware hints are forwarded only when the caller supplied them, so the
    // helper's own defaults stay in effect otherwise.
    const hardwareOverrides = {};
    if (hardwareConcurrency !== undefined) {
        hardwareOverrides.hardwareConcurrency = hardwareConcurrency;
    }
    if (deviceMemory !== undefined) {
        hardwareOverrides.deviceMemory = deviceMemory;
    }

    // 2. Apply browser fingerprinting protections
    await applyAdvancedFingerprinting(page, {
        deviceProfile,
        webglNoise: true,
        audioNoise: true,
        fontConsistency: true,
        hideTempStorage: true,
        consistentTimezone: true,
        userAgent,
        timezone,
        ...hardwareOverrides
    });

    // 3. Apply header fingerprinting
    appliedConfig.headers = await applyHeaderFingerprinting(page, {
        userAgent,
        locale,
        browser: browserProfile,
        randomizeOrder: randomizeFingerprints
    });

    console.log(`Fingerprinting protection configured for profile: ${browserProfile}`);

    return appliedConfig;
}
66
/**
 * Creates a fingerprinting strategy based on browser rotation.
 *
 * Each call of the returned function takes the next browser profile in
 * round-robin order, picks a random timezone together with the locale that
 * belongs to it, and applies the result through `configureFingerprinting`.
 *
 * @param {Array} profiles - List of browser profiles to rotate through;
 *   an empty or non-array value falls back to chrome, safari, firefox
 * @returns {Function} Strategy function `(page) => Promise<Object>` that
 *   configures fingerprinting for a page and resolves with the applied settings
 */
export function createFingerprintRotationStrategy(profiles = ['chrome', 'safari', 'firefox']) {
    // Snapshot the list; an empty rotation would yield an undefined profile.
    const rotation = Array.isArray(profiles) && profiles.length > 0
        ? [...profiles]
        : ['chrome', 'safari', 'firefox'];

    let currentProfileIndex = 0;

    // Timezone/locale pairs that naturally go together. Pairing them in one
    // table keeps, for example, Europe/London from being combined with de-DE.
    const regions = [
        ['America/New_York', 'en-US'],
        ['America/Chicago', 'en-US'],
        ['America/Denver', 'en-US'],
        ['America/Los_Angeles', 'en-US'],
        ['Europe/London', 'en-GB'],
        ['Europe/Berlin', 'de-DE'],
        ['Europe/Paris', 'fr-FR'],
        ['Asia/Tokyo', 'ja-JP'],
        ['Australia/Sydney', 'en-AU']
    ];

    return async function applyRotatingFingerprint(page) {
        // Get next profile in rotation
        const profileName = rotation[currentProfileIndex];
        currentProfileIndex = (currentProfileIndex + 1) % rotation.length;

        const [timezone, locale] = regions[Math.floor(Math.random() * regions.length)];

        // Apply the fingerprinting
        return await configureFingerprinting(page, {
            browserProfile: profileName,
            randomizeFingerprints: true,
            timezone,
            locale
        });
    };
}
129
/**
 * Derive a deterministic fingerprint configuration from a session ID.
 *
 * The seed is the sum of the ID's UTF-16 code units, so the same ID always
 * maps to the same browser profile, timezone, locale and hardware values.
 *
 * @param {string} sessionId - Session identifier
 * @returns {Object} Fingerprint settings: `browserProfile`, `timezone`,
 *   `locale` and a `fingerprint` object with screen and hardware values
 */
export function generateConsistentFingerprint(sessionId) {
    // Accumulate the character codes into a single numeric seed.
    let seed = 0;
    for (const unit of sessionId.split('')) {
        seed += unit.charCodeAt(0);
    }

    const browserProfiles = ['safari', 'chrome', 'firefox'];
    const timezones = [
        'America/New_York', 'America/Chicago', 'America/Los_Angeles',
        'Europe/London', 'Europe/Paris', 'Europe/Berlin',
        'Asia/Tokyo', 'Australia/Sydney'
    ];
    const locales = ['en-US', 'en-GB', 'en-CA', 'de-DE', 'fr-FR', 'ja-JP', 'en-AU'];

    // Seed-driven pick; `shift` decorrelates the different choices.
    const pick = (choices, shift = 0) => choices[(seed + shift) % choices.length];

    const fingerprint = {
        screenWidth: 1920 + ((seed % 5) * 16),
        screenHeight: 1080 + ((seed % 3) * 8),
        colorDepth: 24,
        deviceMemory: 8,
        hardwareConcurrency: 4 + (seed % 4) * 2, // 4, 6, 8, or 10 cores
    };

    return {
        browserProfile: pick(browserProfiles),
        timezone: pick(timezones, 100),
        locale: pick(locales, 200),
        fingerprint,
    };
}
173
/**
 * Apply the fingerprint that belongs to a session to a page.
 *
 * The settings come from `generateConsistentFingerprint(sessionId)` and are
 * applied through `configureFingerprinting` with randomization switched off.
 *
 * @param {Page} page - Puppeteer page
 * @param {string} sessionId - Session identifier
 * @returns {Promise<Object>} Applied configuration as reported by `configureFingerprinting`
 */
export async function applySessionFingerprinting(page, sessionId) {
    const { browserProfile, timezone, locale, fingerprint } = generateConsistentFingerprint(sessionId);
    const { screenWidth, screenHeight, colorDepth, hardwareConcurrency, deviceMemory } = fingerprint;

    const sessionOptions = {
        browserProfile,
        timezone,
        locale,
        randomizeFingerprints: false,
        // Additional session-specific values
        screen: { width: screenWidth, height: screenHeight, colorDepth },
        hardwareConcurrency,
        deviceMemory,
    };

    return await configureFingerprinting(page, sessionOptions);
}

src/main.js

1import { Actor } from 'apify';
2import { PuppeteerCrawler, KeyValueStore, Dataset } from 'crawlee';
3import puppeteer from 'puppeteer-extra';
4import StealthPlugin from 'puppeteer-extra-plugin-stealth';
5import randomUseragent from 'random-useragent';
6import Captcha from '2captcha';
7import { fileURLToPath } from 'url';
8import path from 'path';
9import fs from 'fs';
10import { handleRequest } from './routes.js';
11import cheerio from 'cheerio';
12import cloudscraper from './utils/cloudscraper-replacement.js';
13import { HttpsProxyAgent } from 'https-proxy-agent';
14
15// Import our new challenge bypass integrator
16import {
17 createEnhancedSession,
18 enhancedPageNavigation,
19 cleanupSession
20} from './upwork-challenge-integrator.js';
21
22// Import fingerprint enhancer
23import {
24 applySessionFingerprinting,
25 createFingerprintRotationStrategy
26} from './fingerprint-integration.js';
27
28// Additional puppeteer-extra plugins
29import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
30import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
31import UserPreferencesPlugin from 'puppeteer-extra-plugin-user-preferences';
32import UserDataDirPlugin from 'puppeteer-extra-plugin-user-data-dir';
33
// Create a fingerprint rotation strategy for the entire application
const fingerprintRotator = createFingerprintRotationStrategy(['chrome', 'safari', 'firefox']);

// Store fingerprint configurations by session
const sessionFingerprints = {};

// Directory for storing browser profiles: a `browser_profiles` folder next to
// the directory that contains this module.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PROFILES_DIR = path.join(__dirname, '..', 'browser_profiles');

// Ensure the profiles directory exists. With `recursive: true`, mkdirSync does
// not fail when the directory is already present, so a separate existsSync
// check (and the check-then-create race it introduces) is unnecessary.
fs.mkdirSync(PROFILES_DIR, { recursive: true });
48
// Configure stealth plugin with advanced options.
// NOTE(review): confirm that StealthPlugin actually consumes these keys; if it
// only reads `enabledEvasions`, the values below are silently ignored.
const stealth = StealthPlugin({
    webglVendor: 'Google Inc. (Intel)',
    webglRenderer: 'Intel Iris OpenGL Engine',
    navigator: {
        platform: 'MacIntel',
        languages: ['en-US', 'en'],
    },
});

// Register every plugin on the shared puppeteer-extra instance, in order:
//   1. stealth
//   2. recaptcha with default options (registered again with a 2captcha
//      provider later in this file when an API key is supplied)
//   3. adblocker with tracker blocking, to reduce detection via ads and trackers
puppeteer
    .use(stealth)
    .use(RecaptchaPlugin())
    .use(AdblockerPlugin({ blockTrackers: true }));
67
/**
 * Promise adapter around the callback-style `cloudscraper` client.
 *
 * @param {Object} options - Request options handed to `cloudscraper` unchanged
 * @returns {Promise<{response: *, body: *}>} Resolves with the callback's
 *   response and body; rejects with the callback's error when one is reported
 */
const cloudScraperRequest = async (options) => {
    const pending = new Promise((resolve, reject) => {
        const onSettled = (error, response, body) => {
            if (!error) {
                resolve({ response, body });
                return;
            }
            reject(error);
        };
        cloudscraper(options, onSettled);
    });
    return pending;
};
80
/**
 * HTML parser fallback for when browser methods fail.
 *
 * Extracts job listings from raw search-result markup with regular
 * expressions. Every `<h2>` heading that wraps a link starts a job; the markup
 * between that heading and the next one is treated as the job's own section,
 * and description, posted date, budget, job type, experience level, duration,
 * and skills are looked up inside that section only, so values from one job
 * can no longer leak into every other job. Fields that are not found keep the
 * placeholder 'N/A'.
 *
 * NOTE(review): this assumes a job's details follow its title in the markup —
 * confirm against the page layout actually being parsed.
 *
 * @param {string} html - Page markup (non-string values are coerced to text)
 * @returns {Object[]} Parsed jobs; an empty array when nothing matches or
 *   parsing fails
 */
export const parseJobsFromHTML = (html) => {
    console.log('Using direct HTML parsing fallback method');

    try {
        // HTTP clients may hand over a Buffer or nothing at all.
        const source = typeof html === 'string' ? html : String(html ?? '');

        // Job titles and links: an <h2> whose first child element is an anchor.
        const titleRegex = /<h2[^>]*>[^<]*<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/gi;
        const titleMatches = [...source.matchAll(titleRegex)];

        const jobs = titleMatches.map((titleMatch, position) => {
            const [heading, href, rawTitle] = titleMatch;

            // The job's section runs from the end of its heading link to the
            // start of the next job heading (or the end of the document).
            const nextMatch = titleMatches[position + 1];
            const section = source.slice(
                titleMatch.index + heading.length,
                nextMatch ? nextMatch.index : source.length,
            );

            const job = {
                title: rawTitle.trim(),
                jobLink: href.startsWith('http') ? href : `https://www.upwork.com${href}`,
                postedDate: 'N/A',
                jobType: 'N/A',
                experienceLevel: 'N/A',
                duration: 'N/A',
                description: 'N/A',
                budget: 'N/A',
                skills: 'N/A',
                _extractedBy: 'html-fallback',
            };

            // Description: first paragraph after the title, with inner tags removed.
            const descMatch = section.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
            if (descMatch) {
                job.description = descMatch[1].replace(/<[^>]*>/g, '').trim();
            }

            // Posted date: the text between "posted" and "ago"/"on".
            const postedMatch = section.match(/posted\s+([^<>]+?)(?:ago|on)/i);
            if (postedMatch) {
                job.postedDate = postedMatch[1].trim();
            }

            // Budget: either a "$low - $high" range or a single labelled amount.
            const budgetMatch = section.match(
                /\$([0-9,.]+)\s*-\s*\$([0-9,.]+)|Fixed\s*Price:\s*\$([0-9,.]+)|Budget:\s*\$([0-9,.]+)/i,
            );
            if (budgetMatch) {
                const [, rangeLow, rangeHigh, fixedPrice, flatBudget] = budgetMatch;
                if (rangeLow && rangeHigh) {
                    job.budget = `$${rangeLow}-$${rangeHigh}`;
                } else if (fixedPrice) {
                    job.budget = `$${fixedPrice}`;
                } else if (flatBudget) {
                    job.budget = `$${flatBudget}`;
                }
            }

            const jobTypeMatch = section.match(/Hourly|Fixed Price/i);
            if (jobTypeMatch) {
                job.jobType = jobTypeMatch[0].trim();
            }

            const expLevelMatch = section.match(/Entry Level|Intermediate|Expert/i);
            if (expLevelMatch) {
                job.experienceLevel = expLevelMatch[0].trim();
            }

            const durationMatch = section.match(
                /More than 6 months|3 to 6 months|1 to 3 months|Less than 1 month/i,
            );
            if (durationMatch) {
                job.duration = durationMatch[0].trim();
            }

            // Skills: every span tagged data-test="skill" inside this section.
            const skillRegex = /<span[^>]*data-test="skill"[^>]*>([^<]+)<\/span>/gi;
            const skills = [...section.matchAll(skillRegex)].map((match) => match[1].trim());
            if (skills.length > 0) {
                job.skills = skills.join(', ');
            }

            return job;
        });

        console.log(`Extracted ${jobs.length} jobs using HTML fallback parser`);
        return jobs;
    } catch (error) {
        console.error('Error parsing HTML directly:', error);
        return [];
    }
};
181
// Browser fingerprint diversity config.
// Each profile bundles a timezone, locale, geolocation, platform details,
// memory/core counts, and a user-agent string; one profile is assigned to each
// session when sessions are created.
const browserProfiles = [
    // --- US profiles ---
    {
        // New York City
        timezone: 'America/New_York',
        locale: 'en-US',
        geolocation: {
            latitude: 40.7128,
            longitude: -74.0060,
            accuracy: 100,
        },
        platform: 'Win32',
        platformVersion: '10.0',
        deviceMemory: 8,
        hardwareConcurrency: 8,
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    },
    {
        // Los Angeles
        timezone: 'America/Los_Angeles',
        locale: 'en-US',
        geolocation: {
            latitude: 34.0522,
            longitude: -118.2437,
            accuracy: 100,
        },
        platform: 'MacIntel',
        platformVersion: '10.15.7',
        deviceMemory: 16,
        hardwareConcurrency: 12,
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    },
    {
        // Chicago
        timezone: 'America/Chicago',
        locale: 'en-US',
        geolocation: {
            latitude: 41.8781,
            longitude: -87.6298,
            accuracy: 100,
        },
        platform: 'Win32',
        platformVersion: '11.0',
        deviceMemory: 16,
        hardwareConcurrency: 8,
        userAgent: 'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    },
    // --- International profiles (for IP diversity) ---
    {
        // London
        timezone: 'Europe/London',
        locale: 'en-GB',
        geolocation: {
            latitude: 51.5074,
            longitude: -0.1278,
            accuracy: 100,
        },
        platform: 'MacIntel',
        platformVersion: '14.1',
        deviceMemory: 8,
        hardwareConcurrency: 10,
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    },
    {
        // Berlin
        timezone: 'Europe/Berlin',
        locale: 'de-DE',
        geolocation: {
            latitude: 52.5200,
            longitude: 13.4050,
            accuracy: 100,
        },
        platform: 'Win32',
        platformVersion: '10.0',
        deviceMemory: 16,
        hardwareConcurrency: 6,
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    },
];
237
/**
 * Holds proxy bookkeeping state. Declared at module top level (outside any
 * block).
 *
 * The constructor only initializes state: the supplied config, empty
 * collections for proxy stats / active / failed / bad proxies and concurrent
 * requests, the time of the last rotation, a 30-minute rotation interval, and
 * the concurrency and retry limits copied from the config.
 */
class ProxyManager {
    /**
     * @param {Object} config - Settings object; `maxConcurrency` and
     *   `maxRetries` are read from it and the object itself is kept on `config`
     */
    constructor(config) {
        const { maxConcurrency, maxRetries } = config;
        const thirtyMinutesMs = 30 * 60 * 1000;

        Object.assign(this, {
            config,
            proxyStats: new Map(),
            activeProxies: new Set(),
            failedProxies: new Set(),
            badProxies: new Set(), // proxies that fail with 403
            lastRotation: Date.now(),
            rotationInterval: thirtyMinutesMs,
            maxConcurrency,
            maxRetries,
            concurrentRequests: new Map(),
            initialized: false,
        });
    }
    // ... rest of ProxyManager methods ...
}
255
256import { crawlUpworkJobs } from './enhanced-crawler.js';
257
258async function main() {
259 await Actor.init();
260
261 const input = await Actor.getInput() || {};
262 const {
263 searchQuery = 'shopify',
264 page = 1,
265 useApifyProxy = false,
266 maxConcurrency = 1,
267 maxRetries = 3,
268 upworkUsername = '',
269 upworkPassword = '',
270 captchaApiKey = '', // 2captcha API key for solving CAPTCHAs
271 sessionRotationCount = 3, // Number of sessions to rotate through
272 minDelayBetweenRequests = 15, // Minimum delay in seconds between requests
273 maxDelayBetweenRequests = 45, // Maximum delay in seconds between requests
274 sessionCooldownMinutes = 30, // Cooldown period for each session in minutes
275 proxyRotationEnabled = true, // Whether to rotate proxies during session
276 proxyRotationRequests = 3, // Number of requests before rotating proxy
277 simulateBrowserHistory = true, // Whether to simulate browser history
278 randomizeTimezone = true, // Whether to randomize timezone per session
279 useNewHeadless = true, // Whether to use Chrome's new headless mode
280 useRealProfiles = true, // Whether to use real browser profiles
281 useCloudScraper = true, // Whether to use CloudScraper for Cloudflare bypass
282 disableWebSecurity = false, // Whether to disable web security
283 bypassCSP = false, // Whether to bypass Content Security Policy
284 simulateExtensions = true, // Whether to simulate browser extensions
285 useFallbackHtmlParser = true, // Whether to use direct HTML parsing as fallback
286 } = input;
287
288 let shouldUseApifyProxy = useApifyProxy;
289
290 // Configure RecaptchaPlugin if API key is provided
291 if (captchaApiKey) {
292 const recaptchaPlugin = RecaptchaPlugin({
293 provider: {
294 id: '2captcha',
295 token: captchaApiKey,
296 },
297 visualFeedback: true,
298 });
299 puppeteer.use(recaptchaPlugin);
300 }
301
302 // Set up the CAPTCHA solver if API key is provided
303 const solver = captchaApiKey ? new Captcha.Solver(captchaApiKey) : null;
304
305 // Initialize sessions storage
306 const kvStore = await KeyValueStore.open();
307 let sessions = await kvStore.getValue('UPWORK_SESSIONS') || [];
308
309 // After browserProfiles, add common extension simulation data
310 const commonExtensions = [
311 {
312 name: "AdBlock",
313 description: "Block ads and popups",
314 version: "5.13.0",
315 enabled: true
316 },
317 {
318 name: "Honey",
319 description: "Automatic coupon finder",
320 version: "12.8.2",
321 enabled: true
322 },
323 {
324 name: "Grammarly",
325 description: "Grammar checking",
326 version: "14.1033.0",
327 enabled: true
328 },
329 {
330 name: "LastPass",
331 description: "Password manager",
332 version: "4.110.0",
333 enabled: true
334 },
335 {
336 name: "Dark Reader",
337 description: "Dark mode for websites",
338 version: "4.9.67",
339 enabled: false
340 }
341 ];
342
/**
 * Generate a random set of simulated extensions for a profile.
 *
 * Picks 2-5 entries from `commonExtensions` (fewer if the list is shorter)
 * and gives each an 80% chance of being enabled. The source list and its
 * entries are not modified.
 *
 * @returns {Object[]} Copies of the selected extensions with a fresh `enabled` flag
 */
const generateRandomExtensions = () => {
    // Choose 2-5 random extensions from the common list
    const extensionCount = Math.floor(Math.random() * 4) + 2;

    // Fisher-Yates shuffle on a copy. Sorting with a random comparator gives a
    // biased, engine-dependent order because the comparator is inconsistent.
    const pool = [...commonExtensions];
    for (let i = pool.length - 1; i > 0; i -= 1) {
        const j = Math.floor(Math.random() * (i + 1));
        [pool[i], pool[j]] = [pool[j], pool[i]];
    }

    // Randomize whether they're enabled
    return pool.slice(0, extensionCount).map((ext) => ({
        ...ext,
        enabled: Math.random() > 0.2, // 80% chance of being enabled
    }));
};
356
357 // Generate new sessions if needed
358 if (sessions.length < sessionRotationCount) {
359 console.log(`Creating ${sessionRotationCount - sessions.length} new sessions for rotation`);
360 sessions = Array.from({ length: sessionRotationCount }, (_, i) => {
361 // Assign a random browser profile to this session
362 const profileIndex = Math.floor(Math.random() * browserProfiles.length);
363 const profile = browserProfiles[profileIndex];
364
365 // Create a unique user data directory for this session if using real profiles
366 const userDataDir = useRealProfiles ?
367 path.join(PROFILES_DIR, `profile_${Date.now()}_${i}`) : null;
368
369 return {
370 id: `session_${Date.now()}_${i}`,
371 cookies: null,
372 lastUsed: null,
373 usageCount: 0,
374 cooldownUntil: null,
375 requestsSinceProxyRotation: 0,
376 browserProfile: profile,
377 userDataDir: userDataDir,
378 // Simulate browser extensions if enabled
379 extensions: simulateExtensions ? generateRandomExtensions() : [],
380 // Pre-calculated history entries for common sites
381 browserHistory: simulateBrowserHistory ? generateFakeBrowsingHistory() : null,
382 };
383 });
384 await kvStore.setValue('UPWORK_SESSIONS', sessions);
385 }
386
/**
 * Helper function to get the next available session.
 *
 * Preference order:
 *   1. sessions that are not in cooldown, least recently used first (sessions
 *      that were never used come before any used one);
 *   2. if every session is in cooldown, the one whose cooldown ends soonest.
 *
 * The `sessions` array is read but never reordered.
 *
 * @returns {Object|undefined} The chosen session, or undefined when there are
 *   no sessions at all
 */
const getNextAvailableSession = () => {
    const now = new Date();

    const isCoolingDown = (session) => (
        Boolean(session.cooldownUntil) && new Date(session.cooldownUntil) > now
    );
    // Never-used sessions sort before everything else.
    const lastUsedMs = (session) => (
        session.lastUsed ? new Date(session.lastUsed).getTime() : -Infinity
    );

    const ready = sessions.filter((session) => !isCoolingDown(session));
    if (ready.length > 0) {
        // Strict `<` keeps the earliest-listed session on ties, so the choice
        // is deterministic.
        return ready.reduce((best, session) => (
            lastUsedMs(session) < lastUsedMs(best) ? session : best
        ));
    }

    if (sessions.length === 0) {
        return undefined;
    }

    console.log('All sessions are in cooldown. Using the session with earliest cooldown end.');
    return sessions.reduce((best, session) => (
        new Date(session.cooldownUntil) < new Date(best.cooldownUntil) ? session : best
    ));
};
417
/**
 * Helper function to update session after use.
 *
 * Stamps the session's `lastUsed`, bumps its `usageCount`, and after the third
 * use starts a cooldown of `sessionCooldownMinutes` and resets the counter.
 * The whole `sessions` array is then persisted under 'UPWORK_SESSIONS'.
 * Unknown session IDs are ignored and nothing is persisted for them.
 *
 * @param {string} sessionId - ID of the session that was just used
 * @returns {Promise<void>}
 */
const updateSessionAfterUse = async (sessionId) => {
    const position = sessions.findIndex((candidate) => candidate.id === sessionId);
    if (position < 0) {
        return;
    }

    const tracked = sessions[position];
    tracked.lastUsed = new Date().toISOString();
    tracked.usageCount++;

    // If session has been used multiple times, put it in cooldown
    const needsCooldown = tracked.usageCount >= 3;
    if (needsCooldown) {
        const cooldownEnd = new Date();
        cooldownEnd.setMinutes(cooldownEnd.getMinutes() + sessionCooldownMinutes);
        const cooldownIso = cooldownEnd.toISOString();

        tracked.cooldownUntil = cooldownIso;
        tracked.usageCount = 0;
        console.log(`Session ${sessionId} placed in cooldown until ${cooldownIso}`);
    }

    await kvStore.setValue('UPWORK_SESSIONS', sessions);
};
437
/**
 * Generate fake browsing history for a more authentic browser profile.
 *
 * Produces 5-14 entries, each pointing at a randomly chosen popular site with
 * a timestamp from within the last seven days and a title derived from the
 * site's domain name (e.g. "Google - Home").
 *
 * @returns {{url: string, timestamp: string, title: string}[]} History entries
 */
function generateFakeBrowsingHistory() {
    const popularSites = [
        'https://www.google.com',
        'https://www.youtube.com',
        'https://www.facebook.com',
        'https://www.amazon.com',
        'https://www.wikipedia.org',
        'https://www.reddit.com',
        'https://twitter.com',
        'https://www.instagram.com',
        'https://www.linkedin.com',
        'https://www.netflix.com',
        'https://www.twitch.tv',
        'https://www.github.com',
        'https://news.ycombinator.com',
        'https://medium.com',
        'https://www.nytimes.com',
        'https://www.cnn.com',
        'https://www.bbc.com',
        'https://www.espn.com',
    ];

    // Title from the label right before the top-level domain. Taking the second
    // dot-separated label only works for hosts with a subdomain; for hosts like
    // "twitter.com" it yields "com".
    const titleFor = (siteUrl) => {
        const labels = new URL(siteUrl).hostname.split('.');
        const name = labels.at(-2) ?? labels[0];
        return `${name.charAt(0).toUpperCase() + name.slice(1)} - Home`;
    };

    // Generate 5-14 random history entries
    const entryCount = Math.floor(Math.random() * 10) + 5;
    const history = [];

    for (let i = 0; i < entryCount; i++) {
        const site = popularSites[Math.floor(Math.random() * popularSites.length)];

        // Create a timestamp within the last 7 days
        const timestamp = new Date();
        timestamp.setDate(timestamp.getDate() - Math.floor(Math.random() * 7));
        timestamp.setHours(timestamp.getHours() - Math.floor(Math.random() * 24));
        timestamp.setMinutes(timestamp.getMinutes() - Math.floor(Math.random() * 60));

        history.push({
            url: site,
            timestamp: timestamp.toISOString(),
            title: titleFor(site),
        });
    }

    return history;
}
482
/**
 * Function to simulate browser cache and history.
 *
 * Registers a script that runs in every new document of the page. The script
 * makes `history.length` report the number of supplied entries and wraps the
 * Storage getItem/setItem/removeItem methods with an in-memory cache that is
 * pre-seeded with a few common keys. Does nothing when `history` is falsy.
 *
 * @param {Page} page - Puppeteer page
 * @param {Object[]|null} history - Simulated history entries for the session
 * @returns {Promise<void>}
 */
async function simulateBrowserCacheAndHistory(page, history) {
    if (!history) return;

    // The callback is executed inside the page, so it must be self-contained.
    await page.evaluateOnNewDocument((historyEntries) => {
        // Simulate history length
        Object.defineProperty(window.history, 'length', {
            get: function() {
                return historyEntries.length;
            }
        });

        const originalGetItem = Storage.prototype.getItem;
        const originalSetItem = Storage.prototype.setItem;
        const originalRemoveItem = Storage.prototype.removeItem;

        // Prototype-less dictionary: with a plain object, `key in cache` is
        // also true for inherited names such as "constructor" or "toString",
        // and getItem would return a function for them.
        const storageCache = Object.create(null);

        // Add some common storage items
        storageCache['theme'] = 'light';
        storageCache['session_visited'] = 'true';
        storageCache['gdpr_accepted'] = 'true';
        storageCache['last_visit'] = new Date(Date.now() - 86400000).toISOString();

        // Override localStorage methods
        Storage.prototype.getItem = function(key) {
            if (key in storageCache) {
                return storageCache[key];
            }
            return originalGetItem.call(this, key);
        };

        Storage.prototype.setItem = function(key, value) {
            // Write through first so a failing store does not leave a stale
            // cache entry, then cache the value as a string, which is what
            // real storage returns from getItem.
            const outcome = originalSetItem.call(this, key, value);
            storageCache[key] = String(value);
            return outcome;
        };

        Storage.prototype.removeItem = function(key) {
            delete storageCache[key];
            return originalRemoveItem.call(this, key);
        };

    }, history);
}
532
// --- Cloudflare challenge detection helper ---
/**
 * Checks, inside the page, whether the current document looks like a
 * Cloudflare challenge: a title containing "Cloudflare" or "Attention
 * Required", a `div` with a `cf-` class fragment, a `#challenge-form`
 * element, or body text containing "Verifying...".
 *
 * @param {Page} page - Puppeteer page
 * @returns {Promise<boolean>} true when any of the markers is present
 */
async function isCloudflareChallenge(page) {
    return await page.evaluate(() => {
        // `document.body` can be null (e.g. before the body is parsed); treat
        // that as "no body text" instead of throwing inside the page.
        const bodyText = document.body?.innerText ?? '';
        return (
            document.title.includes('Cloudflare') ||
            document.title.includes('Attention Required') ||
            document.querySelector('div[class*="cf-"]') !== null ||
            document.querySelector('#challenge-form') !== null ||
            bodyText.includes('Verifying...')
        );
    });
}
545
// Build the Upwork search URL
547 const baseUrl = `https://www.upwork.com/nx/search/jobs/?q=${encodeURIComponent(searchQuery)}`;
548 const startUrl = page === 1 ? baseUrl : `${baseUrl}&page=${page}`;
549
550 // Only instantiate proxyManager if useApifyProxy is true
551 let proxyCfg = null;
552 let proxyManager = null;
553 let sessionProxyUrl = null;
554 if (shouldUseApifyProxy) {
555 try {
556 proxyCfg = await Actor.createProxyConfiguration({
557 groups: ['RESIDENTIAL'],
558 countryCode: 'US',
559 password: process.env.APIFY_PROXY_PASSWORD,
560 });
561 proxyManager = new ProxyManager({ ...proxyCfg, maxConcurrency, maxRetries });
562 const proxyInfo = await proxyCfg.newUrl();
563 const urlObj = new URL(proxyInfo);
564 sessionProxyUrl = `http://groups-RESIDENTIAL,country-US:${process.env.APIFY_PROXY_PASSWORD}@${urlObj.host}`;
565 console.log(`Using proxy: ${sessionProxyUrl}`);
566 } catch (error) {
567 console.warn('Warning: Failed to initialize proxy configuration. Running without proxy:', error.message);
568 // Continue without proxy instead of throwing error
569 shouldUseApifyProxy = false;
570 }
571 } else {
572 console.log('Running without proxy as useApifyProxy is false');
573 }
574
575 // Move the entire definition of async function setupBrowserSession(...) here, just above its first usage.
576 async function setupBrowserSession(sessionId, proxyUrl) {
577 // Implementation of setupBrowserSession function
578 }
579
580 // Setup this session with the proxy URL (null if not using proxy)
581 const activeSession = getNextAvailableSession();
582 const sessionCookies = await setupBrowserSession(activeSession.id, sessionProxyUrl);
583
584 // Create request with session ID
585 const requestList = [];
586 requestList.push({
587 url: startUrl,
588 userData: {
589 pageCount: 0,
590 sessionId: activeSession.id
591 }
592 });
593
594 // Check if we should use the enhanced crawler
595 if (input.useEnhancedCrawler) {
596 console.log('Using enhanced crawler with advanced challenge bypass');
597
598 // Run enhanced crawler
599 await crawlUpworkJobs({
600 searchQuery,
601 maxPages: page || 1,
602 input,
603 proxyConfiguration: proxyCfg,
604 browserProfiles,
605 });
606
607 // Exit after enhanced crawler completes
608 await Actor.exit();
609 } else {
610 // Original crawler creation and execution
611 const crawler = new PuppeteerCrawler({
612 requestHandler: async ({ request, page, log, session, proxyInfo }) => {
613 try {
614 const response = await page.goto(request.url);
615 // Cloudflare challenge detection
616 if (await isCloudflareChallenge(page)) {
617 console.log('Cloudflare challenge detected! Rotating proxy/session, saving screenshot and HTML...');
618 const screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 50 });
619 await Actor.setValue(`cloudflare_challenge_${Date.now()}.jpg`, screenshotBuffer, { contentType: 'image/jpeg' });
620 const html = await page.content();
621 await Actor.setValue(`cloudflare_challenge_${Date.now()}.html`, html);
622 if (proxyManager && proxyInfo && proxyInfo.url) {
623 proxyManager.markProxyAsBad(proxyInfo.url, 'Cloudflare challenge');
624 }
625 // Optionally, rotate session or proxy here
626 await updateSessionAfterUse(session.id);
627 throw new Error('Cloudflare challenge detected, rotating proxy/session.');
628 }
629 if (response.status() === 403) {
630 console.log('Proxy banned by Upwork, marking as bad...');
631 if (proxyManager) {
632 proxyManager.markProxyAsBad(proxyInfo.url, '403 Upwork');
633 }
634 throw new Error('Proxy banned by Upwork');
635 }
636 // ... rest of the handler code ...
637 } catch (error) {
638 console.error(`Error in request handler for session ${session.id}:`, error);
639 await updateSessionAfterUse(session.id);
640 }
641 },
642 maxConcurrency: maxConcurrency,
643 maxRequestRetries: maxRetries,
644 requestHandlerTimeoutSecs: 300,
645 navigationTimeoutSecs: 180,
646 preNavigationHooks: [
// Hook that adds random delays between requests and configures the browser
648 async ({ page, request, session }) => {
649 // Get session info
650 const { sessionId } = request.userData;
651 const sessionIndex = sessions.findIndex(s => s.id === sessionId);
652
653 // Handle proxy rotation if enabled
654 if (proxyRotationEnabled && proxyCfg && sessionIndex !== -1) {
655 // Check if we need to rotate proxy
656 sessions[sessionIndex].requestsSinceProxyRotation =
657 (sessions[sessionIndex].requestsSinceProxyRotation || 0) + 1;
658
659 if (sessions[sessionIndex].requestsSinceProxyRotation >= proxyRotationRequests) {
660 // Time to rotate proxy
661 const newProxyUrl = await proxyCfg.newUrl();
662 console.log(`Rotating proxy for session ${sessionId} to: ${newProxyUrl}`);
663
664 // Apply the new proxy - this requires recreating the browser session
665 // We'll add the proxy to request.userData so it can be used in the next request
666 request.userData.rotateProxy = true;
667 request.userData.newProxyUrl = newProxyUrl;
668
669 // Reset counter
670 sessions[sessionIndex].requestsSinceProxyRotation = 0;
671 await kvStore.setValue('UPWORK_SESSIONS', sessions);
672 }
673 }
674
675 // First try CloudScraper if enabled to prefetch content and bypass protections
676 if (useCloudScraper) {
677 try {
678 console.log(`Attempting CloudScraper pre-fetch for ${request.url}...`);
679 const csResult = await cloudScraperRequest({
680 method: 'GET',
681 url: request.url,
682 headers: {
683 'User-Agent': sessionIndex !== -1 && sessions[sessionIndex].browserProfile ?
684 sessions[sessionIndex].browserProfile.userAgent :
685 getRandomUserAgent(),
686 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
687 'Accept-Language': sessionIndex !== -1 && sessions[sessionIndex].browserProfile && sessions[sessionIndex].browserProfile.locale ?
688 `${sessions[sessionIndex].browserProfile.locale},en-US;q=0.9,en;q=0.8` :
689 'en-US,en;q=0.9',
690 },
691 resolveWithFullResponse: true,
692 });
693
694 if (csResult.response && csResult.response.statusCode === 200) {
695 console.log('CloudScraper successfully pre-fetched the page');
696 // Could potentially extract content here and parse it directly
697 // if browser keeps getting blocked
698 } else {
699 console.log(`CloudScraper got status ${csResult.response?.statusCode || 'unknown'}`);
700 }
701 } catch (error) {
702 console.error('CloudScraper pre-fetch error:', error.message);
703 }
704 }
705
706 // Use a fresh user agent for each request
707 const profile = sessionIndex !== -1 ? sessions[sessionIndex].browserProfile : null;
708 const userAgent = profile ? profile.userAgent : getRandomUserAgent();
709
710 console.log(`Using user agent: ${userAgent}`);
711 await page.setUserAgent(userAgent);
712
713 // Get session cookies
714 if (sessionIndex !== -1 && sessions[sessionIndex].cookies) {
715 await page.setCookie(...sessions[sessionIndex].cookies);
716 console.log(`Set ${sessions[sessionIndex].cookies.length} cookies for session ${sessionId}`);
717 }
718
719 // Apply timezone from profile if available
720 if (profile && randomizeTimezone) {
721 await page.emulateTimezone(profile.timezone);
722 console.log(`Using timezone: ${profile.timezone}`);
723
724 // Set locale for this request
725 if (profile.locale) {
726 await page.evaluateOnNewDocument((locale) => {
727 Object.defineProperty(navigator, 'language', {
728 get: function() { return locale; }
729 });
730 Object.defineProperty(navigator, 'languages', {
731 get: function() { return [locale, 'en-US']; }
732 });
733 }, profile.locale);
734
735 console.log(`Using locale: ${profile.locale}`);
736 }
737 }
738
739 // Simulate browser history and cache for this session
740 if (simulateBrowserHistory && sessionIndex !== -1 && sessions[sessionIndex].browserHistory) {
741 await simulateBrowserCacheAndHistory(page, sessions[sessionIndex].browserHistory);
742 }
743
744 // Apply advanced evasion techniques
745 await page.evaluateOnNewDocument(() => {
746 // Hide automation fingerprints
747 delete Object.getPrototypeOf(navigator).webdriver;
748
749 // Override platform info
750 Object.defineProperty(navigator, 'platform', {
751 get: () => 'MacIntel',
752 });
753
754 // Override hardware concurrency
755 Object.defineProperty(navigator, 'hardwareConcurrency', {
756 get: () => 8,
757 });
758
759 // Chrome specific properties
760 if (typeof window !== 'undefined') {
761 window.chrome = {
762 runtime: {},
763 loadTimes: function() {},
764 csi: function() {},
765 app: {
766 isInstalled: false,
767 },
768 };
769 }
770
771 // Fix permissions behavior
772 const originalQuery = window.navigator.permissions?.query;
773 if (originalQuery) {
774 window.navigator.permissions.query = (parameters) => {
775 return parameters.name === 'notifications' ?
776 Promise.resolve({ state: Notification.permission }) :
777 originalQuery(parameters);
778 };
779 }
780
781 // Fix WebGL fingerprinting
782 if (window.WebGLRenderingContext) {
783 const getParameter = WebGLRenderingContext.prototype.getParameter;
784 WebGLRenderingContext.prototype.getParameter = function(parameter) {
785 // UNMASKED_VENDOR_WEBGL
786 if (parameter === 37445) {
787 return 'Google Inc. (Intel)';
788 }
789 // UNMASKED_RENDERER_WEBGL
790 if (parameter === 37446) {
791 return 'Intel Iris OpenGL Engine';
792 }
793 return getParameter.call(this, parameter);
794 };
795 }
796 });
797
798 // Generate random ordering for HTTP/2 headers to avoid fingerprinting
799 const headerOrder = [
800 'Accept',
801 'Accept-Language',
802 'Accept-Encoding',
803 'Connection',
804 'Upgrade-Insecure-Requests',
805 'Sec-Fetch-Dest',
806 'Sec-Fetch-Mode',
807 'Sec-Fetch-Site',
808 'Sec-Fetch-User',
809 'Cache-Control',
810 'DNT',
811 'Referer'
812 ].sort(() => Math.random() - 0.5);
813
814 // Apply headers in randomized order
815 const headers = {};
816 for (const header of headerOrder) {
817 switch (header) {
818 case 'Accept':
819 headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';
820 break;
821 case 'Accept-Language':
822 headers['Accept-Language'] = profile && profile.locale ?
823 `${profile.locale},en-US;q=0.9,en;q=0.8` :
824 'en-US,en;q=0.9';
825 break;
826 case 'Accept-Encoding':
827 headers['Accept-Encoding'] = 'gzip, deflate, br';
828 break;
829 case 'Connection':
830 headers['Connection'] = 'keep-alive';
831 break;
832 case 'Upgrade-Insecure-Requests':
833 headers['Upgrade-Insecure-Requests'] = '1';
834 break;
835 case 'Sec-Fetch-Dest':
836 headers['Sec-Fetch-Dest'] = 'document';
837 break;
838 case 'Sec-Fetch-Mode':
839 headers['Sec-Fetch-Mode'] = 'navigate';
840 break;
841 case 'Sec-Fetch-Site':
842 headers['Sec-Fetch-Site'] = 'none';
843 break;
844 case 'Sec-Fetch-User':
845 headers['Sec-Fetch-User'] = '?1';
846 break;
847 case 'Cache-Control':
848 headers['Cache-Control'] = 'max-age=0';
849 break;
850 case 'DNT':
851 headers['DNT'] = '1';
852 break;
853 case 'Referer':
854 // Use a previous browsing history item as referer occasionally
855 if (sessionIndex !== -1 &&
856 sessions[sessionIndex].browserHistory &&
857 Math.random() > 0.5) {
858 const historyItems = sessions[sessionIndex].browserHistory;
859 if (historyItems.length > 0) {
860 const randomItem = historyItems[Math.floor(Math.random() * historyItems.length)];
861 headers['Referer'] = randomItem.url;
862 } else {
863 headers['Referer'] = 'https://www.google.com/search?q=upwork+jobs';
864 }
865 } else {
866 headers['Referer'] = 'https://www.google.com/search?q=upwork+jobs';
867 }
868 break;
869 }
870 }
871
872 // Set the headers with random ordering
873 await page.setExtraHTTPHeaders(headers);
874
875 // Apply much longer delays between requests (15-45 seconds)
876 const delaySeconds = Math.floor(Math.random() * (maxDelayBetweenRequests - minDelayBetweenRequests + 1)) + minDelayBetweenRequests;
877 console.log(`Waiting ${delaySeconds} seconds before the next request...`);
878 await new Promise(resolve => setTimeout(resolve, delaySeconds * 1000));
879 },
880 ],
881 postNavigationHooks: [
882 // Hook to handle CAPTCHA and challenges after navigation
883 async ({ page, request, response, session }) => {
884 const { sessionId } = request.userData;
885
886 // Take screenshot for debugging navigation issues
887 if (!response || response.status() >= 400) {
888 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });
889 await Actor.setValue(`navigation_error_${Date.now()}.jpg`, screenshotBuffer, { contentType: "image/jpeg" });
890 console.log(`Error screenshot saved for status ${response ? response.status() : 'No response'}`);
891
892 // If it's a 403, try to use CloudScraper as a fallback
893 if (response && response.status() === 403 && useCloudScraper) {
894 try {
895 console.log('Got 403, attempting to bypass with CloudScraper...');
896 const cloudScraperResult = await cloudScraperRequest({
897 method: 'GET',
898 url: request.url,
899 resolveWithFullResponse: true,
900 });
901
902 if (cloudScraperResult.response && cloudScraperResult.response.statusCode === 200) {
903 console.log('CloudScraper successfully retrieved content, saving to dataset');
904
905 // Save the HTML content to a file for manual analysis
906 await Actor.setValue(`cloudscraper_content_${Date.now()}.html`, cloudScraperResult.body);
907
908 // Use direct HTML parsing as a fallback if enabled
909 if (useFallbackHtmlParser) {
910 try {
911 console.log('Attempting to parse jobs from Cloudflare-bypassed HTML...');
912 const extractedJobs = parseJobsFromHTML(cloudScraperResult.body);
913
914 if (extractedJobs.length > 0) {
915 console.log(`Successfully extracted ${extractedJobs.length} jobs with HTML parser from Cloudflare-bypassed content`);
916
917 // Save to dataset
918 const dataset = await Actor.openDataset('jobs');
919 await dataset.pushData(extractedJobs);
920
921 console.log('Jobs from Cloudflare-bypassed content saved to dataset');
922
923 // Mark the request as handled
924 request.userData.parsedWithFallback = true;
925 }
926 } catch (parserError) {
927 console.error('Error parsing Cloudflare-bypassed content:', parserError);
928 }
929 }
930 } else {
931 console.log(`CloudScraper also failed with status: ${cloudScraperResult.response?.statusCode || 'unknown'}`);
932 }
933 } catch (error) {
934 console.error('Error using CloudScraper fallback:', error);
935 }
936 }
937 }
938
939 // Check for security challenges
940 const securityChallenges = await detectSecurityChallenges(page);
941
942 if (securityChallenges.hasCaptcha) {
943 console.log('CAPTCHA detected during navigation');
944 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });
945 await Actor.setValue(`captcha_challenge_${Date.now()}.jpg`, screenshotBuffer, { contentType: "image/jpeg" });
946
947 // Try to solve the CAPTCHA with the plugin first if available
948 if (captchaApiKey) {
949 try {
950 console.log('Attempting to solve with puppeteer-extra-plugin-recaptcha...');
951 await page.solveRecaptchas();
952 console.log('Recaptcha solved with plugin');
953
954 // Check if there's a form to submit after solving
955 const submitButton = await page.$('button[type="submit"]');
956 if (submitButton) {
957 await submitButton.click();
958 await page.waitForNavigation({ timeout: 30000 });
959 }
960 } catch (recaptchaError) {
961 console.error('Error with recaptcha plugin:', recaptchaError);
962 // Fall back to our custom implementation
963 const solved = await solveCaptcha(page);
964 if (solved) {
965 console.log('Navigation CAPTCHA solved successfully with fallback method');
966 } else {
967 console.log('Failed to solve navigation CAPTCHA');
968 session.markBad();
969 }
970 }
971 } else {
972 // Use our custom implementation
973 const solved = await solveCaptcha(page);
974 if (solved) {
975 console.log('Navigation CAPTCHA solved successfully');
976 } else {
977 console.log('Failed to solve navigation CAPTCHA');
978 session.markBad();
979 }
980 }
981 }
982
983 if (securityChallenges.hasCloudflare) {
984 console.log('Cloudflare challenge detected during navigation');
985 const screenshotBuffer = await page.screenshot({ type: "jpeg", quality: 50 });
986 await Actor.setValue(`cloudflare_challenge_${Date.now()}.jpg`, screenshotBuffer, { contentType: "image/jpeg" });
987
988 // Try to bypass with CloudScraper if enabled
989 if (useCloudScraper) {
990 try {
991 console.log('Attempting to bypass Cloudflare with CloudScraper...');
992 const cloudScraperResult = await cloudScraperRequest({
993 method: 'GET',
994 url: request.url,
995 resolveWithFullResponse: true,
996 });
997
998 if (cloudScraperResult.response && cloudScraperResult.response.statusCode === 200) {
999 console.log('CloudScraper successfully bypassed Cloudflare, saving content');
1000 await Actor.setValue(`cloudscraper_cf_bypass_${Date.now()}.html`, cloudScraperResult.body);
1001
1002 // Try to parse the HTML content directly
1003 if (useFallbackHtmlParser) {
1004 try {
1005 console.log('Attempting to parse jobs from Cloudflare-bypassed HTML...');
1006 const extractedJobs = parseJobsFromHTML(cloudScraperResult.body);
1007
1008 if (extractedJobs.length > 0) {
1009 console.log(`Successfully extracted ${extractedJobs.length} jobs with HTML parser from Cloudflare-bypassed content`);
1010
1011 // Save to dataset
1012 const dataset = await Actor.openDataset('jobs');
1013 await dataset.pushData(extractedJobs);
1014
1015 console.log('Jobs from Cloudflare-bypassed content saved to dataset');
1016
1017 // Mark the request as handled
1018 request.userData.parsedWithFallback = true;
1019 }
1020 } catch (parserError) {
1021 console.error('Error parsing Cloudflare-bypassed content:', parserError);
1022 }
1023 }
1024 } else {
1025 console.log('CloudScraper also failed to bypass Cloudflare');
1026 }
1027 } catch (error) {
1028 console.error('Error using CloudScraper for Cloudflare bypass:', error);
1029 }
1030 }
1031
1032 // Wait longer for Cloudflare to resolve
1033 console.log('Waiting longer for Cloudflare challenge resolution...');
1034 await new Promise(resolve => setTimeout(resolve, 30000));
1035 }
1036
1037 // After handling any challenges, simulate human behavior again
1038 await emulateHumanBehavior(page);
1039
1040 // Random network pattern: Sometimes load additional resources to appear more human-like
1041 if (Math.random() > 0.7) {
1042 try {
1043 console.log('Simulating additional resource loading for natural network patterns...');
1044
1045 // Choose random URLs to visit briefly
1046 const commonResources = [
1047 'https://www.upwork.com/static/assets/css/main.css',
1048 'https://www.upwork.com/static/fonts/font.woff2',
1049 'https://www.upwork.com/ab/account-security/login',
1050 'https://www.upwork.com/nx/create-profile/',
1051 'https://www.upwork.com/resources/'
1052 ];
1053
1054 // Pick 1-2 random resources
1055 const resourceCount = Math.floor(Math.random() * 2) + 1;
1056 const selectedResources = [...commonResources]
1057 .sort(() => 0.5 - Math.random())
1058 .slice(0, resourceCount);
1059
1060 // Fetch these resources in the background
1061 for (const resourceUrl of selectedResources) {
1062 await page.evaluate((url) => {
1063 // Create and append a hidden iframe to load the resource
1064 const iframe = document.createElement('iframe');
1065 iframe.style.width = '0px';
1066 iframe.style.height = '0px';
1067 iframe.style.position = 'absolute';
1068 iframe.style.top = '-9999px';
1069 iframe.style.left = '-9999px';
1070 iframe.src = url;
1071 document.body.appendChild(iframe);
1072
1073 // Remove it after a short time
1074 setTimeout(() => {
1075 if (iframe && iframe.parentNode) {
1076 iframe.parentNode.removeChild(iframe);
1077 }
1078 }, 5000);
1079 }, resourceUrl);
1080
1081 // Wait a short time between resource requests
1082 await new Promise(resolve => setTimeout(resolve, Math.floor(Math.random() * 2000) + 1000));
1083 }
1084
1085 console.log('Additional resources loaded for natural network pattern');
1086 } catch (error) {
1087 console.error('Error simulating additional resource loading:', error);
1088 }
1089 }
1090 },
1091 ],
1092 proxyConfiguration: proxyCfg,
1093 launchContext: {
1094 launcher: puppeteer,
1095 launchOptions: {
1096 headless: useNewHeadless ? "new" : true,
1097 ignoreHTTPSErrors: true,
1098 args: [
1099 '--disable-gpu',
1100 '--no-sandbox',
1101 '--disable-setuid-sandbox',
1102 '--disable-blink-features=AutomationControlled',
1103 '--disable-accelerated-2d-canvas',
1104 '--disable-infobars',
1105 '--window-size=1920,1080',
1106 '--hide-scrollbars',
1107 '--disable-notifications',
1108 '--disable-extensions',
1109 '--ignore-certificate-errors',
1110 ...(disableWebSecurity ? ['--disable-web-security'] : []),
1111 ...(bypassCSP ? ['--disable-features=IsolateOrigins,site-per-process', '--disable-site-isolation-trials'] : []),
1112 ],
1113 defaultViewport: {
1114 width: 1920,
1115 height: 1080,
1116 },
1117 },
1118 },
1119 });
1120 }
1121
1122 // Lancer le crawler avec l'URL initiale (only if we're using the original crawler)
1123 if (!input.useEnhancedCrawler) {
1124 await crawler.run(requestList);
1125 }
1126
1127 await Actor.exit();
1128}
1129
1130main();
1131
1132export default main;

src/routes.js

1import { Actor, Dataset } from 'apify';
2
// Pool of desktop User-Agent strings (Safari, Chrome and Firefox on macOS and
// Windows). One is picked per request to make the traffic less uniform.
const userAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
];

/**
 * Picks one entry of `userAgents` uniformly at random.
 *
 * @returns {string} A User-Agent header value.
 */
const getRandomUserAgent = () => {
    const pickedIndex = Math.floor(Math.random() * userAgents.length);
    return userAgents[pickedIndex];
};
14
/**
 * Waits for a random whole number of milliseconds, to imitate human pacing.
 *
 * @param {number} minMs - Shortest possible delay (inclusive).
 * @param {number} maxMs - Longest possible delay (inclusive).
 * @returns {Promise<void>} Resolves once the randomly chosen delay has elapsed.
 */
const randomSleep = (minMs, maxMs) => {
    const spanMs = maxMs - minMs + 1;
    const delayMs = Math.floor(Math.random() * spanMs) + minMs;
    return new Promise((wake) => {
        setTimeout(wake, delayMs);
    });
};
18
/**
 * Captures a viewport-only JPEG screenshot of the page and stores it in the
 * actor's default key-value store for debugging.
 *
 * Failures are logged and reported as `null`; this function never throws.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} filename - Prefix of the record key; a timestamp and the
 *     `.jpeg` extension are appended.
 * @returns {Promise<string|null>} API URL of the stored record, or `null`
 *     when the screenshot could not be taken or saved.
 */
const takeScreenshot = async (page, filename) => {
    try {
        const imageBuffer = await page.screenshot({ type: "jpeg", quality: 50, fullPage: false });

        // The timestamp keeps successive screenshots with the same prefix apart.
        const recordKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(recordKey, imageBuffer, { contentType: "image/jpeg" });

        const { defaultKeyValueStoreId } = Actor.getEnv();
        const recordUrl = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${recordKey}`;
        console.log(`📸 Screenshot saved: ${recordUrl}`);
        return recordUrl;
    } catch (error) {
        console.error(`❌ Failed to take screenshot: ${error.message}`);
        return null;
    }
};
33
/**
 * Makes the page look like it is being used by a person: dispatches a few
 * synthetic mouse moves, scrolls down roughly the first third of the document
 * in irregular steps (occasionally scrolling back up a little), then pauses.
 *
 * @param {object} page - Puppeteer page to interact with.
 * @returns {Promise<void>} Resolves after the final 1-3 second pause.
 */
const simulateHumanInteraction = async (page) => {
    // Step 1: five mousemove events at random viewport coordinates.
    await page.evaluate(() => {
        const MOVE_COUNT = 5;
        const targets = [];
        for (let i = 0; i < MOVE_COUNT; i += 1) {
            targets.push({
                x: Math.floor(Math.random() * window.innerWidth),
                y: Math.floor(Math.random() * window.innerHeight),
            });
        }

        for (const { x, y } of targets) {
            const moveEvent = new MouseEvent('mousemove', {
                bubbles: true,
                cancelable: true,
                clientX: x,
                clientY: y,
            });
            document.dispatchEvent(moveEvent);
        }
    });

    // Step 2: scroll down in random-sized steps separated by random pauses.
    await page.evaluate(() => new Promise((done) => {
        const documentHeight = Math.max(
            document.body.scrollHeight,
            document.documentElement.scrollHeight
        );
        // Only the first third of the document is scrolled through.
        const scrollLimit = documentHeight / 3;
        let offset = 0;

        const scrollStep = () => {
            offset += Math.floor(Math.random() * 100) + 50;
            window.scrollTo(0, offset);

            if (offset < scrollLimit) {
                setTimeout(scrollStep, Math.floor(Math.random() * 200) + 100);
                return;
            }

            // Limit reached: sometimes scroll back up a bit before finishing.
            if (Math.random() > 0.7) {
                setTimeout(() => {
                    window.scrollTo(0, offset - Math.floor(Math.random() * 300));
                    done();
                }, Math.floor(Math.random() * 500) + 500);
                return;
            }

            done();
        };

        setTimeout(scrollStep, Math.floor(Math.random() * 500) + 100);
    }));

    // Step 3: idle for a moment, as a reader would.
    await randomSleep(1000, 3000);
};
95
/**
 * Tells whether the loaded page looks like a Cloudflare challenge page.
 *
 * The check runs in the page: it matches the document title against known
 * Cloudflare phrases and looks for Cloudflare-specific DOM markers.
 *
 * @param {object} page - Puppeteer page to inspect.
 * @returns {Promise<boolean>} `true` when any indicator is present.
 */
const hasCloudflareChallenge = async (page) => {
    return page.evaluate(() => {
        const titleHints = ['Cloudflare', 'Attention Required'];
        if (titleHints.some((hint) => document.title.includes(hint))) {
            return true;
        }

        const markerSelectors = ['div[class*="cf-"]', '#challenge-form'];
        return markerSelectors.some((selector) => document.querySelector(selector) !== null);
    });
};
105
/**
 * Tells whether the loaded page shows a security challenge other than
 * Cloudflare's (CAPTCHA, DataDome, generic "are you a robot" screens).
 *
 * The check runs in the page and combines two signals: well-known phrases in
 * the visible text, and CAPTCHA widgets in the DOM.
 *
 * @param {object} page - Puppeteer page to inspect.
 * @returns {Promise<boolean>} `true` when either signal is present.
 */
const hasSecurityChallenge = async (page) => {
    return page.evaluate(() => {
        // `document.body` is null for documents without a body (for example a
        // failed or non-HTML response); treat that as "no visible text"
        // instead of throwing inside the page.
        const pageContent = (document.body?.innerText ?? '').toLowerCase();

        // Phrases that commonly appear on challenge pages (compared lower-cased).
        const securityIndicators = [
            'captcha',
            'security check',
            'bot protection',
            'human verification',
            'datadome',
            'are you a robot',
            'prove you are human',
            'перехресні дороги', // reCAPTCHA phrase
            'traffic light', // reCAPTCHA phrase
            'human challenge'
        ];

        // Elements that CAPTCHA providers inject into the page.
        const challengeSelectors = [
            'iframe[src*="captcha"]',
            'iframe[src*="recaptcha"]',
            'div[class*="captcha"]',
            'div[class*="g-recaptcha"]',
            'div[class*="h-captcha"]'
        ];

        const hasTextIndicator = securityIndicators.some((indicator) =>
            pageContent.includes(indicator));
        const hasElementIndicator = challengeSelectors.some((selector) =>
            document.querySelector(selector) !== null);

        return hasTextIndicator || hasElementIndicator;
    });
};
141
/**
 * In-page script installed through `page.evaluateOnNewDocument`, so that it
 * runs before the scripts of every document the page loads afterwards.
 *
 * It is handed to Puppeteer as a function and executed in the browser, so it
 * must stay self-contained: it may only use browser globals, never anything
 * from this module's scope.
 */
function applyFingerprintEvasions() {
    // Basic webdriver removal
    delete Object.getPrototypeOf(navigator).webdriver;

    // Report a common desktop GPU instead of the real WebGL vendor/renderer.
    const overrideWebGL = () => {
        if (!window.WebGLRenderingContext) {
            return;
        }

        const getParameter = WebGLRenderingContext.prototype.getParameter;
        WebGLRenderingContext.prototype.getParameter = function(parameter) {
            // UNMASKED_VENDOR_WEBGL
            if (parameter === 37445) {
                return 'Google Inc. (Intel)';
            }
            // UNMASKED_RENDERER_WEBGL
            if (parameter === 37446) {
                return 'Intel Iris OpenGL Engine';
            }
            return getParameter.call(this, parameter);
        };
    };

    // Flip the lowest bit of a small share of pixels returned by 2D canvases.
    const overrideCanvas = () => {
        const originalGetContext = HTMLCanvasElement.prototype.getContext;
        HTMLCanvasElement.prototype.getContext = function(type, attributes) {
            const context = originalGetContext.call(this, type, attributes);

            if (context && type === '2d') {
                const originalGetImageData = context.getImageData;
                context.getImageData = function(...args) {
                    const imageData = originalGetImageData.apply(this, args);
                    const pixels = imageData.data;
                    for (let i = 0; i < pixels.length; i += 4) {
                        // Only modify a small percentage of pixels for subtle alteration
                        if (Math.random() < 0.005) {
                            pixels[i] = pixels[i] ^ 1; // Red
                            pixels[i + 1] = pixels[i + 1] ^ 1; // Green
                            pixels[i + 2] = pixels[i + 2] ^ 1; // Blue
                        }
                    }
                    return imageData;
                };
            }

            return context;
        };
    };

    // Add tiny noise to the first samples of the first audio buffer read.
    const overrideAudioContext = () => {
        if (window.AudioContext) {
            const originalGetChannelData = AudioBuffer.prototype.getChannelData;
            AudioBuffer.prototype.getChannelData = function(channel) {
                const data = originalGetChannelData.call(this, channel);
                // The window-level flag limits the alteration to a single call.
                if (window.AUDIO_FINGERPRINT_DEFENDED !== true && data.length > 0) {
                    window.AUDIO_FINGERPRINT_DEFENDED = true;
                    for (let i = 0; i < Math.min(data.length, 8); i++) {
                        data[i] = data[i] + Math.random() * 0.0001;
                    }
                }
                return data;
            };
        }
    };

    // Make font loading failures resolve to an empty list instead of rejecting.
    const overrideFonts = () => {
        if (document.fonts && document.fonts.load) {
            const originalLoad = document.fonts.load;
            document.fonts.load = function(...args) {
                return originalLoad.apply(this, args).catch(() => Promise.resolve([]));
            };
        }
    };

    // Default `Intl.DateTimeFormat` to en-US when no locale is requested.
    const overrideTimeAndLanguage = () => {
        if (window.Intl && window.Intl.DateTimeFormat) {
            const OriginalDateTimeFormat = window.Intl.DateTimeFormat;
            const PatchedDateTimeFormat = function(...args) {
                if (args.length === 0 || !args[0]) {
                    args[0] = 'en-US';
                }
                return new OriginalDateTimeFormat(...args);
            };
            // Keep the replacement interchangeable with the native constructor:
            // without these two lines `instanceof Intl.DateTimeFormat` fails and
            // `Intl.DateTimeFormat.supportedLocalesOf` is undefined, which can
            // break site scripts that rely on them.
            PatchedDateTimeFormat.prototype = OriginalDateTimeFormat.prototype;
            PatchedDateTimeFormat.supportedLocalesOf =
                OriginalDateTimeFormat.supportedLocalesOf.bind(OriginalDateTimeFormat);
            window.Intl.DateTimeFormat = PatchedDateTimeFormat;
        }
    };

    // Apply all evasion techniques
    overrideWebGL();
    overrideCanvas();
    overrideAudioContext();
    overrideFonts();
    overrideTimeAndLanguage();

    // Chrome specific properties
    if (typeof window !== 'undefined') {
        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: {
                isInstalled: false,
            },
        };
    }

    // Answer notification permission queries from `Notification.permission`
    // and delegate everything else to the native implementation.
    const permissions = window.navigator.permissions;
    const originalQuery = permissions?.query;
    if (originalQuery) {
        permissions.query = (parameters) => {
            return parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                // The native method must be invoked on the Permissions object;
                // calling it unbound fails with "Illegal invocation".
                originalQuery.call(permissions, parameters);
        };
    }
}

/**
 * In-page script for `page.evaluate`: collects the job tiles of the current
 * search result page. Self-contained for the same reason as
 * `applyFingerprintEvasions`.
 *
 * @returns {Array<object>} One record per job tile; fields that cannot be
 *     found are set to "N/A".
 */
function extractJobsInPage() {
    // Try multiple selector strategies to find job listings
    const jobSelectors = [
        'article[data-test="JobTile"]',
        'article.job-tile',
        'div[class*="job-tile"]',
        'div[class*="jobTile"]',
        'section.job-list article',
        'div.up-card-section'
    ];

    // Use the first selector that matches anything
    let jobElements = [];
    for (const selector of jobSelectors) {
        jobElements = document.querySelectorAll(selector);
        if (jobElements.length > 0) {
            break;
        }
    }

    return Array.from(jobElements).map((job) => {
        // Text of the first selector that matches a non-empty element
        const getTextFromSelectors = (selectors) => {
            for (const selector of selectors) {
                const element = job.querySelector(selector);
                if (element && element.innerText.trim()) {
                    return element.innerText.trim();
                }
            }
            return "N/A";
        };

        // Absolute URL of the first selector that matches a link
        const getLinkFromSelectors = (selectors) => {
            for (const selector of selectors) {
                const element = job.querySelector(selector);
                if (element && element.getAttribute('href')) {
                    const href = element.getAttribute('href');
                    return href.startsWith('http') ? href : `https://www.upwork.com${href}`;
                }
            }
            return "N/A";
        };

        return {
            title: getTextFromSelectors([
                '[data-test="job-tile-title-link"]',
                'a[href*="/job/"]',
                '.job-title',
                'h2 a',
                'h3 a'
            ]),
            jobLink: getLinkFromSelectors([
                '[data-test="job-tile-title-link"]',
                'a[href*="/job/"]',
                '.job-title a',
                'h2 a',
                'h3 a'
            ]),
            postedDate: getTextFromSelectors([
                '[data-test="job-pubilshed-date"] span:nth-child(2)',
                '.posted-on',
                'span[class*="postedOn"]',
                'span[class*="datePosted"]'
            ]),
            jobType: getTextFromSelectors([
                '[data-test="job-type-label"] strong',
                '.contract-type',
                'span[class*="jobType"]'
            ]),
            experienceLevel: getTextFromSelectors([
                '[data-test="experience-level"] strong',
                '.experience-level',
                'span[class*="experience"]'
            ]),
            duration: getTextFromSelectors([
                '[data-test="duration-label"] strong:nth-child(2)',
                '.duration',
                'span[class*="duration"]'
            ]),
            description: getTextFromSelectors([
                '[data-test="UpCLineClamp JobDescription"] p',
                '.job-description',
                'div[class*="description"]',
                'p.description'
            ]),
            budget: getTextFromSelectors([
                '[data-test="budget"] span',
                '.budget',
                'span[class*="budget"]',
                'span[class*="price"]'
            ]),
            skills: getTextFromSelectors([
                '[data-test="skills"]',
                '.skills',
                'div[class*="skills"]'
            ]),
        };
    });
}

/**
 * Stores a screenshot and the current HTML of the page for later analysis.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} screenshotName - Prefix passed to `takeScreenshot`.
 * @param {string} htmlKey - Key-value store key that receives the page HTML.
 * @returns {Promise<void>}
 */
async function saveDebugSnapshot(page, screenshotName, htmlKey) {
    await takeScreenshot(page, screenshotName);
    const pageContent = await page.content();
    await Actor.setValue(htmlKey, pageContent);
}

/**
 * Main scraping routine: loads `request.url` with a randomized browser
 * identity, bails out on anti-bot pages, and pushes the extracted job
 * listings to the default dataset.
 *
 * Every failure is logged, marks the session as bad (once) and is then
 * re-thrown, so the caller (for example a Crawlee crawler) can apply its
 * retry policy instead of treating a blocked page as a success.
 *
 * @param {object} context
 * @param {object} context.request - Request being processed; `url` is loaded.
 * @param {object} context.page - Puppeteer page used for the navigation.
 * @param {object} context.log - Logger exposing `info`, `warning`, `error`.
 * @param {object} [context.session] - Session to penalise on failure, if any.
 * @returns {Promise<void>}
 * @throws {Error} When navigation fails, a challenge/login page is detected,
 *     or the response status is not successful.
 */
export async function handleRequest({ request, page, log, session }) {
    log.info(`🔍 Scraping: ${request.url}`);

    try {
        // Random User-Agent for this request
        await page.setUserAgent(getRandomUserAgent());

        // Extra HTTP headers resembling a regular browser navigation
        await page.setExtraHTTPHeaders({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
            'Referer': 'https://www.google.com/search?q=upwork+jobs',
        });

        // Install the fingerprinting evasions before the page loads
        await page.evaluateOnNewDocument(applyFingerprintEvasions);

        // Random pause before loading the page (human-like pacing)
        await randomSleep(2000, 5000);

        // Load the page and wait until the network is (almost) idle
        const response = await page.goto(request.url, {
            waitUntil: 'networkidle2',
            timeout: 180000
        });

        // Both checks are taken right after navigation, before any waiting.
        const isCloudflare = await hasCloudflareChallenge(page);
        const isSecurityChallenge = await hasSecurityChallenge(page);

        if (isCloudflare) {
            log.warning("⚠️ Cloudflare challenge detected! Taking screenshot and waiting...");
            await takeScreenshot(page, "cloudflare_challenge");

            // Wait longer for Cloudflare to resolve (sometimes it passes automatically)
            await randomSleep(15000, 30000);

            await takeScreenshot(page, "after_cloudflare_wait");
        }

        if (isSecurityChallenge) {
            log.warning("⚠️ Security challenge (CAPTCHA, etc.) detected! Taking screenshot...");
            await saveDebugSnapshot(page, "security_challenge", 'challenge_page_html');
            throw new Error("Security challenge detected - possible CAPTCHA or verification required");
        }

        // 403 means the request was blocked
        if (response?.status() === 403) {
            log.warning(`🚨 403 Forbidden. Taking screenshot...`);
            await saveDebugSnapshot(page, "403_error", 'forbidden_page_html');
            throw new Error("Blocked with 403");
        }

        // Any other missing or unsuccessful response
        if (!response || !response.ok()) {
            const statusText = response ? response.status() : 'No response';
            log.warning(`🚨 Bad response: ${statusText}. Taking screenshot...`);
            await saveDebugSnapshot(page, "bad_response", 'bad_response_html');
            throw new Error(`Bad response: ${statusText}`);
        }

        // Longer pause once the page has loaded
        await randomSleep(5000, 10000);

        // Check that the expected job tiles exist before extracting
        const hasJobElements = await page.evaluate(() => {
            return document.querySelectorAll('article[data-test="JobTile"]').length > 0;
        });

        if (!hasJobElements) {
            log.warning("No job elements found on the page. Taking screenshot...");
            await saveDebugSnapshot(page, "no_jobs_found", 'page_html');
            log.info("Page HTML saved for debugging");

            // Check whether we landed on a login or CAPTCHA page
            const isLoginPage = await page.evaluate(() => {
                return document.body.innerText.includes('Log In') ||
                    document.body.innerText.includes('Sign In') ||
                    document.body.innerText.includes('CAPTCHA');
            });

            if (isLoginPage) {
                log.warning("Login or CAPTCHA page detected. Authentication may be required.");
                throw new Error("Authentication required - redirected to login page");
            }

            // Try scraping anyway - in case the selector has changed
            log.info("Continuing anyway - selector might have changed");
        }

        // Simulate realistic human browsing behavior
        await simulateHumanInteraction(page);

        const jobs = await page.evaluate(extractJobsInPage);

        log.info(`✅ Found ${jobs.length} job listings.`);

        // One dataset item per job
        await Dataset.pushData(jobs);

    } catch (error) {
        log.error(`❌ Error scraping ${request.url}: ${error.message}`);
        // Single place where the session is penalised; `session` may be absent
        // when the caller does not use a session pool.
        session?.markBad();
        throw error;
    }
}

src/upwork-challenge-bypass.js

1/**
2 * Upwork Challenge Bypass Module
3 * Specialized techniques to bypass Upwork's advanced anti-bot system
4 */
5
6import puppeteer from 'puppeteer-extra';
7import StealthPlugin from 'puppeteer-extra-plugin-stealth';
8import { FingerprintGenerator } from 'fingerprint-generator';
9import { FingerprintInjector } from 'fingerprint-injector';
10import { HttpsProxyAgent } from 'https-proxy-agent';
11import fetch from 'node-fetch';
12import fs from 'fs/promises';
13import path from 'path';
14import { fileURLToPath } from 'url';
15
// Register the stealth plugin on the shared puppeteer-extra instance.
puppeteer.use(StealthPlugin());

// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Generator restricted to desktop fingerprints of recent Chrome, Firefox and
// Safari builds across the three major operating systems and four locales.
const fingerprintGenerator = new FingerprintGenerator({
    browsers: [
        {
            name: 'chrome',
            minVersion: 88,
        },
        {
            name: 'firefox',
            minVersion: 94,
        },
        {
            name: 'safari',
            minVersion: 15,
        },
    ],
    operatingSystems: [
        'windows',
        'macos',
        'linux',
    ],
    devices: [
        'desktop',
    ],
    locales: [
        'en-US',
        'en-GB',
        'de-DE',
        'fr-FR',
    ],
});

// Injector that applies a generated fingerprint to a page.
const fingerprintInjector = new FingerprintInjector();
33
/**
 * Returns the header pattern (ordering plus fixed values) that requests
 * should imitate.
 *
 * This is a simplified placeholder: the capture file is not parsed. Its mere
 * presence selects the richer "working session" profile; when it is missing
 * or is not a regular file, a default profile is returned instead. The file
 * is therefore only stat-ed, not read into memory.
 *
 * @param {string} captureFile Path to a capture from a working session
 * @returns {Promise<{order: string[], specialValues: Object<string, string>}>}
 *     Header names in the order to send them, and header values to force.
 */
async function analyzeWorkingHeaders(captureFile) {
    // Header order shared by both profiles.
    const order = ['host', 'user-agent', 'accept', 'accept-language', 'accept-encoding', 'connection'];
    // Fetch-metadata values shared by both profiles.
    const navigationValues = {
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
    };

    try {
        const stats = await fs.stat(captureFile);
        if (!stats.isFile()) {
            throw new Error(`${captureFile} is not a regular file`);
        }

        return {
            order,
            specialValues: {
                'sec-fetch-site': 'same-origin',
                ...navigationValues,
                'sec-ch-ua-platform': '"Windows"',
                'sec-ch-ua-mobile': '?0',
            }
        };
    } catch (error) {
        // Best effort by design: any problem with the capture file falls back
        // to the default profile.
        console.log('No capture file available, using default headers');
        return {
            order,
            specialValues: {
                'sec-fetch-site': 'none',
                ...navigationValues,
            }
        };
    }
}
68
/**
 * Launches a browser hardened against bot detection and pairs it with a
 * freshly generated fingerprint.
 *
 * @param {Object} [options] Browser configuration options
 * @param {string} [options.proxy] Value for Chromium's `--proxy-server` switch
 * @param {boolean} [options.headless=false] Whether to launch headless
 * @param {string} [options.profileDir] User data directory to reuse
 * @param {string} [options.cookiesPath] JSON file with cookies to preload
 * @param {Object} [options.fingerprintOptions] Extra options merged into the
 *     fingerprint request (desktop devices are requested by default)
 * @returns {Promise<{browser: Object, fingerprint: Object}>} The launched
 *     browser and the generated fingerprint (as returned by the generator)
 */
async function createEvasiveBrowser(options = {}) {
    const {
        proxy,
        headless = false,
        profileDir,
        cookiesPath,
        fingerprintOptions = {},
    } = options;

    // Generate fingerprint
    const fingerprint = fingerprintGenerator.getFingerprint({
        devices: ['desktop'],
        ...fingerprintOptions,
    });

    // The generator returns `{ fingerprint, headers }`, so the User-Agent is
    // nested; reading `fingerprint.userAgent` directly yields undefined and
    // would launch Chrome with the literal "--user-agent=undefined".
    const userAgent = fingerprint?.fingerprint?.navigator?.userAgent
        ?? fingerprint?.headers?.['user-agent']
        ?? fingerprint?.headers?.['User-Agent']
        ?? fingerprint?.userAgent;

    // Enhanced browser launch options
    const launchOptions = {
        headless,
        args: [
            '--disable-blink-features=AutomationControlled',
            // A switch carries a single value, so every feature to disable is
            // listed in one flag rather than in repeated --disable-features.
            '--disable-features=IsolateOrigins,site-per-process,ShutdownEventDrain',
            '--disable-site-isolation-trials',
            '--disable-web-security',
            '--disable-setuid-sandbox',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--disable-background-networking',
            '--disable-default-apps',
            '--disable-extensions',
            '--disable-sync',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-client-side-phishing-detection',
            '--disable-component-extensions-with-background-pages',
            '--disable-domain-reliability',
            '--disable-hang-monitor',
            '--disable-ipc-flooding-protection',
            '--disable-notifications',
            '--disable-popup-blocking',
            '--disable-prompt-on-repost',
            '--disable-renderer-backgrounding',
            '--disable-speech-api',
            '--disable-breakpad',
            '--metrics-recording-only',
            '--mute-audio',
            '--no-default-browser-check',
            '--password-store=basic',
            // Only override the User-Agent when the fingerprint provides one.
            ...(userAgent ? [`--user-agent=${userAgent}`] : []),
            '--window-size=1920,1080',
            '--window-position=0,0',
        ],
        ignoreHTTPSErrors: true,
        defaultViewport: null,
    };

    // Add proxy if provided
    if (proxy) {
        launchOptions.args.push(`--proxy-server=${proxy}`);
    }

    // Add profile directory if provided
    if (profileDir) {
        launchOptions.userDataDir = profileDir;
    }

    // Launch browser
    const browser = await puppeteer.launch(launchOptions);

    // Load cookies if available; a missing or malformed file is not fatal.
    if (cookiesPath) {
        try {
            const cookiesString = await fs.readFile(cookiesPath, 'utf8');
            const cookies = JSON.parse(cookiesString);
            const pages = await browser.pages();
            await pages[0].setCookie(...cookies);
        } catch (error) {
            console.log('No cookies found or invalid format');
        }
    }

    return { browser, fingerprint };
}
161
/**
 * Prepares a page to get past Upwork's challenge page.
 *
 * Attaches the fingerprint, enables request interception so outgoing headers
 * follow the pattern from `analyzeWorkingHeaders`, and registers a `load`
 * listener that tries to click through simple challenge pages. reCAPTCHA is
 * only reported, not solved.
 *
 * @param {Object} page Puppeteer page object
 * @param {Object} fingerprint Browser fingerprint data
 * @returns {Promise<boolean>} `true` once the hooks are installed, `false`
 *     when the setup itself failed
 */
async function bypassChallenge(page, fingerprint) {
    // Time given to the challenge page to render its widgets before probing.
    const CHALLENGE_SETTLE_MS = 3000;
    // How long to wait for the navigation triggered by a click.
    const NAVIGATION_TIMEOUT_MS = 10000;

    // `page.waitForTimeout` no longer exists in current Puppeteer releases,
    // so delays use a plain timer.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Builds the header set to send for one intercepted request.
    const buildHeaders = (request, headerPatterns) => {
        const headers = request.headers();

        // The special values describe a top-level document navigation;
        // forcing them onto images, scripts or XHR calls would produce
        // header combinations no real browser sends.
        const isTopLevelNavigation = request.isNavigationRequest()
            && request.frame() === page.mainFrame();
        if (isTopLevelNavigation) {
            for (const [key, value] of Object.entries(headerPatterns.specialValues)) {
                headers[key] = value;
            }
        }

        // Known headers first, in the captured order...
        const orderedHeaders = {};
        for (const header of headerPatterns.order) {
            if (headers[header]) {
                orderedHeaders[header] = headers[header];
            }
        }
        // ...then everything else in its original order.
        for (const header of Object.keys(headers)) {
            if (!(header in orderedHeaders)) {
                orderedHeaders[header] = headers[header];
            }
        }
        return orderedHeaders;
    };

    // Detects a challenge page and tries the simple click-through paths.
    const solveChallengeIfPresent = async () => {
        const isChallengePage = await page.evaluate(() => {
            return document.title.includes('Challenge') ||
                document.querySelector('form[id*="challenge"]') !== null;
        });
        if (!isChallengePage) {
            return;
        }

        console.log('Challenge page detected, attempting to solve...');

        // Wait to see if any human verification appears
        await sleep(CHALLENGE_SETTLE_MS);

        const hasRecaptcha = await page.evaluate(() => {
            return document.querySelector('iframe[src*="recaptcha"]') !== null;
        });
        if (hasRecaptcha) {
            console.log('reCAPTCHA detected - this requires manual intervention');
            // Would need external CAPTCHA solving service integration here
            return;
        }

        // Try to find and click any "I'm not a robot" checkbox
        const clicked = await page.evaluate(() => {
            const checkbox = document.querySelector('input[type="checkbox"]');
            if (checkbox) {
                checkbox.click();
                return true;
            }
            return false;
        });
        if (clicked) {
            console.log('Clicked verification checkbox');
            // A click that does not navigate is fine; ignore the timeout.
            await page.waitForNavigation({ timeout: NAVIGATION_TIMEOUT_MS }).catch(() => {});
        }

        // Try to find and click any continue/verify buttons
        const buttonClicked = await page.evaluate(() => {
            const buttons = Array.from(document.querySelectorAll('button'));
            const verifyButton = buttons.find(button =>
                button.textContent.includes('Verify') ||
                button.textContent.includes('Continue') ||
                button.textContent.includes('Submit')
            );

            if (verifyButton) {
                verifyButton.click();
                return true;
            }
            return false;
        });
        if (buttonClicked) {
            console.log('Clicked verification button');
            await page.waitForNavigation({ timeout: NAVIGATION_TIMEOUT_MS }).catch(() => {});
        }
    };

    try {
        // Inject complete fingerprint into page
        await fingerprintInjector.attachFingerprintToPuppeteer(page, fingerprint);

        // Analyze headers from working sessions if available
        const headerPatterns = await analyzeWorkingHeaders(
            path.join(__dirname, '../network_analysis/successful_capture.pcap')
        );

        // Intercept and modify requests to match successful patterns
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            // Another listener may already have continued/aborted the request.
            if (request.isInterceptResolutionHandled?.()) {
                return;
            }

            // Event emitters ignore returned promises, so a failed continue
            // must be handled here or it becomes an unhandled rejection.
            Promise.resolve()
                .then(() => request.continue({ headers: buildHeaders(request, headerPatterns) }))
                .catch((error) => {
                    console.error('Error continuing intercepted request:', error.message);
                });
        });

        // Monitor for challenge elements
        page.on('load', async () => {
            try {
                await solveChallengeIfPresent();
            } catch (error) {
                // E.g. the page navigated away while it was being inspected.
                console.error('Error while handling challenge page:', error.message);
            }
        });

        return true;
    } catch (error) {
        console.error('Error in challenge bypass:', error.message);
        return false;
    }
}
275
/**
 * Makes direct request with custom fingerprinting
 *
 * Sends a single GET through `fetch` with browser-like headers. The response
 * is classified as "needs a browser" when it is a 403 or its body contains
 * the Upwork challenge title.
 *
 * @param {string} url Target URL
 * @param {Object} options Request options
 * @param {string} [options.proxy] Proxy URL handed to HttpsProxyAgent
 * @param {string} [options.cookies] Ready-made Cookie header value
 * @param {Object} [options.fingerprint] Fingerprint supplying the user agent;
 *   accepted as `{ userAgent }`, `{ navigator: { userAgent } }` or
 *   `{ fingerprint: { navigator: { userAgent } } }`
 * @returns {Promise<Object>} `{ success, status, data }` on a normal response,
 *   `{ success: false, needsBrowser: true, status }` on a challenge, or
 *   `{ success: false, error }` when the request itself failed
 */
async function makeEvasiveRequest(url, options = {}) {
    const {
        proxy,
        cookies = '',
        fingerprint = fingerprintGenerator.getFingerprint(),
    } = options;

    // The generator's result may nest the user agent; reading only the flat
    // property would send the literal header value "undefined".
    const userAgent = fingerprint?.userAgent
        ?? fingerprint?.navigator?.userAgent
        ?? fingerprint?.fingerprint?.navigator?.userAgent;

    const fetchOptions = {
        headers: {
            ...(userAgent ? { 'User-Agent': userAgent } : {}),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'TE': 'trailers',
            // Only send a Cookie header when there is something to send.
            ...(cookies ? { 'Cookie': cookies } : {}),
        },
    };

    // Add proxy if provided
    if (proxy) {
        fetchOptions.agent = new HttpsProxyAgent(proxy);
    }

    // Randomize TLS fingerprint patterns
    // NOTE(review): NODE_TLS_CIPHER_SUITES does not appear among Node's documented
    // environment variables, so this likely has no effect on the TLS handshake
    // unless another module reads it. Kept for compatibility; confirm and replace
    // with an agent-level `ciphers` option if TLS randomization is really needed.
    process.env.NODE_TLS_CIPHER_SUITES = getRandomCipherSuites();

    try {
        const response = await fetch(url, fetchOptions);

        // If we get a challenge page, we need browser-based approach
        const text = await response.text();
        if (text.includes('Challenge - Upwork') || response.status === 403) {
            return { success: false, needsBrowser: true, status: response.status };
        }

        return {
            success: response.ok,
            status: response.status,
            data: text,
        };
    } catch (error) {
        console.error('Error in evasive request:', error.message);
        return { success: false, error: error.message };
    }
}
334
/**
 * Returns the fixed list of nine TLS cipher suite names in a random order,
 * joined with ':' (Fisher–Yates shuffle driven by Math.random).
 *
 * @returns {string} Colon-separated cipher suite names
 */
function getRandomCipherSuites() {
    const shuffled = [
        'TLS_AES_128_GCM_SHA256',
        'TLS_AES_256_GCM_SHA384',
        'TLS_CHACHA20_POLY1305_SHA256',
        'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256',
        'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256',
        'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
        'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
        'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256',
        'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256',
    ];

    // Walk the unshuffled prefix from the back: pick a slot within it, then
    // move the picked entry to the end of the prefix.
    let remaining = shuffled.length;
    while (remaining > 1) {
        const pick = Math.floor(Math.random() * remaining);
        remaining -= 1;
        const held = shuffled[remaining];
        shuffled[remaining] = shuffled[pick];
        shuffled[pick] = held;
    }

    return shuffled.join(':');
}
359
360export {
361 createEvasiveBrowser,
362 bypassChallenge,
363 makeEvasiveRequest,
364 fingerprintGenerator,
365 fingerprintInjector
366};

src/upwork-challenge-integrator.js

1/**
2 * Upwork Challenge Integrator
3 * Connects the challenge bypass module with the main scraper
4 */
5
6import { KeyValueStore } from 'crawlee';
7import fs from 'fs/promises';
8import path from 'path';
9import { fileURLToPath } from 'url';
10import cheerio from 'cheerio';
11import cloudscraper from './utils/cloudscraper-replacement.js';
12import { Actor } from 'apify';
13
14import {
15 createEvasiveBrowser,
16 bypassChallenge,
17 makeEvasiveRequest,
18 fingerprintGenerator
19} from './upwork-challenge-bypass.js';
20
21import {
22 applyAdvancedFingerprinting,
23 applyHeaderFingerprinting,
24 applyTLSFingerprinting
25} from './fingerprint-enhancement.js';
26
27const __dirname = path.dirname(fileURLToPath(import.meta.url));
28
// Output folders, located one level above this module's directory
const siblingDir = (name) => path.join(__dirname, '..', name);
const SCREENSHOTS_DIR = siblingDir('screenshots');
const PROFILES_DIR = siblingDir('browser_profiles');

/**
 * Creates the screenshots and browser-profiles folders (and any missing
 * parents). Folders that already exist are left as they are.
 * @returns {Promise<void>}
 */
async function ensureDirectoriesExist() {
    for (const directory of [SCREENSHOTS_DIR, PROFILES_DIR]) {
        await fs.mkdir(directory, { recursive: true });
    }
}
37
/**
 * Enhanced browser session setup with challenge bypass
 *
 * First probes Upwork with a plain HTTP request. Only a 2xx answer counts as
 * "direct access works"; the HTTP helper reports blocked or failed requests
 * through `statusCode`/`error` on its result rather than by throwing, so the
 * result is inspected explicitly. Otherwise an evasive Puppeteer browser is
 * launched, fingerprinted and pointed at the Upwork homepage to collect cookies.
 *
 * @param {Object} options Session configuration
 * @param {string} options.sessionId Identifier used for cookie/profile file names
 * @param {string} [options.proxyUrl] Proxy URL for the browser
 * @param {boolean} [options.headless=false] Launch the browser headless
 * @param {Object} [options.input] Actor input (only `timezone` is read here)
 * @param {Object[]} [options.browserProfiles] Candidate profiles; one is picked at random
 * @returns {Promise<Object>} Session descriptor. Always carries `sessionId` and
 *   `needsAdvancedEvasion`; browser sessions add `browser`, `page`, `userAgent`,
 *   `proxy` and `onChallengePage`; failures add `error` plus `connectionFailed`
 *   or `setupFailed`.
 */
async function createEnhancedSession(options = {}) {
    const {
        sessionId,
        proxyUrl,
        headless = false,
        input = {},
        browserProfiles = [],
    } = options;

    await ensureDirectoriesExist();

    // Session storage paths
    const cookiesPath = path.join(PROFILES_DIR, `${sessionId}_cookies.json`);
    const profileDir = path.join(PROFILES_DIR, `profile_${sessionId}`);

    // Check if we have stored cookies
    let hasStoredCookies = false;
    try {
        await fs.access(cookiesPath);
        hasStoredCookies = true;
        console.log(`Found cookies for session ${sessionId}`);
    } catch (error) {
        console.log(`No cookies found for session ${sessionId}, collecting new cookies...`);
    }

    // Select a random browser profile for fingerprinting diversity
    // (undefined when no profiles were supplied).
    const randomProfile = browserProfiles[Math.floor(Math.random() * browserProfiles.length)];

    // Apply timezone from profile or random
    const timezone = randomProfile?.timezone ||
        input.timezone ||
        ['America/New_York', 'America/Los_Angeles', 'Europe/London'][Math.floor(Math.random() * 3)];

    console.log(`Using profile timezone: ${timezone}`);

    // Configure fingerprint options based on the profile
    const profileIsMac = Boolean(randomProfile?.platform?.includes('Mac'));
    const fingerprintOptions = {
        browsers: [
            { name: profileIsMac ? 'safari' : 'chrome', minVersion: 90 },
        ],
        operatingSystems: [profileIsMac ? 'macos' : 'windows'],
        devices: ['desktop'],
        locales: [randomProfile?.locale || 'en-US'],
    };

    // Use the profile's user agent if available
    const userAgent = randomProfile?.userAgent || null;

    // Tracked outside the try so a failure after launch can still close it.
    let browser;

    try {
        // Try direct request first to check if challenge bypass is needed
        console.log(`Attempting to use CloudScraper to bypass Cloudflare protection...`);

        let directAccessWorks = false;
        try {
            const result = await cloudscraper.get('https://www.upwork.com');
            directAccessWorks = result.statusCode >= 200 && result.statusCode < 300 && !result.error;
            if (directAccessWorks) {
                console.log(`CloudScraper successful with status: ${result.statusCode}`);
            } else {
                console.log(`CloudScraper got status ${result.statusCode}`);
            }
        } catch (error) {
            console.log(`CloudScraper request failed: ${error.message}`);
        }

        if (directAccessWorks) {
            // If direct request works, no browser is launched.
            // TODO: the simpler browser setup mentioned by the original code is
            // still missing, so this session can only serve direct requests.
            return {
                sessionId,
                proxy: proxyUrl,
                needsAdvancedEvasion: false,
            };
        }

        console.log(`CloudScraper failed to bypass protection, falling back to puppeteer`);

        // Create advanced evasive browser
        console.log(`Using real browser profile at: ${profileDir}`);
        const evasive = await createEvasiveBrowser({
            proxy: proxyUrl,
            headless,
            profileDir,
            cookiesPath: hasStoredCookies ? cookiesPath : null,
            fingerprintOptions,
            userAgent,
        });
        browser = evasive.browser;
        const { fingerprint } = evasive;

        // Set up the page with advanced evasion
        const page = await browser.newPage();

        // Apply bypass techniques to page
        await bypassChallenge(page, fingerprint);

        // NOTE(review): assumes the fingerprint exposes `device.platform`,
        // `navigator.language` and `userAgent` — confirm against createEvasiveBrowser.
        const fingerprintIsMac = fingerprint.device.platform.includes('Mac');
        const browserFamily = fingerprintIsMac ? 'safari' : 'chrome';

        // Apply our enhanced fingerprinting techniques
        console.log('Applying enhanced fingerprint protection...');
        await applyAdvancedFingerprinting(page, {
            deviceProfile: browserFamily,
            webglNoise: true,
            audioNoise: true,
            fontConsistency: true,
            hideTempStorage: true,
            consistentTimezone: true,
            userAgent: fingerprint.userAgent,
            timezone,
        });

        // Apply TLS fingerprinting
        // NOTE(review): non-Mac fingerprints get the 'chrome_mac' profile here — confirm intended.
        applyTLSFingerprinting({
            randomize: true,
            profile: fingerprintIsMac ? 'safari' : 'chrome_mac',
        });

        // Apply header fingerprinting
        await applyHeaderFingerprinting(page, {
            userAgent: fingerprint.userAgent,
            locale: fingerprint.navigator.language,
            browser: browserFamily,
            randomizeOrder: true,
        });

        // Visit Upwork homepage to get cookies
        console.log('Visiting Upwork homepage...');
        try {
            await page.goto('https://www.upwork.com', {
                waitUntil: 'networkidle2',
                timeout: 60000,
            });

            // Check if we're on a challenge page
            const isChallengePage = await page.evaluate(() => {
                return document.title.includes('Challenge') ||
                    document.querySelector('form[id*="challenge"]') !== null;
            });

            if (isChallengePage) {
                console.log('Still on challenge page, may need manual intervention');
                // Take screenshot for debugging
                await page.screenshot({
                    path: path.join(SCREENSHOTS_DIR, `challenge_${sessionId}.png`),
                });

                // Store session with challenge flag
                return {
                    sessionId,
                    browser,
                    page,
                    userAgent: fingerprint.userAgent,
                    proxy: proxyUrl,
                    needsAdvancedEvasion: true,
                    onChallengePage: true,
                };
            }

            console.log('Successfully bypassed challenge page!');

            // Store cookies for future use
            const cookies = await page.cookies();
            await fs.writeFile(cookiesPath, JSON.stringify(cookies, null, 2));

            // Return session with successful bypass
            return {
                sessionId,
                browser,
                page,
                userAgent: fingerprint.userAgent,
                cookies,
                proxy: proxyUrl,
                needsAdvancedEvasion: true,
                onChallengePage: false,
            };
        } catch (error) {
            console.log(`Error during cookie collection for session ${sessionId}: ${error}`);
            await browser.close();

            // Return session with error flag
            return {
                sessionId,
                error: error.message,
                needsAdvancedEvasion: true,
                connectionFailed: true,
            };
        }
    } catch (error) {
        console.error(`Error creating enhanced session: ${error.message}`);

        // Do not leak a launched browser when a later setup step failed.
        if (browser) {
            try {
                await browser.close();
            } catch (closeError) {
                console.log(`Error closing browser: ${closeError.message}`);
            }
        }

        return {
            sessionId,
            error: error.message,
            needsAdvancedEvasion: true,
            setupFailed: true,
        };
    }
}
227
/**
 * Captures a JPEG screenshot of the page's viewport, stores it through
 * `Actor.setValue` under `<filename>-<timestamp>.jpeg`, and returns an API URL
 * for that record built from the default key-value store id.
 *
 * @param {Object} page Puppeteer page object
 * @param {string} filename Prefix for the record key
 * @returns {Promise<string|null>} Record URL, or null when anything failed
 */
const takeScreenshot = async (page, filename) => {
    const contentType = "image/jpeg";

    try {
        const image = await page.screenshot({ type: "jpeg", quality: 50, fullPage: false });

        const recordKey = `${filename}-${Date.now()}.jpeg`;
        await Actor.setValue(recordKey, image, { contentType });

        const { defaultKeyValueStoreId } = Actor.getEnv();
        const recordUrl = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${recordKey}`;
        console.log(`📸 Screenshot saved: ${recordUrl}`);
        return recordUrl;
    } catch (err) {
        // Screenshots are diagnostics only; report and carry on.
        console.error(`❌ Failed to take screenshot: ${err.message}`);
        return null;
    }
};
248
/**
 * Enhanced page navigation with challenge bypass
 *
 * Order of attempts:
 *   1. plain HTTP pre-fetch (when `attemptDirectRequest` and `input.useCloudScraper`),
 *      accepted only for a 2xx answer — the HTTP helper reports blocked/failed
 *      requests through `statusCode`/`error` on its result instead of throwing;
 *   2. evasive direct request, when the pre-fetch got a 403 and the session
 *      needs advanced evasion;
 *   3. full browser navigation with the session's Puppeteer page.
 *
 * @param {Object} options Navigation options
 * @param {Object} options.session Session from createEnhancedSession
 * @param {string} options.url Page to load
 * @param {Object} [options.input] Actor input (`useCloudScraper` is read here)
 * @param {boolean} [options.attemptDirectRequest=true] Allow steps 1 and 2
 * @param {Function} [options.parseJobsFunction] Receives a cheerio root and returns jobs
 * @returns {Promise<Object>} On success `{ success: true, method, statusCode }` plus
 *   `jobs` (parser given) or `content` (raw HTML); on failure `{ success: false, error }`
 *   plus one of `needsBrowserReset`, `needsSessionRotation`, `needsRetry`.
 */
async function enhancedPageNavigation(options = {}) {
    const {
        session,
        url,
        input = {},
        attemptDirectRequest = true,
        parseJobsFunction,
    } = options;

    if (!session || !url) {
        return { success: false, error: 'Missing session or URL' };
    }

    const { page, browser, userAgent, cookies, proxy, needsAdvancedEvasion } = session;
    const canParseJobs = typeof parseJobsFunction === 'function';

    // page.waitForTimeout no longer exists in Puppeteer 22; use a plain timer.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Shapes a fetched HTML document into the success result.
    const htmlResult = (html, method, statusCode) => (canParseJobs
        ? { success: true, method, jobs: parseJobsFunction(cheerio.load(html)), statusCode }
        : { success: true, method, content: html, statusCode });

    // Debug screenshot taken before a parser-based result is returned.
    const logFallbackScreenshot = async () => {
        console.log('Fallback parser: about to take screenshot before returning result...');
        const fallbackScreenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_fallback');
        if (fallbackScreenshotUrl) {
            console.log(`Fallback parser screenshot: ${fallbackScreenshotUrl}`);
        } else {
            console.log('Fallback parser screenshot failed or not available.');
        }
    };

    const isOnChallengePage = () => page.evaluate(() => {
        return document.title.includes('Challenge') ||
            document.querySelector('form[id*="challenge"]') !== null;
    });

    try {
        // Try direct request methods first if enabled
        if (attemptDirectRequest && input.useCloudScraper) {
            console.log(`Attempting CloudScraper pre-fetch for ${url}...`);

            let directResult;
            let directStatus;
            try {
                directResult = await cloudscraper.get(url);
                directStatus = directResult.statusCode;
            } catch (error) {
                directStatus = error.statusCode ?? error.response?.statusCode ?? error.response?.status;
            }

            const directSucceeded = Boolean(directResult)
                && directStatus >= 200 && directStatus < 300
                && !directResult.error;

            if (directSucceeded) {
                console.log(`CloudScraper successful with status ${directStatus}`);
                // Parse the HTML response directly if successful
                return htmlResult(directResult.body, 'cloudscraper', directStatus);
            }

            console.log(`CloudScraper got status ${directStatus}`);

            // If CloudScraper fails with 403, try evasive direct request
            if (needsAdvancedEvasion && directStatus === 403) {
                console.log(`Using user agent: ${userAgent}`);

                try {
                    const evasiveResult = await makeEvasiveRequest(url, {
                        proxy,
                        cookies: Array.isArray(cookies)
                            ? cookies.map((c) => `${c.name}=${c.value}`).join('; ')
                            : '',
                        // Send the cookies with the user agent they were issued to;
                        // only fall back to a fresh fingerprint without one.
                        fingerprint: userAgent ? { userAgent } : fingerprintGenerator.getFingerprint(),
                    });

                    if (!evasiveResult.needsBrowser && evasiveResult.success) {
                        console.log(`Evasive request successful with status ${evasiveResult.status}`);
                        return htmlResult(evasiveResult.data, 'evasive-request', evasiveResult.status);
                    }
                } catch (directError) {
                    console.log(`Evasive request failed: ${directError.message}`);
                }
            }
        }

        // If direct methods failed or not enabled, use browser
        if (!page || !browser) {
            return {
                success: false,
                error: 'Browser or page not available',
                needsBrowserReset: true,
            };
        }

        // Add random delay to prevent patterns
        const delaySeconds = Math.floor(Math.random() * 30) + 15;
        console.log(`Waiting ${delaySeconds} seconds before the next request...`);
        await sleep(delaySeconds * 1000);

        // Re-apply header randomization before each navigation
        await applyHeaderFingerprinting(page, {
            userAgent,
            browser: session.fingerprintConfig?.browserProfile || 'safari',
            randomizeOrder: true,
        });

        // Visit the page with full browser
        const response = await page.goto(url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        // goto() can resolve with null (e.g. same-document navigation).
        const status = response?.status();

        if (status === 403) {
            // Take a screenshot of the error for debugging
            console.log(`Error screenshot saved for status ${status}`);
            await page.screenshot({
                path: path.join(SCREENSHOTS_DIR, `error_${Date.now()}.png`),
            });

            // Try to bypass the challenge page
            await bypassChallenge(page, { userAgent });

            // Check if we're still on a challenge page
            if (await isOnChallengePage()) {
                console.log('Detected challenge page, about to take screenshot...');
                const screenshotUrl = await takeScreenshot(page, 'cloudflare_challenge_enhanced_nav');
                if (screenshotUrl) {
                    console.log(`Cloudflare challenge screenshot: ${screenshotUrl}`);
                } else {
                    console.log('Screenshot failed or not available.');
                }

                // Try to extract data even from challenge page if possible
                if (canParseJobs) {
                    const jobs = parseJobsFunction(cheerio.load(await page.content()));

                    if (jobs && jobs.length > 0) {
                        console.log(`Found ${jobs.length} jobs even on challenge page`);
                        return {
                            success: true,
                            method: 'browser-challenge-page',
                            jobs,
                            statusCode: 403,
                            isChallengePage: true,
                        };
                    }
                }

                // Simulate human-like behavior for potential bypass
                console.log('Simulating additional resource loading for natural network patterns...');

                try {
                    // Visit some common resources that real browsers would load
                    const resources = [
                        'https://www.google.com/favicon.ico',
                        'https://fonts.googleapis.com/css?family=Roboto',
                        'https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js',
                    ];

                    // Separate context so the main page is not affected.
                    // Puppeteer 22 renamed createIncognitoBrowserContext to
                    // createBrowserContext; support both.
                    const context = typeof browser.createBrowserContext === 'function'
                        ? await browser.createBrowserContext()
                        : await browser.createIncognitoBrowserContext();

                    try {
                        const resourcePage = await context.newPage();

                        for (const resource of resources) {
                            // Best effort: a resource that fails to load is simply skipped.
                            await resourcePage.goto(resource, { waitUntil: 'domcontentloaded' }).catch(() => {});
                            await sleep(1000);
                        }

                        await resourcePage.close();
                    } finally {
                        await context.close();
                    }

                    console.log('Additional resources loaded for natural network pattern');
                } catch (resourceError) {
                    console.log(`Error loading additional resources: ${resourceError.message}`);
                }

                if (canParseJobs) {
                    await logFallbackScreenshot();
                }

                return {
                    success: false,
                    error: `Request blocked - received 403 status code.`,
                    isChallengePage: true,
                    statusCode: 403,
                    needsSessionRotation: true,
                };
            }

            // If we got past the challenge, continue processing
            console.log('Successfully bypassed challenge page after retry!');
        }

        // Process successful page response
        const pageContent = await page.content();

        if (canParseJobs) {
            const jobs = parseJobsFunction(cheerio.load(pageContent));

            // Fallback screenshot and logging
            await logFallbackScreenshot();

            return {
                success: true,
                method: 'browser',
                jobs,
                statusCode: status,
            };
        }

        return {
            success: true,
            method: 'browser',
            content: pageContent,
            statusCode: status,
        };
    } catch (error) {
        console.error(`Error navigating to ${url}: ${error.message}`);
        return {
            success: false,
            error: error.message,
            needsRetry: true,
        };
    }
}
508
/**
 * Closes the browser attached to a session, if there is one.
 * A failure while closing is logged and not rethrown.
 * @param {Object} session Session to clean up
 * @returns {Promise<void>}
 */
async function cleanupSession(session) {
    const browser = session?.browser;
    if (!browser) {
        return;
    }

    try {
        await browser.close();
    } catch (closeError) {
        console.log(`Error closing browser: ${closeError.message}`);
    }
}
522
523export {
524 createEnhancedSession,
525 enhancedPageNavigation,
526 cleanupSession,
527};

src/utils/cloudscraper-replacement.js

1/**
2 * Cloudscraper replacement using axios
3 * Handles requests with similar API to the original cloudscraper
4 */
5
6import axios from 'axios';
7import { HttpsProxyAgent } from 'https-proxy-agent';
8
/**
 * Split cloudscraper-style options into the pieces axios needs.
 * @param {string|Object} options URL or options object
 * @returns {{url: string, data: *, config: Object}} `data` is only used by POST
 */
function normalizeRequestOptions(options) {
    if (typeof options === 'string') {
        return { url: options, data: {}, config: {} };
    }

    // Everything not consumed here (headers, timeout, ...) is passed to axios as-is.
    const { url, form, body, proxy, jar, ...config } = options;

    if (proxy && (typeof proxy === 'string' || proxy instanceof URL)) {
        // A proxy URL is served by the agent; axios' own `proxy` option expects an
        // object (or false) and must be disabled so it does not also try to apply it.
        config.httpsAgent = new HttpsProxyAgent(proxy);
        config.proxy = false;
    } else if (proxy !== undefined) {
        config.proxy = proxy;
    }

    // Handle jar (cookies)
    if (jar === true) {
        config.withCredentials = true;
    }

    return { url, data: form || body || {}, config };
}

/**
 * Turn a Set-Cookie header (array of "name=value; attr..." strings) into
 * cookie records. Only name and value are parsed; domain and path are set to
 * the request host and '/'.
 * @param {string[]|string|undefined} setCookie Raw header value
 * @param {string} url Request URL, used for the cookie domain
 * @returns {Object[]} `{ key, value, domain, path }` records
 */
function parseSetCookieHeader(setCookie, url) {
    const lines = Array.isArray(setCookie) ? setCookie : (setCookie ? [setCookie] : []);
    if (lines.length === 0) {
        return [];
    }

    const domain = new URL(url).hostname;
    const cookies = [];
    for (const line of lines) {
        const [pair] = String(line).split(';', 1);
        const separator = pair.indexOf('=');
        if (separator < 1) {
            continue; // no cookie name: not a usable cookie
        }
        cookies.push({
            key: pair.slice(0, separator).trim(),
            value: pair.slice(separator + 1).trim(),
            domain,
            path: '/',
        });
    }
    return cookies;
}

/** Shape a successful axios response like a cloudscraper response. */
function formatSuccess(response, url) {
    return {
        body: response.data,
        statusCode: response.status,
        request: {
            uri: { href: url },
            jar: {
                getCookies: () => parseSetCookieHeader(response.headers?.['set-cookie'], url),
            },
        },
        headers: response.headers,
    };
}

/** Shape a failed request as a synthetic cloudscraper response (status 500 when there was no response). */
function formatFailure(error, url) {
    const statusCode = error.response?.status || 500;
    return {
        body: error.response?.data || `<html><body><h1>Error ${statusCode}</h1></body></html>`,
        statusCode,
        request: {
            uri: { href: url },
            jar: {
                getCookies: () => [],
            },
        },
        headers: error.response?.headers || {},
        error: error.message,
    };
}

/**
 * Run one request and report it cloudscraper-style.
 * Never rejects because of the request itself: failures come back as a
 * result carrying `statusCode` and `error`.
 * @param {string} label Log prefix
 * @param {string} url Request URL (for logging and the result)
 * @param {Function} send Performs the axios call
 * @param {Function} [callback] Legacy `(err, response, body)` callback
 * @returns {Promise<Object>} Formatted result
 */
async function dispatchRequest(label, url, send, callback) {
    let result;
    let callbackResponse;

    try {
        const response = await send();
        result = formatSuccess(response, url);
        // NOTE(review): on success the callback receives the raw axios response
        // (status/data), on failure the formatted result (statusCode/body). Kept
        // as-is for existing callers.
        callbackResponse = response;
    } catch (error) {
        console.log(`${label} ${url} failed: ${error.message}`);
        result = formatFailure(error, url);
        callbackResponse = result;
    }

    // Invoked outside the try so a throwing callback is not mistaken for a
    // request failure (which would call it a second time).
    if (callback) {
        callback(null, callbackResponse, result.body);
    }

    return result;
}

/**
 * Make a GET request with axios
 *
 * Request failures (network errors, non-2xx statuses) do not reject; they
 * resolve with a result whose `statusCode` and `error` describe the failure.
 *
 * @param {string|Object} options URL or options object
 * @param {Function} callback Optional callback for legacy API compatibility
 * @returns {Promise<Object>} `{ body, statusCode, request, headers, error? }`
 */
export async function get(options, callback) {
    const { url, config } = normalizeRequestOptions(options);
    return dispatchRequest('Request to', url, () => axios.get(url, config), callback);
}

/**
 * Make a request with axios (callable form; behaves like `get`)
 * @param {Object} options Request options
 * @param {Function} callback Optional callback
 * @returns {Promise<Object>} Response data
 */
const cloudscraperReplacement = async function (options, callback) {
    return get(options, callback);
};

// Add method handlers for compatibility
cloudscraperReplacement.get = get;

/**
 * Make a POST request with axios; the payload is `options.form` or `options.body`.
 * Failures, including invalid options, resolve with a synthetic result like `get`.
 * @param {string|Object} options URL or options object
 * @param {Function} callback Optional callback
 * @returns {Promise<Object>} `{ body, statusCode, request, headers, error? }`
 */
cloudscraperReplacement.post = async function (options, callback) {
    const targetUrl = typeof options === 'string' ? options : (options.url || options);
    return dispatchRequest('POST request to', targetUrl, () => {
        const { url, data, config } = normalizeRequestOptions(options);
        return axios.post(url, data, config);
    }, callback);
};

export default cloudscraperReplacement;