1import Apify from "apify"
2import { Session } from "apify/build/session_pool/session.js"
3import { BrowserPool, PlaywrightPlugin } from "browser-pool"
4import playwright from "playwright"
5
// Shorthand bindings for the Apify utilities used throughout this module.
const { log, requestAsBrowser, puppeteer } = Apify.utils
7
// User agent sent with every request issued through `requestAsBrowser` by this class.
const CHROME_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"

/**
 * Obtains session cookies for a site that answers with a JavaScript challenge
 * page (detected as HTTP 503 with "CDATA" in the body).
 *
 * The challenge response is replayed inside a browser page taken from a
 * `BrowserPool`, the POST request the page tries to send is captured, and that
 * request is then re-sent through `requestAsBrowser` using the session's proxy
 * and cookies.
 */
export default class CloudflareUnblocker {
  /**
   * @param {object} options
   * @param {string} options.unblockUrl URL requested when a new session is created;
   *   also the URL the session cookie string is read for.
   * @param {object} options.proxyConfiguration Object exposing `newUrl(sessionId)`,
   *   used to derive the proxy URL of each session.
   */
  constructor (options) {
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Pool of Firefox pages in which challenge responses are replayed.
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50,
      maxOpenPagesPerBrowser: 10,
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    // Switched on by `unblock` after the first plain 403 response; from then on
    // `getRequestOptions` adds a custom cipher list to every request.
    this._shouldUseCustomTLS = false

    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Requests `request.url` with the session's cookies and proxy and, when the
   * response is a challenge page, solves it.
   *
   * @param {object} param
   * @param {object} param.session Session whose cookies and `userData` are read and updated.
   * @param {object} param.request Request-like object; `url` is read and `retryCount` may be reset to 0.
   * @returns {Promise<object>} The plain response, or the response to the solved challenge.
   * @throws {Error} When the session is already being renewed or the challenge could not be solved.
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    // Remember the TLS mode this very request was made with.
    const wasUsingCustomTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      this._markSessionBeingRenewed(session)
      try {
        // `await` is required here: without it the `finally` block would clear
        // the renewal flag while the challenge is still being solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    }

    if (response.statusCode === 403) {
      if (wasUsingCustomTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await Apify.setValue(
          `CAPTCHA-HTML-${Math.random() * 1000}`,
          response.body,
          { contentType: "text/html" },
        )
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Replays the challenge response in a browser page, captures the request the
   * page tries to submit and re-sends it through `requestAsBrowser`.
   *
   * @param {object} options
   * @param {object} options.request Original request; `retryCount` is reset on success.
   * @param {object} options.response Challenge response (`body`, `headers`).
   * @param {object} options.session Session receiving the cookies; retired on failure.
   * @returns {Promise<object>} Response to the re-sent challenge request.
   * @throws {Error} When no challenge request was captured or the re-sent request is not accepted.
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      // Keep the page for later inspection when the challenge set no cookies.
      await Apify.setValue(`NO-COOKIES-HTML-${Math.random() * 1000}`, body, {
        contentType: "text/html",
      })
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // The page never issued the request we were waiting for; fail with a
      // clear message instead of a TypeError on `undefined`.
      this._throwError("Challenge request was not captured")
    }

    const authUrl = cloudflareAuthReq.url()
    // A request without a body yields no post data; treat it as an empty payload.
    const payload = cloudflareAuthReq.postData() || ""
    const browserHeaders = cloudflareAuthReq.headers()

    const finalRequestOpts = {
      ...requestOptions,
      url: authUrl,
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(authUrl),
        // Content-Length counts bytes, not UTF-16 code units.
        "Content-Length": Buffer.byteLength(payload),
      },
      method: "POST",
    }

    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Not accepted: drop the session and keep the response body for inspection.
    session.retire()
    await Apify.setValue(
      `BLOCKED-HTML-${challengeResponse.statusCode}-${Math.random() * 1000}`,
      challengeResponse.body,
      { contentType: "text/html" },
    )
    this._throwError("Blocked")
  }

  /**
   * Flags the session as currently being renewed.
   *
   * @param {object} session
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Clears the "being renewed" flag of the session.
   *
   * @param {object} session
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Builds the proxy URL for the session from the configured proxy configuration.
   *
   * @param {object} session
   * @returns {*} Whatever `proxyConfiguration.newUrl(session.id)` returns.
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is flagged as being renewed.
   *
   * @param {object} session
   * @returns {*} The stored flag; `undefined` when it was never set.
   */
  _isSessionBeingRenewed (session) {
    return session.userData.isBeingRenewed
  }

  /**
   * Throws an `Error` whose message is prefixed with this instance's name.
   *
   * @param {string} message
   * @throws {Error} Always.
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens a pooled page, serves it the already downloaded challenge response
   * and captures the request the page then tries to submit.
   *
   * @param {object} param
   * @param {object} param.response Challenge response to replay (`headers`, `body`).
   * @param {object} param.request Original request; only `url` is read.
   * @param {object} param.session Session used for the auxiliary image request.
   * @returns {Promise<object|undefined>} The captured request, or `undefined`
   *   when none was seen before the wait finished.
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    try {
      // NOTE(review): the page is created by a Playwright plugin, while
      // `addInterceptRequestHandler` and `req.respond` belong to the Puppeteer
      // helpers - confirm they work with this page type.
      await puppeteer.addInterceptRequestHandler(page, async req => {
        const reqUrl = req.url()
        const method = req.method()

        if (request.url === reqUrl && method === "GET") {
          // Answer the navigation with the response we already have.
          this.log.info(`Mocking initial navigation request: ${req.url()}`)
          await req.respond({ status: 200, body, headers })
        } else if (this._detectChallengeRequestFunction(req)) {
          // Keep the request for re-sending and stop the browser from sending it.
          authRequest = req
          await req.abort()
        } else if (reqUrl.includes("transparent.gif")) {
          // Fetched outside the browser, with the session's cookies and proxy.
          const imageResponse = await this._sendImageRequest(req, session)
          await req.respond({
            status: 200,
            body: imageResponse.body,
            headers: imageResponse.headers,
          })
        } else {
          await req.abort()
        }
      })

      page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

      await page.evaluate(url => {
        window.location.href = url
      }, request.url)

      await this._waitUntilChallengeFinishedFunction()
    } finally {
      // Release the page on failures too, otherwise it stays open in the pool.
      await this._releasePage(page)
    }

    return authRequest
  }

  /**
   * Best-effort release of a page obtained from the browser pool; never throws.
   *
   * @param {object} page
   */
  async _releasePage (page) {
    try {
      // NOTE(review): confirm that the BrowserPool in use exposes `recyclePage`;
      // when it does not, the page is closed directly instead.
      if (typeof this.browserPool.recyclePage === "function") {
        await this.browserPool.recyclePage(page)
      } else {
        await page.close()
      }
    } catch (err) {
      this.log.debug(`Could not release the challenge page: ${err.message}`)
    }
  }

  /**
   * Downloads the image requested by the challenge page through
   * `requestAsBrowser`, using the session's cookies and proxy.
   *
   * @param {object} req Intercepted browser request (`url()`, `headers()`).
   * @param {object} session
   * @returns {Promise<object>} Response returned by `requestAsBrowser`.
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent": CHROME_USER_AGENT,
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Headers sent with document requests.
   *
   * @param {string} cookieString Value of the `Cookie` header.
   * @returns {object} Header name to value map.
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent": CHROME_USER_AGENT,
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Decides whether a response is a challenge page.
   *
   * @param {object} response
   * @returns {boolean} True for status 503 with "CDATA" in the body.
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Decides whether an intercepted browser request is the one to capture.
   *
   * @param {object} request Intercepted request exposing `method()`.
   * @returns {boolean} True for POST requests.
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits for the challenge page to finish; currently a fixed 5 second sleep.
   *
   * @returns {Promise<void>}
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Decides whether the re-sent challenge request was accepted.
   *
   * @param {object} challengeResponse
   * @returns {boolean} True for status 200.
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Builds the `requestAsBrowser` options for a session: browser-like headers
   * with the session cookies for `unblockUrl`, the session's proxy URL and,
   * once custom TLS is switched on, a fixed cipher list.
   *
   * @param {object} session
   * @returns {object} `{ headers, proxyUrl }`, plus `https.ciphers` in custom TLS mode.
   */
  getRequestOptions (session) {
    const proxyUrl = this._getProxyUrl(session)

    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl,
    }

    if (this._shouldUseCustomTLS) {
      if (!requestOptions.https) requestOptions.https = {}
      requestOptions.https.ciphers = "AES256-SHA"
    }

    return requestOptions
  }

  /**
   * Creates a session and tries to unblock it right away. A failed attempt is
   * logged and the session is returned anyway.
   *
   * @param {object} sessionPool Pool the new session belongs to.
   * @returns {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      // Deliberately best-effort: report the failure and hand the session over.
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Crawler options that wire this unblocker into a crawler's session pool and
   * request preparation.
   *
   * @returns {object}
   */
  getCrawlerOptions () {
    const unblockerInstance = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      // Kept as a regular method on purpose: `this` is the object the function
      // is invoked on, not the unblocker.
      async prepareRequestFunction ({ request, session }) {
        const newOptions = unblockerInstance.getRequestOptions(session)
        request.headers = newOptions.headers

        // `getRequestOptions` stores the cipher list under `https.ciphers`.
        const ciphers = newOptions.https && newOptions.https.ciphers
        if (ciphers) {
          // NOTE(review): confirm the calling crawler reads `requestOptions.ciphers`.
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}