Český ráj (ceskyraj.com) scraper avatar
Český ráj (ceskyraj.com) scraper

Deprecated

Pricing

Pay per usage

Go to Store
Český ráj (ceskyraj.com) scraper

Český ráj (ceskyraj.com) scraper

Deprecated

Developed by

Pavel Dolecek

Pavel Dolecek

Maintained by Community

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

0.0 (0)

Pricing

Pay per usage

1

Total users

2

Monthly users

1

Last modified

3 years ago

Dockerfile

# Build on the Apify actor base image tagged for Node.js 16.
FROM apify/actor-node:16
# Copy only the package manifest before installing
# (presumably so the dependency layer can be cached independently of source changes).
COPY package.json ./
# Silence the npm progress bar, then install production dependencies only,
# skipping optional packages.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
# Copy the rest of the actor sources into the image.
COPY . ./

INPUT_SCHEMA.json

{
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enumTitles": [
"TEST",
"FULL",
"SINGLE"
],
"enum": [
"TEST",
"FULL",
"SINGLE"
]
},
"country": {
"title": "Country",
"description": "",
"type": "string",
"editor": "select",
"default": "CZ",
"prefill": "CZ",
"enumTitles": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
],
"enum": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
]
},
"debug": {
"title": "Debug",
"description": "Debug mode prints more logs, disables concurrency and other optimizations.",
"type": "boolean",
"editor": "checkbox",
"default": false
}
},
"required": [
"mode",
"country"
]
}

apify.json

{
"name": "cesky-raj-ceskyraj-com-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"template": "project_cheerio_crawler"
}

main.js

/**
 * TODO:
 * - consider proxies
 * - DRY price parsing
 * - different countries to input
 *
 * Beware that the same product can have multiple valid URLs:
 * - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
 * - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
 * - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
 *
 * Variants can affect the price:
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 (38k)
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 (39k)
 *
 * Pagination:
 * Beware: the pager lists only the next 3 pages,
 * -> use the total number of products and the number of products per page to calculate the total number of pages.
 */
20
21import Apify from "apify";
22import cheerio from "cheerio";
23import { gotScraping } from "got-scraping";
24import { withPersistedStats } from "./_utils/stats.js";
25
26const { log } = Apify.utils;
27
// Labels stored in `request.userData.label`; each key maps to its own name.
const LABEL = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};
// Run modes accepted on input; each key maps to its own name.
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
};
// Country codes accepted on input; each key maps to its own name.
// TODO: maybe unify this enum with the per-country definitions.
const Country = {
  CZ: "CZ",
  SK: "SK",
  UK: "UK",
  DE: "DE",
  AT: "AT",
  HU: "HU",
};
// Per-country settings: the domain suffix used to build URLs and the currency
// code attached to every scraped product.
const Countries = {
  CZ: { domain: "cz", currency: "CZK" },
  SK: { domain: "sk", currency: "EUR" },
  UK: { domain: "co.uk", currency: "GBP" },
  DE: { domain: "de", currency: "EUR" },
  AT: { domain: "at", currency: "EUR" },
  HU: { domain: "hu", currency: "HUF" },
};
78
/**
 * Seeds the request queue with the start URL(s) for the selected run mode.
 *
 * - FULL: the category sitemap of the selected country (handled as INDEX).
 * - TEST: three fixed category pages (handled as PRODUCTS).
 * - SINGLE: one fixed category that spans multiple pages (handled as PRODUCTS).
 *
 * @param {string} type - one of the MODE values
 * @param {{ addRequest: Function }} requestQueue - queue receiving the requests
 * @param {{ domain: string }} countryDef - country settings; only used in FULL mode
 * @returns {Promise<void>}
 * @throws {Error} when `type` is not a known mode (previously this silently
 *   enqueued nothing, so the crawler finished without doing any work)
 */
async function enqueueInitialRequest(type, requestQueue, countryDef) {
  const asProductRequests = (urls) =>
    urls.map((url) => ({ userData: { label: LABEL.PRODUCTS }, url }));

  let initialRequests;
  switch (type) {
    case MODE.FULL:
      initialRequests = [
        {
          userData: { label: LABEL.INDEX },
          url: `https://alza.${countryDef.domain}/_sitemap-categories.xml`,
        },
      ];
      break;
    case MODE.TEST:
      initialRequests = asProductRequests([
        "https://www.alza.cz/fotobrasny/18848740.htm",
        "https://www.alza.cz/objektivy-sony-fe/18879399.htm",
        "https://www.alza.cz/baterie-pro-fotoaparaty/18851400.htm",
      ]);
      break;
    case MODE.SINGLE:
      // single category, but containing multiple pages
      initialRequests = asProductRequests([
        "https://www.alza.sk/iphone-mobilne-telefony/18851638.htm",
      ]);
      break;
    default:
      throw new Error(`Unknown mode: ${type}`);
  }

  // Enqueue one by one to keep the original insertion order.
  for (const initialRequest of initialRequests) {
    await requestQueue.addRequest(initialRequest);
  }
}
105
/**
 * Downloads a sitemap XML and enqueues every `<url><loc>` entry as a
 * PRODUCTS request.
 *
 * @param {object} context - crawler context
 * @param {object} context.request - request being processed (`url`, `retryCount`)
 * @param {object|null} [context.session] - session to retire when blocked; may be absent
 * @param {object} globalContext - shared run state; only `requestQueue` is used here
 * @returns {Promise<void>}
 * @throws {Error} "Blocked, ..." when the status code is neither 200 nor 404
 */
async function handleIndex(
  { request, session = null },
  { mode, stats, requestQueue }
) {
  const response = await gotScraping({
    url: request.url,
  });
  if (![200, 404].includes(response.statusCode)) {
    // `session` defaults to null, so it must be guarded; otherwise a
    // TypeError would hide the real "Blocked" error below.
    session?.retire();
    // NOTE(review): presumably so that a blocked response does not use up a
    // retry attempt - confirm against the crawler's retry accounting.
    request.retryCount--;
    throw new Error(
      `Blocked, statusCode ${response.statusCode}, url: ${request.url}`
    );
  }
  const $ = cheerio.load(response.body, { xmlMode: true });
  // `.get()` turns the cheerio collection into a plain array of strings.
  const urls = $("url loc")
    .map((i, x) => $(x).text().trim())
    .get();
  for (const url of urls) {
    if (!url) continue; // skip empty <loc> elements
    await requestQueue.addRequest({
      userData: { label: LABEL.PRODUCTS },
      url,
    });
  }
}
129
/**
 * Scrapes one category listing page.
 *
 * On the first page of a category (no `-pN.htm` suffix in the URL) it also
 * enqueues the remaining pages. Every `.browsingitem` whose `data-id` has not
 * been seen yet is pushed to the default dataset; repeated ids are only counted.
 *
 * @param {object} context - crawler context
 * @param {object} context.request - request being processed (`url`)
 * @param {object|null} [context.session] - unused here
 * @param {object} globalContext - shared run state
 * @param {object} globalContext.stats - counter object with `inc(key)`
 * @param {Set<string>} globalContext.processedIds - ids already pushed during this run
 * @param {object} globalContext.requestQueue - queue receiving the follow-up pages
 * @param {{ domain: string, currency: string }} globalContext.countryDef - country settings
 * @returns {Promise<void>} resolves once all follow-up pages are enqueued and all
 *   new products are stored
 */
async function handleProducts(
  { request, session = null },
  { stats, processedIds, requestQueue, countryDef }
) {
  console.log("handleProducts", request.url);

  const response = await gotScraping({
    url: request.url,
    headers: {
      Host: `www.alza.${countryDef.domain}`, // IMPORTANT to get prices in CZK, not EUR
    },
    // TODO: consider proxies (proxyUrl: await proxyConfiguration.newUrl(session.id))
  });

  const $ = cheerio.load(response.body);

  // No pagination info in the URL -> this is the first page -> enqueue the rest.
  if (!request.url.match(/-p\d+\.htm/)) {
    // The bottom pager only lists the next 3 pages, so it cannot be used to
    // discover all pages. Derive the page count from the total item count.
    const productsPerPage = 24; // TODO: Maybe to constant
    const totalAmountOfProducts = Number.parseInt(
      // Drop whitespace (incl. non-breaking spaces) so that a count rendered
      // with a thousands separator is not cut off at the first group.
      $("#lblNumberItem").text().replace(/\s/g, ""),
      10
    );
    // NaN (counter element missing) yields NaN pages -> the loop does not run.
    const totalPages = Math.ceil(totalAmountOfProducts / productsPerPage);
    for (let i = 2; i <= totalPages; i++) {
      const url = request.url // e.g. `/iphone-mobilne-telefony/18851638.htm`
        .replace(/\.htm$/, "") // `/iphone-mobilne-telefony/18851638`
        .concat(`-p${i}.htm`); // e.g. `/iphone-mobilne-telefony/18851638-p2.htm`
      console.log("enqueue next page", url);
      // Awaited so that a failing enqueue fails this request instead of
      // becoming an unhandled rejection.
      await requestQueue.addRequest({
        userData: { label: LABEL.PRODUCTS },
        url,
      });
    }
  }

  // Reads a price element and keeps only digits, dots and commas.
  const readPrice = (item, selector) =>
    item
      .find(selector)
      .text()
      .replace(/,-$/, "")
      .replace(/[^\d.,]/g, "");

  const products = [];
  $(".browsingitem").each(function () {
    const item = $(this);
    const itemId = item.attr("data-id");
    if (processedIds.has(itemId)) {
      stats.inc("itemsDuplicate");
      return;
    }
    stats.inc("itemsUnique");
    processedIds.add(itemId);

    const img = item.find("a.pc.browsinglink img").data("src");
    const nameLink = item.find("a.name.browsinglink").first();
    const relativeItemUrl = nameLink.attr("href");
    const absoluteItemUrl = `https://alza.${countryDef.domain}${relativeItemUrl}`;
    const itemName = nameLink
      .text()
      .replace(/([\n\r])/g, "") // replace newlines and carriage returns
      .trim();

    const currentPrice = readPrice(item, ".price .c2");
    const originalPrice = readPrice(item, ".price .np2");

    products.push({
      itemId,
      itemName,
      itemUrl: absoluteItemUrl,
      img,
      inStock: "TODO",
      currentPrice,
      originalPrice,
      currency: countryDef.currency,
      category: "FIXME",

      // Derived
      // Always a boolean (an empty original price used to leak through as "").
      discounted: Boolean(originalPrice && originalPrice !== currentPrice),
      // Relies on implicit string -> number coercion of the price strings;
      // the result is NaN when a price still contains a separator.
      _discount: originalPrice
        ? (1 - currentPrice / originalPrice) * 100
        : undefined,
    });
    console.log(`Found product: ${itemId}`);
  });

  // Store all new products in one awaited call so that nothing is lost and
  // storage errors propagate to the crawler.
  if (products.length > 0) {
    await Apify.pushData(products);
  }
}
233
// Proxy group names intended for the proxy configuration.
const proxyGroups = ["HUNGARY", "CZECH_LUMINATI", "GERMANY", "FRANCE", "ITALY", "SPAIN"];
242
// Actor entry point: reads the input, seeds the request queue and runs the crawler.
Apify.main(async () => {
  await Apify.utils.purgeLocalStorage();

  const stats = await withPersistedStats((x) => x, {
    categories: 0,
    items: 0,
    itemsUnique: 0,
    itemsDuplicate: 0,
    denied: 0,
    captchas: 0,
  });
  const processedIds = new Set();

  const input = await Apify.getInput();
  const { debug = false, country = Country.CZ, mode = MODE.TEST } = input ?? {};

  // Fail early with a clear message instead of a TypeError deep inside a handler.
  if (!Object.prototype.hasOwnProperty.call(Countries, country)) {
    throw new Error(`Unsupported country: ${country}`);
  }
  const countryDef = Countries[country];

  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  // TODO: consider proxies
  // const proxyConfiguration = await Apify.createProxyConfiguration({
  //   groups: proxyGroups,
  // })

  const requestQueue = await Apify.openRequestQueue();
  await enqueueInitialRequest(mode, requestQueue, countryDef);

  const globalContext = { mode, stats, requestQueue, processedIds, countryDef };
  const crawler = new Apify.BasicCrawler({
    requestQueue,
    maxConcurrency: 1, // FIXME
    maxRequestRetries: 1,
    async handleRequestFunction(context) {
      const { label } = context.request.userData;
      switch (label) {
        case LABEL.INDEX:
          return await handleIndex(context, globalContext);
        case LABEL.PRODUCTS:
          return await handleProducts(context, globalContext);
        default:
          // An unrecognised label used to be treated as a successful request.
          throw new Error(`Unknown request label: ${label}`);
      }
    },
    async handleFailedRequestFunction({ request }) {
      log.error(`Request ${request.url} failed multiple times`, request);
    },
  });

  try {
    await crawler.run();
  } finally {
    // Persist the final counters and stop the periodic stats logging.
    await stats.save(true);
  }
  log.info("crawler finished");
});

package.json

{
"name": "cesky-raj-ceskyraj-com-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify": "*",
"@thi.ng/atom": "*",
"cheerio": "*",
"got-scraping": "*"
},
"apify": {
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"isPublic": true,
"isDeprecated": false,
"issuesEnabled": true,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "cesky-raj-ceskyraj-com-scraper",
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"itemId",
"itemName",
"itemUrl",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Item ID",
"field": "itemUrl",
"format": "link",
"textField": "itemId"
},
{
"label": "Item Name",
"field": "itemName",
"format": "text"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}

_utils/CloudflareUnblocker.js

1import Apify from "apify"
2import { Session } from "apify/build/session_pool/session.js"
3import { BrowserPool, PlaywrightPlugin } from "browser-pool"
4import playwright from "playwright"
5
6const { utils: { log, requestAsBrowser, puppeteer } } = Apify
7
/**
 * Unblocks sessions that hit a Cloudflare browser challenge: the challenge is
 * replayed in a real browser page, the resulting auth request is intercepted
 * and re-sent over plain HTTP, and the returned cookies are stored on the session.
 *
 * NOTE(review): the browser pool is created with Playwright, while request
 * interception goes through `Apify.utils.puppeteer` - confirm these are compatible.
 */
export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * @param options
   * @param options.unblockUrl - URL requested to unblock a new session; also used to look up session cookies
   * @param options.proxyConfiguration - object whose `newUrl(sessionId)` provides the proxy URL
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   * @param options
   * @param options.session - session whose cookies are updated
   * @param options.request - request to send (`url`; `retryCount` may be reset)
   * @return {Promise<*>} - response of the initial request, or of the solved challenge
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    // Remember the TLS mode that was in effect when this request was built.
    const oldShouldUseTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      // Browser challenge detected starting the bypassing
      try {
        this._markSessionBeingRenewed(session)
        // `await` is required: without it `finally` would release the lock
        // before the challenge is actually solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    } else if (response.statusCode === 403) {
      // Cloudflare captcha detected -> switching to slower TLS
      if (oldShouldUseTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await Apify.setValue(
          `CAPTCHA-HTML-${Math.random() * 1000}`,
          response.body,
          { contentType: "text/html" },
        )
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    // NOTE(review): this is also reached after a 403 captcha response - confirm
    // that treating it as "Session OK" is intended.
    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Solves the challenge by starting the browser and saving the cookies to the session.
   * @param options
   * @param options.request
   * @param options.response
   * @param options.session
   * @param options.requestOptions
   * @return {Promise<*>}
   * @private
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      // Keep the HTML of the cookie-less response for later inspection.
      await Apify.setValue(`NO-COOKIES-HTML-${Math.random() * 1000}`, body, {
        contentType: "text/html",
      })
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // No challenge request was intercepted within the wait period; fail
      // with a clear message instead of a TypeError on `undefined`.
      this._throwError("Challenge request was not intercepted")
    }

    const browserHeaders = cloudflareAuthReq.headers()
    const payload = cloudflareAuthReq.postData()
    const finalRequestOpts = {
      ...requestOptions,
      url: cloudflareAuthReq.url(),
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(cloudflareAuthReq.url()),
        // Content-Length counts bytes, not UTF-16 code units.
        "Content-Length": Buffer.byteLength(payload ?? ""),
      },
      method: "POST",
    }
    // Send the challenge response from requestAsBrowser
    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      // Success
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Something went wrong - challenge was not solved.
    session.retire()
    await Apify.setValue(
      `BLOCKED-HTML-${challengeResponse.statusCode}-${Math.random() * 1000}`,
      challengeResponse.body,
      { contentType: "text/html" },
    )
    this._throwError("Blocked")
  }

  /**
   * Locks session
   * @TODO: we could add this function to the Session natively
   * @param session {Session}
   * @private
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Unlocks session
   * @param session {Session}
   * @private
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Gets proxy URL
   * @param session {Session};
   * @return {String}
   * @private
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is currently locked for renewal.
   * @param session {Session}
   * @return {boolean}
   * @private
   */
  _isSessionBeingRenewed (session) {
    return session.userData.isBeingRenewed
  }

  /**
   * Throws prefixed error
   * @param message {String} - Error message
   * @private
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens new page, where solves the challenge and returns the auth request details.
   * @param response - blocked response whose body and headers are replayed in the page
   * @param request - request whose `url` is opened in the page
   * @param session - session used for auxiliary image requests
   * @return {Promise<Object>} - Auth request; undefined when none was intercepted
   * @private
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    try {
      // Add request interceptor to get the authRequest for unlocking our session and handle all other network requests.
      await puppeteer.addInterceptRequestHandler(page, async req => {
        const reqUrl = req.url()
        const method = req.method()

        if (request.url === reqUrl && method === "GET") {
          this.log.info(`Mocking initial navigation request: ${req.url()}`)
          await req.respond({ status: 200, body, headers })
        } else if (this._detectChallengeRequestFunction(req)) {
          // Keep the challenge request for replaying; it must not reach the network.
          authRequest = req
          await req.abort()
        } else if (reqUrl.includes("transparent.gif")) {
          const imageResponse = await this._sendImageRequest(req, session)
          await req.respond({
            status: 200,
            body: imageResponse.body,
            headers: imageResponse.headers,
          })
        } else {
          await req.abort()
        }
      })

      // Add debug to page log
      page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

      // Navigate to the unblock url.
      await page.evaluate(url => {
        window.location.href = url
      }, request.url)

      await this._waitUntilChallengeFinishedFunction()
    } finally {
      // Always hand the page back, even when solving failed.
      // TODO: Probably just await page.close()
      this.browserPool.recyclePage(page).catch(() => {
        // best effort: a failed recycle must not mask the real outcome
      })
    }
    return authRequest
  }

  /**
   * Sends request with Cloudflare image like headers.
   * @param req {Request} - intercepted browser request
   * @param session {Session} - Session instance
   * @return {Promise<import("stream").Readable | import("http").IncomingMessage>}
   * @private
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Gets the browser headers;
   * @param cookieString {String} - Cookie header string.
   * @return {Object} - Browser like headers.
   * @private
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Detects the Cloudflare challenge page by 503 status code and the CDATA tag;
   * @param response
   * @return {boolean}
   * @private
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Detects Challenge request: any POST request counts as the challenge request.
   * @param request {Request} - intercepted browser request
   * @return {boolean} - true if challenge request detected
   * @private
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits until the challenge is computed (fixed 5 second sleep).
   * @return {Promise<void>}
   * @private
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Checks if challenge is successfully solved.
   * @param challengeResponse
   * @return {boolean}
   * @private
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Gets request options - these options are compatible with `Apify.utils.requestAsBrowser` and `@apify/http-request`
   * @param session {Session} - Session instance
   * @return {Object} - Contains `headers`, `proxyUrl` and, once custom TLS is enabled, `https.ciphers`
   */
  getRequestOptions (session) {
    const proxyUrl = this._getProxyUrl(session)

    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl,
    }

    if (this._shouldUseCustomTLS) {
      requestOptions.https = { ciphers: "AES256-SHA" }
    }

    return requestOptions
  }

  /**
   * Creates new Session instance for the SessionPool.
   * A failed unblock is only logged; the session is returned regardless.
   * @param sessionPool {SessionPool}.
   * @return {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Gets options for the CheerioCrawler use interface.
   * @return {Object} - CheerioCrawler options
   */
  getCrawlerOptions () {
    const that = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      async prepareRequestFunction ({ request, session }) {
        const newOptions = that.getRequestOptions(session)
        request.headers = newOptions.headers
        // The ciphers live under `https` (see getRequestOptions); the previous
        // top-level lookup never matched, so custom TLS was never applied.
        const ciphers = newOptions.https?.ciphers
        if (ciphers) {
          // `this` is the object the hook is invoked on.
          // NOTE(review): confirm the crawler reads `requestOptions` from it.
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}

_utils/common.js

1import { createHash } from 'crypto';
2
// inspired by @drobnikj
/**
 * Builds a protocol-independent unique key for a URL: the SHA-256 hex digest
 * of the URL with its leading `scheme://` removed, so `http://` and `https://`
 * variants of the same address map to the same key.
 *
 * Only the leading scheme is stripped. Splitting on every `://` used to drop
 * everything after a second occurrence (e.g. a URL passed in a query
 * parameter), which made different URLs collide, and it threw for URLs
 * without a scheme.
 *
 * NOTE(review): not exported and not referenced within this file - confirm
 * whether it is still needed.
 *
 * @param {string} url - absolute URL, with or without a scheme
 * @returns {string} 64-character lowercase hex digest
 */
const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ""); // Remove protocol
  return createHash("sha256").update(cleanUrl).digest("hex");
};

_utils/stats.js

1// inspired by hlidac-shopu
2import Apify from "apify";
3import { defAtom } from "@thi.ng/atom";
4
// TODO: make this lowest common denominator
// Counters used when neither persisted stats nor an explicit initial value exist.
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0,
};

// Updater returning its argument plus one.
function inc(x) {
  return x + 1;
}

// Updater returning its argument minus one.
function dec(x) {
  return x - 1;
}
17
// TODO: stats should be in atom and updated via swap function atomically
/**
 * Named counters kept in an atom. The current values are logged every
 * 20 seconds and can be persisted under the "STATS" key.
 */
class Stats {
  /**
   * @param {object} init - initial counter values
   */
  constructor(init) {
    this.stats = defAtom(init);
    // Periodic progress log; stopped by `save(true)`.
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /** Increments the counter `key` by one. */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /** Decrements the counter `key` by one. */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /** Adds `value` to the counter `key`. */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /** @returns {object} current counter values */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counter values as JSON. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Persists the counters under the "STATS" key and logs them.
   * @param final {boolean} - If true, clearInterval apply
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.stats.deref();
    await Apify.setValue("STATS", this.get());
    Apify.utils.log.info("STATS saved!");
    if (stats.ok) {
      // Parenthesised on purpose: `/` binds tighter than `??`, so the
      // unparenthesised form never divided by `stats.ok`.
      const deniedRatio = ((stats.denied ?? 0) / stats.ok) * 100;
      Apify.utils.log.info(`Denied ratio: ${deniedRatio} %`);
    }
    this.log();
  }
}
64
/**
 * Creates a Stats instance seeded from the persisted "STATS" value, falling
 * back to `init` and then to the default counters, and registers saving of the
 * stats on the "persistState" and "migrating" events.
 *
 * @param {function} fn - maps the seed value to the initial stats object
 * @param {*} init - seed used when nothing has been persisted yet
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const persisted = await Apify.getValue("STATS");
  const seed = persisted ?? init ?? defaultStats;
  const state = new Stats(fn(seed));

  const persistState = () => state.save();
  for (const eventName of ["persistState", "migrating"]) {
    Apify.events.on(eventName, persistState);
  }

  return state;
}