Český ráj (ceskyraj.com) scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Český ráj (ceskyraj.com) scraper
strajk-old/cesky-raj-ceskyraj-com-scraper
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Český ráj (ceskyraj.com) scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "mode": {
8 "title": "Mode",
9 "description": "",
10 "type": "string",
11 "editor": "select",
12 "default": "TEST",
13 "prefill": "TEST",
14 "enumTitles": [
15 "TEST",
16 "FULL",
17 "SINGLE"
18 ],
19 "enum": [
20 "TEST",
21 "FULL",
22 "SINGLE"
23 ]
24 },
25 "country": {
26 "title": "Country",
27 "description": "",
28 "type": "string",
29 "editor": "select",
30 "default": "CZ",
31 "prefill": "CZ",
32 "enumTitles": [
33 "CZ",
34 "SK",
35 "UK",
36 "DE",
37 "AT",
38 "HU"
39 ],
40 "enum": [
41 "CZ",
42 "SK",
43 "UK",
44 "DE",
45 "AT",
46 "HU"
47 ]
48 },
49 "debug": {
50 "title": "Debug",
51 "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
52 "type": "boolean",
53 "editor": "checkbox",
54 "default": false
55 }
56 },
57 "required": [
58 "mode",
59 "country"
60 ]
61}
apify.json
1{
2 "name": "cesky-raj-ceskyraj-com-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "template": "project_cheerio_crawler"
7}
main.js
1/**
2 * TODO:
3 * - consider proxies
4 * - DRY price parsing
5 * - different countries to input
6 *
7 * Beware that same product can have multiple valid urls
8 * - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
9 * - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
10 * - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
11 *
12 * Variants can affect price
13 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
14 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
15 *
16 * Pagination:
17 * beware: only first next 3 pages are listed
18 * -> need to use total amount of products & product per page to calculate total amount of pages
19 */
20
21import Apify from "apify";
22import cheerio from "cheerio";
23import { gotScraping } from "got-scraping";
24import { withPersistedStats } from "./_utils/stats.js";
25
26const { log } = Apify.utils;
27
// Request labels deciding which handler processes a request.
const LABEL = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};

// Run modes selectable through the actor input.
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
};

// Country codes selectable through the actor input.
const Country = {
  CZ: "CZ",
  SK: "SK",
  UK: "UK",
  DE: "DE",
  AT: "AT",
  HU: "HU",
};

// Per-country shop settings: top-level domain suffix and price currency.
// TODO: Maybe unify with Country enum
const Countries = {
  CZ: { domain: "cz", currency: "CZK" },
  SK: { domain: "sk", currency: "EUR" },
  UK: { domain: "co.uk", currency: "GBP" },
  DE: { domain: "de", currency: "EUR" },
  AT: { domain: "at", currency: "EUR" },
  HU: { domain: "hu", currency: "HUF" },
};
78
/**
 * Seeds the request queue according to the selected run mode.
 *
 * - FULL: the category sitemap of the selected country (INDEX request).
 * - TEST: three fixed category pages (PRODUCTS requests).
 * - SINGLE: one fixed category that spans multiple pages (PRODUCTS request).
 *
 * Any other mode enqueues nothing.
 *
 * @param {string} type - One of the MODE values.
 * @param {object} requestQueue - Queue exposing an async `addRequest()`.
 * @param {object} countryDef - Country settings; only `domain` is read (FULL mode).
 * @returns {Promise<void>}
 */
async function enqueueInitialRequest(type, requestQueue, countryDef) {
  let seeds;
  switch (type) {
    case MODE.FULL:
      seeds = [
        [LABEL.INDEX, `https://alza.${countryDef.domain}/_sitemap-categories.xml`],
      ];
      break;
    case MODE.TEST:
      seeds = [
        [LABEL.PRODUCTS, "https://www.alza.cz/fotobrasny/18848740.htm"],
        [LABEL.PRODUCTS, "https://www.alza.cz/objektivy-sony-fe/18879399.htm"],
        [LABEL.PRODUCTS, "https://www.alza.cz/baterie-pro-fotoaparaty/18851400.htm"],
      ];
      break;
    case MODE.SINGLE:
      // single category, but containing multiple pages
      seeds = [
        [LABEL.PRODUCTS, "https://www.alza.sk/iphone-mobilne-telefony/18851638.htm"],
      ];
      break;
    default:
      seeds = [];
  }

  // Requests are added one at a time, in the order listed above.
  for (const [label, url] of seeds) {
    await requestQueue.addRequest({ userData: { label }, url });
  }
}
105
106async function handleIndex(
107 { request, session = null },
108 { mode, stats, requestQueue }
109) {
110 const response = await gotScraping({
111 url: request.url,
112 });
113 if (![200, 404].includes(response.statusCode)) {
114 session.retire();
115 request.retryCount--;
116 throw new Error(
117 `Blocked, statusCode ${response.statusCode}, url: ${request.url}`
118 );
119 }
120 const $ = cheerio.load(response.body, { xmlMode: true });
121 const urls = $("url loc").map((i, x) => $(x).text());
122 for (const url of urls) {
123 await requestQueue.addRequest({
124 userData: { label: LABEL.PRODUCTS },
125 url,
126 });
127 }
128}
129
/**
 * Handles a PRODUCTS request: downloads a category listing page, enqueues the
 * remaining pages of the category (only when handling the first page) and
 * stores every product not seen before in the default dataset.
 *
 * @param {object} context - Crawler context.
 * @param {object} context.request - Request being handled; only `url` is read.
 * @param {object} globalContext - Shared run state.
 * @param {object} globalContext.stats - Counter store exposing `inc(key)`.
 * @param {Set<string>} globalContext.processedIds - Ids of products already stored;
 *   newly found ids are added to it.
 * @param {object} globalContext.requestQueue - Queue exposing an async `addRequest()`.
 * @param {object} globalContext.countryDef - Country settings (`domain`, `currency`).
 * @returns {Promise<void>} Resolves once all pages are enqueued and products stored.
 */
async function handleProducts(
  { request },
  { stats, processedIds, requestQueue, countryDef }
) {
  console.log("handleProducts", request.url);

  const response = await gotScraping({
    url: request.url,
    headers: {
      Host: `www.alza.${countryDef.domain}`, // IMPORTANT to get prices in CZK, not EUR
    },
  });

  const $ = cheerio.load(response.body);

  // No pagination info in URL -> we are on the first page -> enqueue next pages.
  if (!request.url.match(/-p\d+\.htm/)) {
    // The pager lists only the next 3 pages, so the page count is derived from
    // the total number of products and the number of products per page instead.
    const totalAmountOfProducts = Number.parseInt(
      $("#lblNumberItem").text(),
      10
    );
    const productsPerPage = 24; // TODO: Maybe to constant
    // NaN (counter missing or unparsable) makes the loop below a no-op.
    const totalPages = Math.ceil(totalAmountOfProducts / productsPerPage);
    for (let i = 2; i <= totalPages; i++) {
      const url = request.url // e.g. `/iphone-mobilne-telefony/18851638.htm`
        .replace(/\.htm$/, "") // `/iphone-mobilne-telefony/18851638`
        .concat(`-p${i}.htm`); // e.g. `/iphone-mobilne-telefony/18851638-p2.htm`
      console.log("enqueue next page", url);
      // Awaited so a failing enqueue fails this request instead of becoming an
      // unhandled rejection.
      await requestQueue.addRequest({
        userData: { label: LABEL.PRODUCTS },
        url,
      });
    }
  }

  // Products are collected synchronously and stored in one awaited call below,
  // so the handler does not finish before the data is written.
  const products = [];
  $(".browsingitem").each(function () {
    const item = $(this);
    const itemId = item.attr("data-id");
    if (processedIds.has(itemId)) {
      stats.inc("itemsDuplicate");
      return;
    }
    stats.inc("itemsUnique");
    processedIds.add(itemId);

    const img = item.find("a.pc.browsinglink img").data("src");
    const nameLink = item.find("a.name.browsinglink").first();
    const relativeItemUrl = nameLink.attr("href");
    const absoluteItemUrl = `https://alza.${countryDef.domain}${relativeItemUrl}`;
    const itemName = nameLink
      .text()
      .replace(/([\n\r])/g, "") // replace newlines and carriage returns
      .trim();

    // Prices stay strings: the trailing ",-" and everything except digits,
    // dots and commas is stripped.
    const currentPrice = item
      .find(".price .c2")
      .text()
      .replace(/,-$/, "")
      .replace(/[^\d.,]/g, "");

    const originalPrice = item
      .find(".price .np2")
      .text()
      .replace(/,-$/, "")
      .replace(/[^\d.,]/g, "");

    products.push({
      itemId,
      itemName,
      itemUrl: absoluteItemUrl,
      img,
      inStock: "TODO",
      currentPrice,
      originalPrice,
      currency: countryDef.currency,
      category: "FIXME",

      // Derived
      // Always a boolean (an empty original price used to leak through as "").
      discounted: Boolean(originalPrice) && originalPrice !== currentPrice,
      // NOTE(review): yields NaN when a price string contains "," or more than
      // one "." — confirm the price formats of all supported countries.
      _discount: originalPrice
        ? (1 - Number(currentPrice) / Number(originalPrice)) * 100
        : undefined,
    });
    console.log(`Found product: ${itemId}`);
  });

  if (products.length > 0) {
    await Apify.pushData(products);
  }
}
233
// Proxy groups intended for this scraper. Currently unused: the proxy
// configuration inside the main function is disabled.
const proxyGroups = [
  "HUNGARY",
  "CZECH_LUMINATI",
  "GERMANY",
  "FRANCE",
  "ITALY",
  "SPAIN",
];

// Actor entry point: reads the input, seeds the request queue for the selected
// mode and runs a crawler that dispatches each request to its handler by label.
Apify.main(async () => {
  await Apify.utils.purgeLocalStorage();

  const stats = await withPersistedStats((x) => x, {
    categories: 0,
    items: 0,
    itemsUnique: 0,
    itemsDuplicate: 0,
    denied: 0,
    captchas: 0,
  });
  // Ids of products already stored during this run; shared by all requests.
  const processedIds = new Set();

  const input = await Apify.getInput();
  const { debug = false, country = Country.CZ, mode = MODE.TEST } = input ?? {};

  // Fail early with a readable message instead of crashing later in a handler
  // on `countryDef.domain`.
  if (!Object.prototype.hasOwnProperty.call(Countries, country)) {
    throw new Error(
      `Unsupported country "${country}", expected one of: ${Object.keys(
        Countries
      ).join(", ")}`
    );
  }
  const countryDef = Countries[country];

  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  // const proxyConfiguration = await Apify.createProxyConfiguration({
  //   groups: proxyGroups,
  // })

  const requestQueue = await Apify.openRequestQueue();
  await enqueueInitialRequest(mode, requestQueue, countryDef);

  const globalContext = { mode, stats, requestQueue, processedIds, countryDef };
  const crawler = new Apify.BasicCrawler({
    requestQueue,
    maxConcurrency: 1, // FIXME
    maxRequestRetries: 1,
    async handleRequestFunction(context) {
      const { label } = context.request.userData;
      switch (label) {
        case LABEL.INDEX:
          return await handleIndex(context, globalContext);
        case LABEL.PRODUCTS:
          return await handleProducts(context, globalContext);
        default:
          // Surface unexpected labels instead of silently marking the request
          // as handled.
          throw new Error(`Unknown request label: ${label}`);
      }
    },
    async handleFailedRequestFunction({ request }) {
      log.error(`Request ${request.url} failed multiple times`, request);
    },
  });

  await crawler.run();
  // Persist the final counters and stop the periodic stats logging.
  await stats.save(true);
  log.info("crawler finished");
});
package.json
1{
2 "name": "cesky-raj-ceskyraj-com-scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify": "*",
11 "@thi.ng/atom": "*",
12 "cheerio": "*",
13 "got-scraping": "*"
14 },
15 "apify": {
16 "title": "Český ráj (ceskyraj.com) scraper",
17 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
18 "isPublic": true,
19 "isDeprecated": false,
20 "issuesEnabled": true,
21 "isAnonymouslyRunnable": true,
22 "notice": "",
23 "pictureUrl": "",
24 "seoTitle": "",
25 "seoDescription": "",
26 "categories": [
27 "ECOMMERCE"
28 ]
29 }
30}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "cesky-raj-ceskyraj-com-scraper",
4 "title": "Český ráj (ceskyraj.com) scraper",
5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Český ráj (ceskyraj.com) scraper",
11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "itemId",
19 "itemName",
20 "itemUrl",
21 "img",
22 "inStock",
23 "currentPrice",
24 "originalPrice",
25 "currency"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "columns": [
31 {
32 "label": "Item ID",
33 "field": "itemUrl",
34 "format": "link",
35 "textField": "itemId"
36 },
37 {
38 "label": "Item Name",
39 "field": "itemName",
40 "format": "text"
41 },
42 {
43 "label": "Img",
44 "field": "img",
45 "format": "image"
46 },
47 {
48 "label": "In Stock",
49 "field": "inStock",
50 "format": "boolean"
51 },
52 {
53 "label": "Current Price",
54 "field": "currentPrice",
55 "format": "number"
56 },
57 {
58 "label": "Original Price",
59 "field": "originalPrice",
60 "format": "number"
61 },
62 {
63 "label": "Currency",
64 "field": "currency",
65 "format": "text"
66 }
67 ]
68 }
69 }
70 }
71 }
72 }
73}
_utils/CloudflareUnblocker.js
1import Apify from "apify"
2import { Session } from "apify/build/session_pool/session.js"
3import { BrowserPool, PlaywrightPlugin } from "browser-pool"
4import playwright from "playwright"
5
6const { utils: { log, requestAsBrowser, puppeteer } } = Apify
7
/**
 * Unblocks crawler sessions that hit a Cloudflare browser challenge.
 *
 * A plain HTTP request is tried first. When the response looks like a challenge
 * page, the page is replayed in a real browser, the challenge POST request the
 * browser produces is captured and re-sent over plain HTTP, and the resulting
 * cookies are stored in the session.
 */
export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * @param options
   * @param options.unblockUrl - URL requested when unblocking a newly created session
   * @param options.proxyConfiguration - Object whose `newUrl(sessionId)` supplies proxy URLs
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    // Switched on after the first 403 response, see `unblock`
    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   * @param options
   * @param options.session {Session} - Session to unblock
   * @param options.request - Request whose `url` is fetched; its `retryCount` may be reset
   * @return {Promise<*>} - Response of the plain request, or of the solved challenge
   * @throws {Error} - When the session is already being renewed or the challenge is not solved
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    // Remember the TLS mode before the request, so a repeated captcha can be told apart
    const oldShouldUseTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      // Browser challenge detected starting the bypassing
      try {
        this._markSessionBeingRenewed(session)
        // `await` is required here: without it `finally` would release the
        // renewal lock before the challenge has actually been solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    } else if (response.statusCode === 403) {
      // Cloudflare captcha detected -> switching to slower TLS
      if (oldShouldUseTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await Apify.setValue(
          `CAPTCHA-HTML-${Math.random() * 1000}`,
          response.body,
          { contentType: "text/html" },
        )
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Solves the challenge by starting the browser and saving the cookies to the session.
   * @param options
   * @param options.request
   * @param options.response
   * @param options.session
   * @param options.requestOptions
   * @return {Promise<*>}
   * @private
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      // Keep the page for later inspection
      await Apify.setValue(`NO-COOKIES-HTML-${Math.random() * 1000}`, body, {
        contentType: "text/html",
      })
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // The browser never issued the challenge request within the wait time
      this._throwError("Challenge request was not captured")
    }

    const browserHeaders = cloudflareAuthReq.headers()
    const payload = cloudflareAuthReq.postData()
    const finalRequestOpts = {
      ...requestOptions,
      url: cloudflareAuthReq.url(),
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(cloudflareAuthReq.url()),
        // Content-Length counts bytes, not UTF-16 code units
        "Content-Length": Buffer.byteLength(payload ?? ""),
      },
      method: "POST",
    }
    // Send the challenge response from requestAsBrowser
    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      // Success
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Something went wrong - challenge was not solved.
    session.retire()
    await Apify.setValue(
      `BLOCKED-HTML-${challengeResponse.statusCode}-${Math.random() * 1000}`,
      challengeResponse.body,
      { contentType: "text/html" },
    )
    this._throwError("Blocked")
  }

  /**
   * Locks session
   * @TODO: we could add this function to the Session natively
   * @param session {Session}
   * @private
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Unlocks session
   * @param session {Session}
   * @private
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Gets proxy URL
   * @param session {Session};
   * @return {String}
   * @private
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is currently locked by `_markSessionBeingRenewed`.
   * @param session {Session}
   * @return {boolean}
   * @private
   */
  _isSessionBeingRenewed (session) {
    return session.userData.isBeingRenewed
  }

  /**
   * Throws prefixed error
   * @param message {String} - Error message
   * @private
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens new page, where solves the challenge and returns the auth request details.
   * @param response - Challenge response that is replayed to the browser
   * @param request - Request whose `url` the browser navigates to
   * @param session {Session} - Session used for the image requests
   * @return {Promise<Object>} - Auth request; undefined when no POST request was intercepted
   * @private
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    // Add request interceptor to get the authRequest for unlocking our session and forward other network requests.
    await puppeteer.addInterceptRequestHandler(page, async req => {
      const reqUrl = req.url()
      const method = req.method()

      if (request.url === reqUrl && method === "GET") {
        // Serve the already downloaded challenge page instead of fetching it again
        this.log.info(`Mocking initial navigation request: ${req.url()}`)
        await req.respond({ status: 200, body, headers })
      } else if (this._detectChallengeRequestFunction(req)) {
        // Capture the challenge answer; it is re-sent later over plain HTTP
        authRequest = req
        await req.abort()
      } else if (reqUrl.includes("transparent.gif")) {
        const imageResponse = await this._sendImageRequest(req, session)
        await req.respond({
          status: 200,
          body: imageResponse.body,
          headers: imageResponse.headers,
        })
      } else {
        await req.abort()
      }
    })

    // Add debug to page log
    // @ts-ignore
    page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

    // Navigate to the unblock url.
    await page.evaluate(url => {
      // @ts-ignore
      window.location.href = url
    }, request.url)

    await this._waitUntilChallengeFinishedFunction()

    // TODO: Probably just await page.close()
    // Recycling is best effort; a failure here must not fail the unblocking
    // @ts-ignore
    this.browserPool.recyclePage(page).catch(() => {
    })
    return authRequest
  }

  /**
   * Sends request with Cloudflare image like headers.
   * @param req {Request} - Puppeteer request
   * @param session {Session} - Session instance
   * @return {Promise<import("stream").Readable | import("http").IncomingMessage>}
   * @private
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Gets the browser headers;
   * @param cookieString {String} - Cookie header string.
   * @return {Object} - Browser like headers.
   * @private
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Detects the Cloudflare challenge page by 503 status code and the CDATA tag;
   * @param response
   * @return {boolean}
   * @private
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Detects Challenge request
   * @param request {Request} - Puppeteer request
   * @return {boolean} - true if challenge request detected
   * @private
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits until the challenge is computed (fixed 5 second sleep).
   * @return {Promise<void>}
   * @private
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Checks if challenge is successfully solved.
   * @param challengeResponse
   * @return {boolean}
   * @private
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Gets request options - these options are compatible with `Apify.utils.requestAsBrowser` and `@apify/http-request`
   * @param session {Session} - Session instance
   * @return {Object} - Contains request options
   */
  getRequestOptions (session) {
    const proxyUrl = this._getProxyUrl(session)

    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl,
    }

    if (this._shouldUseCustomTLS) {
      // @ts-ignore
      if (!requestOptions.https) requestOptions.https = {}
      requestOptions.https.ciphers = "AES256-SHA"
    }

    return requestOptions
  }

  /**
   * Creates new Session instance for the SessionPool.
   * A failed unblock attempt is logged and the session is returned anyway.
   * @param sessionPool {SessionPool}.
   * @return {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Gets options for the CheerioCrawler use interface.
   * @return {Object} - CheerioCrawler options
   */
  getCrawlerOptions () {
    const that = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      async prepareRequestFunction ({ request, session }) {
        const newOptions = that.getRequestOptions(session)
        request.headers = newOptions.headers
        // `getRequestOptions` stores the cipher list under `https.ciphers`
        const ciphers = newOptions.https?.ciphers
        if (ciphers) {
          // NOTE(review): the `{ ciphers }` shape is kept from the original code —
          // confirm it is what the crawler expects in `requestOptions`.
          // @ts-ignore
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}
_utils/common.js
1import { createHash } from 'crypto';
2
3// inspired by @drobnikj
/**
 * Builds a unique key for a URL: the SHA-256 hex digest of the URL with its
 * leading protocol removed, so `http://` and `https://` variants share a key.
 *
 * Only the leading scheme is stripped. A `://` occurring later in the URL
 * (e.g. inside a query parameter) is kept, and a URL without a protocol is
 * hashed as is.
 *
 * @param {string} url - URL to derive the key from.
 * @returns {string} 64-character hexadecimal digest.
 */
const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = String(url).replace(/^[a-z][a-z\d+.-]*:\/\//i, ''); // Remove protocol
  return createHash('sha256').update(cleanUrl).digest('hex');
};
_utils/stats.js
1// inspired by hlidac-shopu
2import Apify from "apify";
3import { defAtom } from "@thi.ng/atom";
4
// TODO: make this lowest common denominator
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0
};

const inc = x => x + 1;
const dec = x => x - 1;

/**
 * Named counters kept in an atom, logged every 20 seconds and persisted to the
 * key-value store under the "STATS" key.
 */
// TODO: stats should be in atom and updated via swap function atomically
class Stats {
  /**
   * @param {object} init - Initial counter values.
   */
  constructor(init) {
    this.stats = defAtom(init);
    // Periodic logging; stopped by `save(true)`.
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /** Increments the counter stored under `key` by one. */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /** Decrements the counter stored under `key` by one. */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /** Adds `value` to the counter stored under `key`. */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /** Returns the current counters. */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counters as JSON. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Persists the counters and logs them; also logs the denied ratio when any
   * `ok` responses were counted.
   * @param final {boolean} - If true, clearInterval apply
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.stats.deref();
    await Apify.setValue("STATS", stats);
    Apify.utils.log.info("STATS saved!");
    if (stats.ok) {
      // Parenthesised on purpose: `a ?? 0 / b` parses as `a ?? (0 / b)`.
      const deniedRatio = ((stats.denied ?? 0) / stats.ok) * 100;
      Apify.utils.log.info(`Denied ratio: ${deniedRatio} %`);
    }
    this.log();
  }
}
64
/**
 * Creates a Stats instance seeded from the value stored under "STATS", falling
 * back to `init` and then to the default counters, and registers it to be
 * saved on the "persistState" and "migrating" events.
 *
 * @param {function} fn - Maps the seed counters to the initial counters.
 * @param {*} init - Counters used when nothing is stored yet.
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const persisted = await Apify.getValue("STATS");
  const seed = persisted ?? init ?? defaultStats;
  const state = new Stats(fn(seed));

  // The same handler instance serves both events.
  const persistState = () => state.save();
  for (const eventName of ["persistState", "migrating"]) {
    Apify.events.on(eventName, persistState);
  }

  return state;
}
Developer
Maintained by Community
Categories