Český ráj (ceskyraj.com) scraper avatar
Český ráj (ceskyraj.com) scraper
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Český ráj (ceskyraj.com) scraper

Český ráj (ceskyraj.com) scraper

strajk-old/cesky-raj-ceskyraj-com-scraper

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16

# Copy only package.json before installing dependencies (the remaining files are copied at the end).
COPY package.json ./

# Silence npm progress output, then install production dependencies only, skipping optional ones.
RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

# Copy the rest of the actor's files into the image.
COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "Český ráj (ceskyraj.com) scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "mode": {
8      "title": "Mode",
9      "description": "",
10      "type": "string",
11      "editor": "select",
12      "default": "TEST",
13      "prefill": "TEST",
14      "enumTitles": [
15        "TEST",
16        "FULL",
17        "SINGLE"
18      ],
19      "enum": [
20        "TEST",
21        "FULL",
22        "SINGLE"
23      ]
24    },
25    "country": {
26      "title": "Country",
27      "description": "",
28      "type": "string",
29      "editor": "select",
30      "default": "CZ",
31      "prefill": "CZ",
32      "enumTitles": [
33        "CZ",
34        "SK",
35        "UK",
36        "DE",
37        "AT",
38        "HU"
39      ],
40      "enum": [
41        "CZ",
42        "SK",
43        "UK",
44        "DE",
45        "AT",
46        "HU"
47      ]
48    },
49    "debug": {
50      "title": "Debug",
51      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
52      "type": "boolean",
53      "editor": "checkbox",
54      "default": false
55    }
56  },
57  "required": [
58    "mode",
59    "country"
60  ]
61}

apify.json

1{
2  "name": "cesky-raj-ceskyraj-com-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "template": "project_cheerio_crawler"
7}

main.js

1/**
2 * TODO:
3 * - consider proxies
4 * - DRY price parsing
5 * - different countries to input
6 *
7 * Beware that same product can have multiple valid urls
8 * - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
9 * - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
10 * - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
11 *
12 * Variants can affect price
13 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
14 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
15 *
16 * Pagination:
17 * beware: only first next 3 pages are listed
18 * -> need to use total amount of products & product per page to calculate total amount of pages
19 */
20
21import Apify from "apify";
22import cheerio from "cheerio";
23import { gotScraping } from "got-scraping";
24import { withPersistedStats } from "./_utils/stats.js";
25
26const { log } = Apify.utils;
27
/**
 * Request labels; the crawler's request handler dispatches on `userData.label`.
 */
const LABEL = Object.freeze({
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
});

/**
 * Run modes accepted on input; they select which start URLs get enqueued.
 */
const MODE = Object.freeze({
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
});

/**
 * Country codes accepted on input.
 */
// TODO: Maybe unify the Country enum with the Countries definitions below
const Country = Object.freeze({
  CZ: "CZ",
  SK: "SK",
  UK: "UK",
  DE: "DE",
  AT: "AT",
  HU: "HU",
});

/**
 * Per-country settings, keyed by country code.
 * - `domain`: suffix appended to `alza.` when building URLs and the Host header
 * - `currency`: currency code copied into every scraped product
 */
const Countries = Object.freeze({
  CZ: {
    domain: "cz",
    currency: "CZK",
  },
  SK: {
    domain: "sk",
    currency: "EUR",
  },
  UK: {
    domain: "co.uk",
    currency: "GBP",
  },
  DE: {
    domain: "de",
    currency: "EUR",
  },
  AT: {
    domain: "at",
    currency: "EUR",
  },
  HU: {
    domain: "hu",
    currency: "HUF",
  },
});
78
/**
 * Seeds the request queue with the start URLs for the selected run mode.
 *
 * - FULL: the category sitemap of the selected country (label INDEX)
 * - TEST: three fixed category listings (label PRODUCTS)
 * - SINGLE: one fixed category listing that spans multiple pages (label PRODUCTS)
 *
 * Any other mode enqueues nothing.
 *
 * @param {string} type - one of the MODE values
 * @param {object} requestQueue - queue exposing `addRequest`
 * @param {{domain: string}} countryDef - country settings; only read in FULL mode
 * @returns {Promise<void>}
 */
async function enqueueInitialRequest(type, requestQueue, countryDef) {
  const seeds = [];

  switch (type) {
    case MODE.FULL:
      seeds.push([
        LABEL.INDEX,
        `https://alza.${countryDef.domain}/_sitemap-categories.xml`,
      ]);
      break;
    case MODE.TEST:
      seeds.push(
        [LABEL.PRODUCTS, "https://www.alza.cz/fotobrasny/18848740.htm"],
        [LABEL.PRODUCTS, "https://www.alza.cz/objektivy-sony-fe/18879399.htm"],
        [
          LABEL.PRODUCTS,
          "https://www.alza.cz/baterie-pro-fotoaparaty/18851400.htm",
        ]
      );
      break;
    case MODE.SINGLE:
      // single category, but containing multiple pages
      seeds.push([
        LABEL.PRODUCTS,
        "https://www.alza.sk/iphone-mobilne-telefony/18851638.htm",
      ]);
      break;
    default:
      break;
  }

  // Enqueue one at a time, in declaration order.
  for (const [label, url] of seeds) {
    await requestQueue.addRequest({ userData: { label }, url });
  }
}
105
/**
 * Handles an INDEX request: downloads the category sitemap and enqueues every
 * `<url><loc>` entry as a PRODUCTS request.
 *
 * @param {object} context - crawler context; `request` is required, `session` may be absent
 * @param {object} globalContext - shared run state; only `requestQueue` is used here
 * @returns {Promise<void>}
 * @throws {Error} when the response status is neither 200 nor 404 (treated as blocked)
 */
async function handleIndex({ request, session = null }, { requestQueue }) {
  const response = await gotScraping({
    url: request.url,
  });
  if (![200, 404].includes(response.statusCode)) {
    // The crawler runs without a session pool, so `session` may be null here.
    session?.retire();
    // Presumably so that a blocked attempt does not use up the retry budget.
    request.retryCount--;
    throw new Error(
      `Blocked, statusCode ${response.statusCode}, url: ${request.url}`
    );
  }
  const $ = cheerio.load(response.body, { xmlMode: true });
  // `.get()` unwraps the cheerio collection into a plain array of strings.
  const urls = $("url loc")
    .map((i, el) => $(el).text().trim())
    .get()
    .filter(Boolean);
  for (const url of urls) {
    await requestQueue.addRequest({
      userData: { label: LABEL.PRODUCTS },
      url,
    });
  }
}
129
/**
 * Reads the text of `selector` inside a product tile and reduces it to digits
 * and separators (`.` and `,`), dropping a trailing `,-` suffix.
 *
 * @param {object} $item - cheerio wrapper of one product tile
 * @param {string} selector - selector of the price element within the tile
 * @returns {string} cleaned price text, empty when the element is missing
 */
function extractPrice($item, selector) {
  return $item
    .find(selector)
    .text()
    .trim() // so the ",-" suffix is matched even when followed by whitespace
    .replace(/,-$/, "")
    .replace(/[^\d.,]/g, "");
}

/**
 * Converts a cleaned price string to a number.
 * Heuristic: the last `.`/`,` is a decimal separator only when at most two
 * digits follow it; every other separator is treated as digit grouping.
 *
 * @param {string} price - output of `extractPrice`
 * @returns {number|undefined} numeric value, or undefined for empty/unparsable input
 */
function priceToNumber(price) {
  if (!price) return undefined;
  const separatorIndex = Math.max(
    price.lastIndexOf(","),
    price.lastIndexOf(".")
  );
  const hasFraction =
    separatorIndex !== -1 && price.length - separatorIndex - 1 <= 2;
  const integerPart = (
    hasFraction ? price.slice(0, separatorIndex) : price
  ).replace(/[.,]/g, "");
  const fractionPart = hasFraction ? price.slice(separatorIndex + 1) : "";
  const value = Number(`${integerPart || "0"}.${fractionPart || "0"}`);
  return Number.isFinite(value) ? value : undefined;
}

/**
 * Handles a PRODUCTS request: downloads one category listing page, enqueues the
 * remaining pages when this is the first page, and pushes every product not
 * seen before to the default dataset.
 *
 * @param {object} context - crawler context; only `request` is used
 * @param {object} globalContext - shared run state (`stats`, `processedIds`,
 *   `requestQueue`, `countryDef`)
 * @returns {Promise<void>}
 */
async function handleProducts(
  { request, session = null },
  { stats, processedIds, requestQueue, countryDef }
) {
  console.log("handleProducts", request.url);

  const response = await gotScraping({
    url: request.url,
    headers: {
      Host: `www.alza.${countryDef.domain}`, // IMPORTANT to get prices in CZK, not EUR
    },
    // proxyUrl: await proxyConfiguration.newUrl(session.id),
  });

  const $ = cheerio.load(response.body);

  // No pagination info in URL -> we are on a first/initial page -> enqueue next pages
  if (!request.url.match(/-p\d+\.htm/)) {
    // Beware: the pager only links the next 3 pages, so the page count is
    // derived from the total number of products instead.
    const totalAmountOfProducts = Number.parseInt(
      $("#lblNumberItem").text(),
      10
    );
    const productsPerPage = 24; // TODO: Maybe to constant
    // A missing counter yields NaN; treat it as "no further pages".
    const totalPages = Number.isNaN(totalAmountOfProducts)
      ? 0
      : Math.ceil(totalAmountOfProducts / productsPerPage);
    for (let i = 2; i <= totalPages; i++) {
      const url = request.url // e.g. `/iphone-mobilne-telefony/18851638.htm`
        .replace(/\.htm$/, "") // `/iphone-mobilne-telefony/18851638`
        .concat(`-p${i}.htm`); // e.g. `/iphone-mobilne-telefony/18851638-p2.htm`
      console.log("enqueue next page", url);
      // Awaited so that a failed enqueue fails this request instead of being lost.
      await requestQueue.addRequest({
        userData: { label: LABEL.PRODUCTS },
        url,
      });
    }
  }

  const products = [];
  for (const el of $(".browsingitem").toArray()) {
    const $item = $(el);
    const itemId = $item.attr("data-id");
    if (processedIds.has(itemId)) {
      stats.inc("itemsDuplicate");
      continue;
    }
    stats.inc("itemsUnique");
    processedIds.add(itemId);

    const img = $item.find("a.pc.browsinglink img").data("src");
    const $nameLink = $item.find("a.name.browsinglink").first();
    const relativeItemUrl = $nameLink.attr("href");
    const itemUrl = relativeItemUrl
      ? new URL(relativeItemUrl, `https://alza.${countryDef.domain}`).href
      : undefined;
    const itemName = $nameLink
      .text()
      .replace(/([\n\r])/g, "") // replace newlines and carriage returns
      .trim();

    const currentPrice = extractPrice($item, ".price .c2");
    const originalPrice = extractPrice($item, ".price .np2");
    const currentValue = priceToNumber(currentPrice);
    const originalValue = priceToNumber(originalPrice);

    products.push({
      itemId,
      itemName,
      itemUrl,
      img,
      inStock: "TODO",
      currentPrice,
      originalPrice,
      currency: countryDef.currency,
      category: "FIXME",

      // Derived
      discounted: Boolean(originalPrice) && originalPrice !== currentPrice,
      _discount:
        originalValue > 0 && currentValue !== undefined
          ? (1 - currentValue / originalValue) * 100
          : undefined,
    });
    console.log(`Found product: ${itemId}`);
  }

  // One awaited push per page so storage errors surface on this request.
  if (products.length > 0) {
    await Apify.pushData(products);
  }
}
233
// Proxy group names for the proxy configuration; currently only referenced by
// the commented-out `Apify.createProxyConfiguration` call in the main function.
const proxyGroups = [
  "HUNGARY",
  "CZECH_LUMINATI",
  "GERMANY",
  "FRANCE",
  "ITALY",
  "SPAIN",
];
242
// Actor entry point: reads input, seeds the request queue and runs the crawler.
Apify.main(async () => {
  await Apify.utils.purgeLocalStorage();

  const stats = await withPersistedStats((x) => x, {
    categories: 0,
    items: 0,
    itemsUnique: 0,
    itemsDuplicate: 0,
    denied: 0,
    captchas: 0,
  });
  // Product ids already pushed during this run; used to skip duplicates.
  const processedIds = new Set();

  const input = await Apify.getInput();
  const { debug = false, country = Country.CZ, mode = MODE.TEST } = input ?? {};

  const countryDef = Countries[country];
  if (!countryDef) {
    // Fail fast instead of crashing later on `countryDef.domain`.
    throw new Error(
      `Unsupported country "${country}", expected one of: ${Object.keys(
        Countries
      ).join(", ")}`
    );
  }

  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  // const proxyConfiguration = await Apify.createProxyConfiguration({
  //   groups: proxyGroups,
  // })

  const requestQueue = await Apify.openRequestQueue();
  await enqueueInitialRequest(mode, requestQueue, countryDef);

  const globalContext = { mode, stats, requestQueue, processedIds, countryDef };
  const crawler = new Apify.BasicCrawler({
    requestQueue,
    maxConcurrency: 1, // FIXME
    maxRequestRetries: 1,
    async handleRequestFunction(context) {
      const { label } = context.request.userData;
      switch (label) {
        case LABEL.INDEX:
          return await handleIndex(context, globalContext);
        case LABEL.PRODUCTS:
          return await handleProducts(context, globalContext);
        default:
          // An unlabeled request would otherwise be marked as handled without doing anything.
          throw new Error(
            `Unknown request label "${label}", url: ${context.request.url}`
          );
      }
    },
    async handleFailedRequestFunction({ request }) {
      log.error(`Request ${request.url} failed multiple times`, request);
    },
  });

  await crawler.run();
  // Persist the final counters and stop the periodic stats logging timer.
  await stats.save(true);
  log.info("crawler finished");
});

package.json

1{
2  "name": "cesky-raj-ceskyraj-com-scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify": "*",
11    "@thi.ng/atom": "*",
12    "cheerio": "*",
13    "got-scraping": "*"
14  },
15  "apify": {
16    "title": "Český ráj (ceskyraj.com) scraper",
17    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
18    "isPublic": true,
19    "isDeprecated": false,
20    "issuesEnabled": true,
21    "isAnonymouslyRunnable": true,
22    "notice": "",
23    "pictureUrl": "",
24    "seoTitle": "",
25    "seoDescription": "",
26    "categories": [
27      "ECOMMERCE"
28    ]
29  }
30}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "cesky-raj-ceskyraj-com-scraper",
4  "title": "Český ráj (ceskyraj.com) scraper",
5  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "Český ráj (ceskyraj.com) scraper",
11      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "itemId",
19              "itemName",
20              "itemUrl",
21              "img",
22              "inStock",
23              "currentPrice",
24              "originalPrice",
25              "currency"
26            ]
27          },
28          "display": {
29            "component": "table",
30            "columns": [
31              {
32                "label": "Item ID",
33                "field": "itemUrl",
34                "format": "link",
35                "textField": "itemId"
36              },
37              {
38                "label": "Item Name",
39                "field": "itemName",
40                "format": "text"
41              },
42              {
43                "label": "Img",
44                "field": "img",
45                "format": "image"
46              },
47              {
48                "label": "In Stock",
49                "field": "inStock",
50                "format": "boolean"
51              },
52              {
53                "label": "Current Price",
54                "field": "currentPrice",
55                "format": "number"
56              },
57              {
58                "label": "Original Price",
59                "field": "originalPrice",
60                "format": "number"
61              },
62              {
63                "label": "Currency",
64                "field": "currency",
65                "format": "text"
66              }
67            ]
68          }
69        }
70      }
71    }
72  }
73}

_utils/CloudflareUnblocker.js

1import Apify from "apify"
2import { Session } from "apify/build/session_pool/session.js"
3import { BrowserPool, PlaywrightPlugin } from "browser-pool"
4import playwright from "playwright"
5
6const { utils: { log, requestAsBrowser, puppeteer } } = Apify
7
/**
 * Unblocks crawler sessions that hit a Cloudflare browser challenge: the
 * challenge page is replayed in a real browser, the resulting challenge POST is
 * captured and re-sent via `requestAsBrowser`, and the cookies are stored on the session.
 */
export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * @param options
   * @param options.unblockUrl {String} - URL requested to unblock a new session
   * @param options.proxyConfiguration - object exposing `newUrl(sessionId)`
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    // Switched on after the first 403 (captcha) response, see `unblock`.
    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   * @param options
   * @param options.session {Session}
   * @param options.request - object with `url` (and `retryCount`)
   * @return {Promise<*>} - response of the plain request, or of the solved challenge
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    const oldShouldUseTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      // Browser challenge detected starting the bypassing
      try {
        this._markSessionBeingRenewed(session)
        // `await` is required: without it `finally` unlocks the session
        // before the challenge has actually been solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    } else if (response.statusCode === 403) {
      // Cloudflare captcha detected -> switching to slower TLS
      if (oldShouldUseTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await Apify.setValue(
          `CAPTCHA-HTML-${Math.random() * 1000}`,
          response.body,
          { contentType: "text/html" },
        )
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Solves the challenge by starting the browser and saving the cookies to the session.
   * @param options
   * @param options.request
   * @param options.response
   * @param options.session
   * @param options.requestOptions
   * @return {Promise<*>}
   * @private
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      // Keep the HTML for later inspection.
      await Apify.setValue(`NO-COOKIES-HTML-${Math.random() * 1000}`, body, {
        contentType: "text/html",
      })
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // The browser never issued the challenge POST within the wait window.
      session.retire()
      this._throwError("Challenge request was not captured")
    }

    const browserHeaders = cloudflareAuthReq.headers()
    const payload = cloudflareAuthReq.postData() ?? ""
    const finalRequestOpts = {
      ...requestOptions,
      url: cloudflareAuthReq.url(),
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(cloudflareAuthReq.url()),
        // Content-Length counts bytes, not UTF-16 code units.
        "Content-Length": Buffer.byteLength(payload),
      },
      method: "POST",
    }
    // Send the challenge response from requestAsBrowser
    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      // Success
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Something went wrong - challenge was not solved.
    session.retire()
    await Apify.setValue(
      `BLOCKED-HTML-${challengeResponse.statusCode}-${Math.random() * 1000}`,
      challengeResponse.body,
      { contentType: "text/html" },
    )
    this._throwError("Blocked")
  }

  /**
   * Locks session
   * @TODO: we could add this function to the Session natively
   * @param session {Session}
   * @private
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Unlocks session
   * @param session {Session}
   * @private
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Gets proxy URL
   * @param session {Session};
   * @return {String}
   * @private
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is currently locked by `_markSessionBeingRenewed`.
   * @param session {Session}
   * @return {boolean}
   * @private
   */
  _isSessionBeingRenewed (session) {
    return session.userData.isBeingRenewed
  }

  /**
   * Throws prefixed error
   * @param message {String} - Error message
   * @private
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens new page, where solves the challenge and returns the auth request details.
   * @param response - the challenge response; served to the page as the initial navigation
   * @param request - request object whose `url` is navigated to
   * @param session {Session}
   * @return {Promise<Object>} - Auth request, undefined when none was captured
   * @private
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    try {
      // Add request interceptor to capture the authRequest that unlocks our session;
      // all other requests are mocked, proxied or aborted.
      await puppeteer.addInterceptRequestHandler(page, async req => {
        const reqUrl = req.url()
        const method = req.method()

        if (request.url === reqUrl && method === "GET") {
          this.log.info(`Mocking initial navigation request: ${req.url()}`)
          await req.respond({ status: 200, body, headers })
        } else if (this._detectChallengeRequestFunction(req)) {
          authRequest = req
          await req.abort()
        } else if (reqUrl.includes("transparent.gif")) {
          const imageResponse = await this._sendImageRequest(req, session)
          await req.respond({
            status: 200,
            body: imageResponse.body,
            headers: imageResponse.headers,
          })
        } else {
          await req.abort()
        }
      })

      // Add debug to page log
      // @ts-ignore
      page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

      // Navigate to the unblock url.
      await page.evaluate(url => {
        // @ts-ignore
        window.location.href = url
      }, request.url)

      await this._waitUntilChallengeFinishedFunction()
    } finally {
      // Release the page even when navigation or interception fails.
      // TODO: Probably just await page.close()
      // @ts-ignore
      this.browserPool.recyclePage(page).catch(e => {
        // Best effort only; a failed recycle must not fail the unblocking.
        this.log.debug(`Could not recycle page: ${e.message}`)
      })
    }
    return authRequest
  }

  /**
   * Sends request with Cloudflare image like headers.
   * @param req {Request} - Puppeteer request
   * @param session {Session} - Session instance
   * @return {Promise<import("stream").Readable | import("http").IncomingMessage>}
   * @private
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Gets the browser headers;
   * @param cookieString {String} - Cookie header string.
   * @return {Object} - Browser like headers.
   * @private
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Detects the Cloudflare challenge page by 503 status code and the CDATA tag;
   * @param response
   * @return {boolean}
   * @private
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Detects Challenge request
   * @param request {Request} - Puppeteer request
   * @return {boolean} - true if challenge request detected
   * @private
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits until the challenge is computed (fixed 5 second sleep).
   * @return {Promise<void>}
   * @private
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Checks if challenge is successfully solved.
   * @param challengeResponse
   * @return {boolean}
   * @private
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Gets request options - these options are compatible with `Apify.utils.requestAsBrowser` and `@apify/http-request`
   * @param session {Session} - Session instance
   * @return {Object} - Contains `headers`, `proxyUrl` and, once custom TLS is on, `https.ciphers`
   */
  getRequestOptions (session) {
    const proxyUrl = this._getProxyUrl(session)

    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl,
    }

    if (this._shouldUseCustomTLS) {
      requestOptions.https = { ciphers: "AES256-SHA" }
    }

    return requestOptions
  }

  /**
   * Creates new Session instance for the SessionPool.
   * A failed unblock is logged and the (still blocked) session is returned anyway.
   * @param sessionPool {SessionPool}.
   * @return {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Gets options for the CheerioCrawler use interface.
   * @return {Object} - CheerioCrawler options
   */
  getCrawlerOptions () {
    const that = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      async prepareRequestFunction ({ request, session }) {
        const newOptions = that.getRequestOptions(session)
        request.headers = newOptions.headers
        // `getRequestOptions` stores the ciphers under `https`, not at the top level.
        const ciphers = newOptions.https?.ciphers
        if (ciphers) {
          // @ts-ignore
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}

_utils/common.js

1import { createHash } from 'crypto';
2
3// inspired by @drobnikj
/**
 * Builds a request unique key from a URL: the SHA-256 hex digest of the URL
 * without its leading `scheme://`, so http and https variants share one key.
 *
 * @param {string} url - absolute URL; a URL without a scheme is hashed as is
 * @returns {string} 64-character hex digest
 */
const createUniqueKeyFromUrl = (url) => {
  // Strip only the leading scheme. Splitting on '://' would also cut the URL at
  // any later '://' (e.g. inside a query parameter) and make distinct URLs collide.
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, '');
  return createHash('sha256').update(cleanUrl).digest('hex');
};

_utils/stats.js

1// inspired by hlidac-shopu
2import Apify from "apify";
3import { defAtom } from "@thi.ng/atom";
4
// Fallback counters, used by withPersistedStats when neither persisted stats
// nor an explicit `init` object is available.
// TODO: make this the lowest common denominator
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0
};
14
const inc = x => x + 1;
const dec = x => x - 1;

/**
 * Named counters kept in an atom, logged every 20 seconds and persisted to the
 * key-value store under the "STATS" key.
 */
// TODO: stats should be in atom and updated via swap function atomically
class Stats {
  /**
   * @param init {Object} - initial counter values
   */
  constructor(init) {
    this.stats = defAtom(init);
    // Periodic progress log; cleared by `save(true)`.
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /** Increments counter `key` by one. */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /** Decrements counter `key` by one. */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /** Adds `value` to counter `key`. */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /** Returns the current counters object. */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counters as JSON. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Persists the counters and logs them; also logs the denied/ok ratio when `ok` is non-zero.
   * @param final {boolean} - If true, clearInterval apply
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.stats.deref();
    await Apify.setValue("STATS", stats);
    Apify.utils.log.info("STATS saved!");
    if (stats.ok) {
      // Parentheses matter: `a ?? 0 / b` parses as `a ?? (0 / b)`.
      Apify.utils.log.info(
        `Denied ratio: ${((stats.denied ?? 0) / stats.ok) * 100} %`
      );
    }
    this.log();
  }
}
64
/**
 * Creates a Stats instance seeded from the persisted "STATS" value, falling back
 * to `init` and then to `defaultStats`, and registers it to be saved on the
 * "persistState" and "migrating" events.
 *
 * @param {function} fn - maps the seed counters to the initial Stats state
 * @param {*} init - counters used when nothing has been persisted yet
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const persisted = await Apify.getValue("STATS");
  const seed = persisted ?? init ?? defaultStats;
  const tracker = new Stats(fn(seed));

  const saveTracker = () => tracker.save();
  for (const eventName of ["persistState", "migrating"]) {
    Apify.events.on(eventName, saveTracker);
  }

  return tracker;
}
Developer
Maintained by Community
Categories