Bike24 (bike24.de) scraper
Deprecated
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Bike24 (bike24.de) scraper
strajk-old/bike24-bike24-de-scraper
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Bike24 (bike24.de) scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "mode": {
8 "title": "Mode",
9 "description": "",
10 "type": "string",
11 "editor": "select",
12 "default": "FULL",
13 "prefill": "FULL",
14 "enumTitles": [
15 "FULL",
16 "SINGLE"
17 ],
18 "enum": [
19 "FULL",
20 "SINGLE"
21 ]
22 },
23 "debug": {
24 "title": "Debug",
25 "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
26 "type": "boolean",
27 "editor": "checkbox",
28 "default": false
29 }
30 },
31 "required": [
32 "mode"
33 ]
34}
apify.json
1{
2 "name": "bike24-bike24-de-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "template": "project_cheerio_crawler"
7}
main.js
1/**
2 * Dev notes
3 * ===
4 *
5 * - uses Cloudflare anti-bot protection
6 *
7 * Params
8 * ---
9 * -
10 *
11 * Selectors
12 * ---
13 * -
14 */
15
16import { URL } from "node:url";
17import Apify from "apify";
18
19// This is here only cause my bundler does not traverse dependencies
20// FIXME
21import "browser-pool";
22import "playwright";
23
24import { withPersistedStats } from "./_utils/stats.js";
25import CloudflareUnblocker from "./_utils/CloudflareUnblocker.js";
26const { log } = Apify.utils;
27
/** Request labels; `handlePageFunction` uses them to pick the page handler. */
const LABELS = Object.freeze({
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
});

/**
 * Supported run modes (same values as the `mode` enum in INPUT_SCHEMA.json).
 * Frozen so a typo such as `MODE.FUL = ...` cannot silently add a member.
 */
const MODE = Object.freeze({
  FULL: "FULL",
  SINGLE: "SINGLE",
});

/** Origin against which the site's relative links are resolved. */
const BASE_URL = "https://www.bike24.com";
41
/**
 * Seeds the request queue according to the selected run mode.
 *
 * - `MODE.FULL` starts at the brands index (handled as `LABELS.INDEX`).
 * - `MODE.SINGLE` starts at one fixed brand listing (handled as
 *   `LABELS.PRODUCTS`).
 *
 * @param {string} mode - One of the `MODE` values.
 * @param {{ addRequest: Function }} requestQueue - Queue receiving the seed request.
 * @returns {Promise<void>}
 * @throws {Error} When `mode` is not a known mode; previously this enqueued
 *   nothing and the crawl finished "successfully" with no data.
 */
async function enqueueInitialRequest(mode, requestQueue) {
  let seed;
  switch (mode) {
    case MODE.FULL:
      seed = { label: LABELS.INDEX, path: "/brands" };
      break;
    case MODE.SINGLE:
      seed = { label: LABELS.PRODUCTS, path: "/brands/100percent" };
      break;
    default:
      throw new Error(`Unsupported mode: ${String(mode)}`);
  }

  await requestQueue.addRequest({
    userData: { label: seed.label },
    // Resolve against BASE_URL so the origin is defined in one place only.
    url: new URL(seed.path, BASE_URL).toString(),
  });
}
55
/**
 * Handles the brands index page: enqueues one `LABELS.PRODUCTS` request per
 * brand link and counts each of them under the `categories` stat.
 *
 * @param {{ $: Function }} context - Crawling context; `$` is the Cheerio handle of the page.
 * @param {{ stats: { inc: Function }, requestQueue: { addRequest: Function } }} globalContext
 * @returns {Promise<void>} Resolves once every brand request has been enqueued.
 */
async function handleIndex({ $ }, { stats, requestQueue }) {
  const pending = [];

  $(".list-brands-sitemap__section-item a").each((i, el) => {
    const href = $(el).attr("href"); // urls are relative
    if (!href) return; // anchor without a target - nothing to crawl

    const name = $(el).text().trim(); // there's extra space at the beginning and end
    stats.inc("categories");
    pending.push(
      requestQueue.addRequest({
        userData: { label: LABELS.PRODUCTS, category: name },
        // The queue needs an absolute URL; the relative href was enqueued before.
        url: new URL(href, BASE_URL).toString(),
      })
    );
  });

  // Await the enqueue calls so a failure fails (and retries) this page
  // instead of surfacing as an unhandled rejection.
  await Promise.all(pending);
}
68
/**
 * Handles one product listing page.
 *
 * On the first page of a listing (no `page` query parameter) it enqueues the
 * remaining pages, reading the page count from the last pagination item. It
 * then pushes every product tile not seen before to the dataset.
 *
 * Stats touched: `items` (every tile), `itemsUnique`, `itemsDuplicate`.
 *
 * @param {{ $: Function, request: { url: string, userData: object } }} context
 * @param {{ stats: { inc: Function }, processedIds: Set<string>, requestQueue: { addRequest: Function } }} globalContext
 * @returns {Promise<void>} Resolves once all enqueue and push operations have finished.
 */
async function handleProducts(
  { $, request },
  { stats, processedIds, requestQueue }
) {
  log.debug(`handleProducts ${request.url}`);
  const pending = [];

  const isFirstPage = !new URL(request.url).searchParams.has("page");
  if (isFirstPage) {
    const lastPageLabel = $(".page-pagination-item").last().text().trim(); // e.g. `12`
    // NaN (no pagination rendered) makes the loop condition false.
    const totalPages = Number.parseInt(lastPageLabel, 10);
    // skip first page, that is already handled
    for (let page = 2; page <= totalPages; page++) {
      const pageUrl = new URL(request.url);
      pageUrl.searchParams.set("page", String(page));
      pending.push(
        requestQueue.addRequest({
          url: pageUrl.toString(),
          userData: {
            label: LABELS.PRODUCTS,
            category: request.userData.category, // pass category name
          },
        })
      );
    }
  }

  $(".product-tile").each((i, el) => {
    const tile = $(el);
    stats.inc("items");

    const href = tile.find(".product-tile__anchor").attr("href"); // relative url
    if (!href) {
      log.warning(`Product tile without a link on ${request.url}, skipping`);
      return;
    }

    const id = href.replace(/[^0-9]/g, ""); // e.g. `p2421335.html` -> `2421335`
    if (processedIds.has(id)) {
      stats.inc("itemsDuplicate");
      return;
    }

    // Every lookup is scoped to this tile. The price and stock selectors used
    // to run against the whole page, so each product received the values of
    // the first product on the page.
    const prices = JSON.parse(tile.find(".productPrice").attr("data-props"));
    const product = {
      itemId: id,
      itemName: tile.find(".product-tile__title").text().trim(),
      // Absolute URL: the dataset view renders this field as a link.
      itemUrl: new URL(href, BASE_URL).toString(),
      img: tile.find(".product-tile__picture img").attr("src"),
      inStock: tile.find(".delivery-message--success").length > 0,
      currentPrice: prices.price,
      originalPrice: prices.oldPrice,
      currency: "EUR",

      // Derived
      _discount: prices.oldPrice
        ? (1 - prices.price / prices.oldPrice) * 100
        : undefined,
    };

    // Mark as processed only after the item was built successfully, so a page
    // that throws above is retried without this product being skipped.
    stats.inc("itemsUnique");
    processedIds.add(id);
    pending.push(Apify.pushData(product));
  });

  await Promise.all(pending);
}
127
// Actor entry point: reads the input, seeds the queue, and runs a
// CheerioCrawler whose sessions are unblocked by CloudflareUnblocker.
Apify.main(async () => {
  await Apify.utils.purgeLocalStorage();

  const stats = await withPersistedStats((x) => x, {
    categories: 0,
    items: 0,
    itemsUnique: 0,
    itemsDuplicate: 0,
  });
  const processedIds = new Set();

  const input = await Apify.getInput();
  // NOTE(review): these fallbacks (debug on, SINGLE) differ from the defaults
  // declared in INPUT_SCHEMA.json (debug: false, mode: FULL) - confirm intent.
  const { debug = true, mode = MODE.SINGLE } = input ?? {};

  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  const requestQueue = await Apify.openRequestQueue();
  await enqueueInitialRequest(mode, requestQueue);

  const proxyConfiguration = await Apify.createProxyConfiguration({
    groups: ["RESIDENTIAL"], // Not really sure which one to use
    // @ts-ignore
    useApifyProxy: true,
  });

  const cloudflareUnblocker = new CloudflareUnblocker({
    unblockUrl: BASE_URL,
    proxyConfiguration,
  });

  const globalContext = { mode, stats, requestQueue, processedIds };
  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    maxConcurrency: debug ? 1 : 3, // debug mode disables concurrency
    maxRequestRetries: 3,
    useSessionPool: true,
    sessionPoolOptions: {
      maxPoolSize: 100,
      // Every new session is run through the unblocker before use.
      createSessionFunction:
        cloudflareUnblocker.createSessionFunction.bind(cloudflareUnblocker),
    },
    proxyConfiguration,
    persistCookiesPerSession: true,
    preNavigationHooks: [
      // Apply the unblocker's headers / proxy / TLS options to each request.
      async (crawlingContext, requestAsBrowserOptions) => {
        const requestOptions = cloudflareUnblocker.getRequestOptions(
          crawlingContext.session
        );
        Object.assign(requestAsBrowserOptions, requestOptions);
      },
    ],
    async handlePageFunction(context) {
      const { request, response } = context;
      const { userData } = request;
      // @ts-ignore
      const { statusCode, body } = response;

      if (statusCode !== 200) {
        // Name the failing page; the raw body is only useful when debugging.
        log.warning(`Unexpected status ${statusCode} for ${request.url}`);
        log.debug(body.toString());
        return;
      }

      switch (userData.label) {
        case LABELS.INDEX:
          return await handleIndex(context, globalContext);
        case LABELS.PRODUCTS:
          return await handleProducts(context, globalContext);
        default:
          log.warning(
            `No handler for label "${userData.label}" (${request.url})`
          );
      }
    },
    async handleFailedRequestFunction({ request }) {
      log.error(`Request ${request.url} failed multiple times`, request);
    },
  });

  await crawler.run();
  // Persist the final counters and stop the periodic stats logger.
  await stats.save(true);
  log.info("That's all folks!");
});
package.json
1{
2 "name": "bike24-bike24-de-scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "@thi.ng/atom": "*",
11 "apify": "*",
12 "browser-pool": "*",
13 "playwright": "*"
14 },
15 "apify": {
16 "title": "Bike24 (bike24.de) scraper",
17 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
18 "isPublic": true,
19 "isDeprecated": false,
20 "issuesEnabled": true,
21 "isAnonymouslyRunnable": true,
22 "notice": "",
23 "pictureUrl": "",
24 "seoTitle": "",
25 "seoDescription": "",
26 "categories": [
27 "ECOMMERCE"
28 ]
29 }
30}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "bike24-bike24-de-scraper",
4 "title": "Bike24 (bike24.de) scraper",
5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Bike24 (bike24.de) scraper",
11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "itemId",
19 "itemName",
20 "itemUrl",
21 "img",
22 "inStock",
23 "currentPrice",
24 "originalPrice",
25 "currency"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "columns": [
31 {
32 "label": "Item ID",
33 "field": "itemUrl",
34 "format": "link",
35 "textField": "itemId"
36 },
37 {
38 "label": "Item Name",
39 "field": "itemName",
40 "format": "text"
41 },
42 {
43 "label": "Img",
44 "field": "img",
45 "format": "image"
46 },
47 {
48 "label": "In Stock",
49 "field": "inStock",
50 "format": "boolean"
51 },
52 {
53 "label": "Current Price",
54 "field": "currentPrice",
55 "format": "number"
56 },
57 {
58 "label": "Original Price",
59 "field": "originalPrice",
60 "format": "number"
61 },
62 {
63 "label": "Currency",
64 "field": "currency",
65 "format": "text"
66 }
67 ]
68 }
69 }
70 }
71 }
72 }
73}
_utils/CloudflareUnblocker.js
1import Apify from "apify"
2import { Session } from "apify/build/session_pool/session.js"
3import { BrowserPool, PlaywrightPlugin } from "browser-pool"
4import playwright from "playwright"
5
6const { utils: { log, requestAsBrowser, puppeteer } } = Apify
7
/**
 * Obtains Cloudflare clearance cookies for crawler sessions.
 *
 * A session is first probed with a plain `requestAsBrowser` call. When the
 * response looks like a Cloudflare browser challenge, the challenge page is
 * replayed in a pooled browser, the POST it produces is captured and re-sent
 * through `requestAsBrowser`, and the resulting cookies are stored in the
 * session.
 */
export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * @param options
   * @param options.unblockUrl {String} - URL requested when a new session is unblocked.
   * @param options.proxyConfiguration - Object exposing `newUrl(sessionId)`.
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    // Switched on after the first 403 response, see `unblock`.
    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   * @param options
   * @param options.session {Session} - Session to unblock.
   * @param options.request - Object with the `url` to probe; its `retryCount` is reset
   *   when the session is locked or the challenge is solved.
   * @return {Promise<*>} - The probe response, or the challenge response when a challenge was solved.
   * @throws {Error} When the session is already being renewed or the challenge could not be solved.
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    const usedCustomTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      // Browser challenge detected starting the bypassing
      this._markSessionBeingRenewed(session)
      try {
        // `await` is required here: without it `finally` runs as soon as the
        // promise is created and the session is unlocked while the challenge
        // is still being solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    }

    if (response.statusCode === 403) {
      // Cloudflare captcha detected -> switching to slower TLS
      if (usedCustomTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await this._saveHtmlSnapshot("CAPTCHA-HTML", response.body)
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Solves the challenge by starting the browser and saving the cookies to the session.
   * @param options
   * @param options.request
   * @param options.response
   * @param options.session
   * @param options.requestOptions
   * @return {Promise<*>} - Response of the replayed challenge request.
   * @throws {Error} When no challenge request was captured or the replay was not accepted;
   *   the session is retired in both cases.
   * @private
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      await this._saveHtmlSnapshot("NO-COOKIES-HTML", body)
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // The browser never issued the challenge POST; there is nothing to replay.
      session.retire()
      this._throwError("Challenge request was not captured")
    }

    const browserHeaders = cloudflareAuthReq.headers()
    const authUrl = cloudflareAuthReq.url()
    const payload = cloudflareAuthReq.postData() ?? ""
    const finalRequestOpts = {
      ...requestOptions,
      url: authUrl,
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(authUrl),
        // Content-Length counts bytes, not UTF-16 code units.
        "Content-Length": Buffer.byteLength(payload),
      },
      method: "POST",
    }
    // Send the challenge response from requestAsBrowser
    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      // Success
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Something went wrong - challenge was not solved.
    session.retire()
    await this._saveHtmlSnapshot(
      `BLOCKED-HTML-${challengeResponse.statusCode}`,
      challengeResponse.body,
    )
    this._throwError("Blocked")
  }

  /**
   * Stores an HTML body in the default key-value store for later inspection.
   * The key is `<keyPrefix>-<random number>`.
   * @param keyPrefix {String}
   * @param html - Body to store.
   * @return {Promise<void>}
   * @private
   */
  async _saveHtmlSnapshot (keyPrefix, html) {
    await Apify.setValue(`${keyPrefix}-${Math.random() * 1000}`, html, {
      contentType: "text/html",
    })
  }

  /**
   * Locks session
   * @TODO: we could add this function to the Session natively
   * @param session {Session}
   * @private
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Unlocks session
   * @param session {Session}
   * @private
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Gets proxy URL
   * @param session {Session};
   * @return {String}
   * @private
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is currently locked by `_markSessionBeingRenewed`.
   * @param session {Session}
   * @return {boolean}
   * @private
   */
  _isSessionBeingRenewed (session) {
    // The flag is absent on fresh sessions; coerce so callers get a boolean.
    return Boolean(session.userData.isBeingRenewed)
  }

  /**
   * Throws prefixed error
   * @param message {String} - Error message
   * @private
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens a new page, lets it solve the challenge and returns the auth request details.
   * @param response - Challenge response served to the page in place of the real navigation.
   * @param request - Object with the `url` to navigate to.
   * @param session {Session} - Session used for the image side request.
   * @return {Promise<Object>} - Captured auth request, or `undefined` when none was seen.
   * @private
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    try {
      // Add request interceptor to get the authRequest for unlocking our session
      // and to answer or abort every other network request.
      // NOTE(review): the pool launches Playwright Firefox while this uses the
      // `puppeteer` interception helper - confirm the two are compatible.
      await puppeteer.addInterceptRequestHandler(page, async req => {
        const reqUrl = req.url()
        const method = req.method()

        if (request.url === reqUrl && method === "GET") {
          this.log.info(`Mocking initial navigation request: ${req.url()}`)
          await req.respond({ status: 200, body, headers })
        } else if (this._detectChallengeRequestFunction(req)) {
          authRequest = req
          await req.abort()
        } else if (reqUrl.includes("transparent.gif")) {
          const imageResponse = await this._sendImageRequest(req, session)
          await req.respond({
            status: 200,
            body: imageResponse.body,
            headers: imageResponse.headers,
          })
        } else {
          await req.abort()
        }
      })

      // Add debug to page log
      // @ts-ignore
      page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

      // Navigate to the unblock url.
      await page.evaluate(url => {
        // @ts-ignore
        window.location.href = url
      }, request.url)

      await this._waitUntilChallengeFinishedFunction()
    } finally {
      // Give the page back even when solving throws, otherwise it stays open.
      // TODO: Probably just await page.close()
      // @ts-ignore
      this.browserPool.recyclePage(page).catch(() => {
      })
    }
    return authRequest
  }

  /**
   * Sends request with Cloudflare image like headers.
   * @param req {Request} - Intercepted browser request
   * @param session {Session} - Session instance
   * @return {Promise<import("stream").Readable | import("http").IncomingMessage>}
   * @private
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Gets the browser headers;
   * @param cookieString {String} - Cookie header string.
   * @return {Object} - Browser like headers.
   * @private
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Detects the Cloudflare challenge page by 503 status code and the CDATA tag;
   * @param response
   * @return {boolean}
   * @private
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Detects Challenge request: any POST issued by the challenge page.
   * @param request {Request} - Intercepted browser request
   * @return {boolean} - true if challenge request detected
   * @private
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits until the challenge is computed (fixed 5 second pause).
   * @return {Promise<void>}
   * @private
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Checks if challenge is successfully solved.
   * @param challengeResponse
   * @return {boolean}
   * @private
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Gets request options - these options are compatible with `Apify.utils.requestAsBrowser` and `@apify/http-request`
   * @param session {Session} - Session instance
   * @return {Object} - Contains `headers`, `proxyUrl` and, once custom TLS is on, `https.ciphers`
   */
  getRequestOptions (session) {
    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl: this._getProxyUrl(session),
    }

    if (this._shouldUseCustomTLS) {
      requestOptions.https = { ciphers: "AES256-SHA" }
    }

    return requestOptions
  }

  /**
   * Creates new Session instance for the SessionPool.
   * The session is returned even when unblocking fails; the failure is only logged.
   * @param sessionPool {SessionPool}.
   * @return {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Gets options for the CheerioCrawler use interface.
   * @return {Object} - CheerioCrawler options
   */
  getCrawlerOptions () {
    const that = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      async prepareRequestFunction ({ request, session }) {
        const newOptions = that.getRequestOptions(session)
        request.headers = newOptions.headers
        // `getRequestOptions` stores the cipher list under `https.ciphers`;
        // the old top-level `newOptions.ciphers` lookup never matched.
        const ciphers = newOptions.https?.ciphers
        if (ciphers) {
          // @ts-ignore
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}
_utils/common.js
1import { createHash } from 'crypto';
2
// inspired by @drobnikj
/**
 * Builds a unique key for a URL: the SHA-256 hex digest of the URL without
 * its leading `scheme://`, so the `http://` and `https://` variants of one
 * address share a key.
 *
 * @param {string} url - Absolute URL; a URL without a scheme is hashed as-is.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
const createUniqueKeyFromUrl = (url) => {
  // Remove only the leading protocol. Splitting on every '://' cut the URL
  // short when a later part (e.g. a redirect query parameter) contained
  // another '://', and threw when there was no protocol at all.
  const cleanUrl = url.replace(/^[a-z][a-z0-9+.-]*:\/\//i, '');
  return createHash('sha256').update(cleanUrl).digest('hex');
};
_utils/stats.js
1// inspired by hlidac-shopu
2import Apify from "apify";
3import { defAtom } from "@thi.ng/atom";
4
// Counters used when neither a persisted value nor an explicit initial value
// is available (see `withPersistedStats`).
// TODO: make this lowest common denominator
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0,
};

/** Returns `x` increased by one. */
function inc(x) {
  return x + 1;
}

/** Returns `x` decreased by one. */
function dec(x) {
  return x - 1;
}
17
/**
 * Run counters kept in an atom. The current values are logged every
 * 20 seconds until `save(true)` is called.
 */
class Stats {
  /**
   * @param {Object} init - Initial counter values, keyed by counter name.
   */
  constructor(init) {
    this.stats = defAtom(init);
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /** Increments the counter `key` by one. */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /** Decrements the counter `key` by one. */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /** Adds `value` to the counter `key`. */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /** Returns the current counters object. */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counters as JSON at info level. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Stores the counters under the `STATS` key and logs them.
   * @param final {boolean} - If true, clearInterval apply
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.get();
    await Apify.setValue("STATS", stats);
    Apify.utils.log.info("STATS saved!");
    if (stats.ok) {
      // Parenthesised on purpose: `??` binds looser than `/`, so the former
      // `stats.denied ?? 0 / stats.ok` never divided `denied` by `ok`.
      const deniedRatio = ((stats.denied ?? 0) / stats.ok) * 100;
      Apify.utils.log.info(`Denied ratio: ${deniedRatio} %`);
    }
    this.log();
  }
}
64
/**
 * Creates a `Stats` instance seeded from the `STATS` key-value record when
 * one exists, otherwise from `init`, otherwise from `defaultStats`. The
 * instance saves itself on the `persistState` and `migrating` events.
 *
 * @param {function} fn - Maps the seed value to the initial state.
 * @param {*} init - Seed used when nothing has been persisted.
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const persisted = await Apify.getValue("STATS");
  const seed = persisted ?? init ?? defaultStats;
  const state = new Stats(fn(seed));

  const persistState = () => state.save();
  for (const eventName of ["persistState", "migrating"]) {
    Apify.events.on(eventName, persistState);
  }

  return state;
}
Developer
Maintained by Community
Categories