
Bike24 (bike24.de) scraper
Deprecated
Pricing
Pay per usage
Go to Store

Bike24 (bike24.de) scraper
Deprecated
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
0.0 (0)
Pricing
Pay per usage
2
Total users
3
Monthly users
1
Last modified
3 years ago
Dockerfile
FROM apify/actor-node:16
COPY package.json ./
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional
COPY . ./
INPUT_SCHEMA.json
{ "title": "Bike24 (bike24.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "FULL", "prefill": "FULL", "enumTitles": [ "FULL", "SINGLE" ], "enum": [ "FULL", "SINGLE" ] }, "debug": { "title": "Debug", "description": "Debug mode prints more logs, disables concurrency and other optimizations.", "type": "boolean", "editor": "checkbox", "default": false } }, "required": [ "mode" ]}
apify.json
{ "name": "bike24-bike24-de-scraper", "version": "0.1", "buildTag": "latest", "env": null, "template": "project_cheerio_crawler"}
main.js
/**
 * Dev notes
 * ===
 *
 * - uses Cloudflare anti-bot protection
 *
 * Params
 * ---
 * -
 *
 * Selectors
 * ---
 * -
 */

16import { URL } from "node:url";17import Apify from "apify";18
19// This is here only cause my bundler does not traverse dependencies20// FIXME21import "browser-pool";22import "playwright";23
24import { withPersistedStats } from "./_utils/stats.js";25import CloudflareUnblocker from "./_utils/CloudflareUnblocker.js";26const { log } = Apify.utils;27
/**
 * Request labels stored in `request.userData.label`; the page handler
 * dispatches on them to pick the matching `handle*` function.
 */
const LABELS = Object.freeze({
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
});

/**
 * Supported run modes. The values match the `mode` enum declared in
 * INPUT_SCHEMA.json. Frozen so the lookup table cannot be mutated at runtime.
 */
const MODE = Object.freeze({
  FULL: "FULL",
  SINGLE: "SINGLE",
});

/** Site origin that relative links scraped from pages are resolved against. */
const BASE_URL = "https://www.bike24.com";

/**
 * Seeds the request queue with the first request for the selected run mode.
 *
 * - `MODE.FULL` starts at the brands index page.
 * - `MODE.SINGLE` starts at a single hard-coded brand listing (`100percent`).
 *
 * @param {string} mode - One of the `MODE` values.
 * @param {{ addRequest: Function }} requestQueue - Queue the request is added to.
 * @returns {Promise<void>}
 * @throws {Error} When `mode` is not a known `MODE` value; previously this
 *   silently enqueued nothing and the crawl started with an empty queue.
 */
async function enqueueInitialRequest(mode, requestQueue) {
  if (mode === MODE.FULL) {
    await requestQueue.addRequest({
      userData: { label: LABELS.INDEX },
      url: `${BASE_URL}/brands`,
    });
    return;
  }

  if (mode === MODE.SINGLE) {
    await requestQueue.addRequest({
      userData: { label: LABELS.PRODUCTS },
      url: `${BASE_URL}/brands/100percent`,
    });
    return;
  }

  throw new Error(
    `Unknown mode "${mode}", expected one of: ${Object.values(MODE).join(", ")}`
  );
}

/**
 * Handles the brands index page: enqueues one PRODUCTS request per brand link.
 *
 * Brand links on the page are relative, so they are resolved against
 * `BASE_URL` before being enqueued (the relative href itself is not a valid
 * request URL). All `addRequest` calls are awaited so the handler only
 * finishes once the requests are actually in the queue.
 *
 * @param {{ $: Function }} context - Crawling context; `$` is the Cheerio handle of the page.
 * @param {{ stats: { inc: Function }, requestQueue: { addRequest: Function } }} globalContext
 * @returns {Promise<void>}
 */
async function handleIndex({ $ }, { stats, requestQueue }) {
  const brandRequests = [];

  for (const el of $(".list-brands-sitemap__section-item a").toArray()) {
    const link = $(el);
    const href = link.attr("href"); // urls are relative
    if (!href) continue; // anchor without a target - nothing to crawl

    stats.inc("categories");
    brandRequests.push({
      url: new URL(href, BASE_URL).toString(),
      userData: {
        label: LABELS.PRODUCTS,
        category: link.text().trim(), // there's extra space at the beginning and end
      },
    });
  }

  await Promise.all(brandRequests.map((req) => requestQueue.addRequest(req)));
}

/**
 * Parses the JSON stored in a price element's `data-props` attribute.
 *
 * @param {string | undefined} rawProps - Raw attribute value.
 * @param {string} itemId - Product id, used only for the warning message.
 * @returns {Object} Parsed props, or `{}` when the attribute is missing or malformed.
 * @private
 */
function parsePriceProps(rawProps, itemId) {
  if (!rawProps) return {};
  try {
    return JSON.parse(rawProps) ?? {};
  } catch (err) {
    log.warning(`Could not parse price data of product ${itemId}: ${err.message}`);
    return {};
  }
}

/**
 * Handles a product listing page.
 *
 * On the first page of a listing (URL without `page=`) it enqueues the
 * remaining pages. On every page it extracts each product tile, skips ids
 * already present in `processedIds`, and pushes the new products to the
 * dataset.
 *
 * Price and availability are read from inside the product's own tile; the
 * previous page-wide lookups gave every product the first price and stock
 * state found on the page.
 *
 * @param {{ $: Function, request: { url: string, userData: Object } }} context
 * @param {{ stats: { inc: Function }, processedIds: Set<string>, requestQueue: { addRequest: Function } }} globalContext
 * @returns {Promise<void>}
 */
async function handleProducts(
  { $, request },
  { stats, processedIds, requestQueue }
) {
  log.debug(`handleProducts: ${request.url}`);

  if (!request.url.includes("page=")) {
    // on first page
    const lastPageText = $(".page-pagination-item").last().text(); // e.g. `12`
    // NaN when there is no pagination, which makes the loop below a no-op
    const totalPages = Number.parseInt(lastPageText, 10);
    const pageRequests = [];
    // skip first page, that is already handled
    for (let page = 2; page <= totalPages; page++) {
      const pageUrl = new URL(request.url);
      pageUrl.searchParams.set("page", String(page));
      pageRequests.push({
        url: pageUrl.toString(),
        userData: {
          label: LABELS.PRODUCTS,
          category: request.userData.category, // pass category name
        },
      });
    }
    await Promise.all(pageRequests.map((req) => requestQueue.addRequest(req)));
  }

  const products = [];
  for (const el of $(".product-tile").toArray()) {
    stats.inc("items");
    const tile = $(el);
    const href = tile.find(".product-tile__anchor").attr("href"); // relative url
    if (!href) {
      log.warning(`Product tile without a link on ${request.url}, skipping`);
      continue;
    }

    const id = href.replace(/[^0-9]/g, ""); // e.g. `p2421335.html` -> `2421335`
    if (processedIds.has(id)) {
      stats.inc("itemsDuplicate");
      continue;
    }
    stats.inc("itemsUnique");
    processedIds.add(id);

    const prices = parsePriceProps(
      tile.find(".productPrice").attr("data-props"),
      id
    );
    products.push({
      itemId: id,
      itemName: tile.find(".product-tile__title").text().trim(),
      itemUrl: new URL(href, BASE_URL).toString(),
      img: tile.find(".product-tile__picture img").attr("src"),
      inStock: tile.find(".delivery-message--success").length > 0,
      currentPrice: prices.price,
      originalPrice: prices.oldPrice,
      currency: "EUR",

      // Derived
      _discount: prices.oldPrice
        ? (1 - prices.price / prices.oldPrice) * 100
        : undefined,
    });
  }

  // Awaited so a failed dataset write fails (and retries) the request
  // instead of surfacing as an unhandled rejection.
  if (products.length > 0) {
    await Apify.pushData(products);
  }
}

/**
 * Actor entry point: reads the input, seeds the request queue, builds a
 * CheerioCrawler whose sessions are created by CloudflareUnblocker, runs the
 * crawl and persists the final stats.
 */
Apify.main(async () => {
  await Apify.utils.purgeLocalStorage();

  const stats = await withPersistedStats((x) => x, {
    categories: 0,
    items: 0,
    itemsUnique: 0,
    itemsDuplicate: 0,
  });
  const processedIds = new Set();

  // Fallbacks mirror the defaults declared in INPUT_SCHEMA.json
  // (`debug: false`, `mode: "FULL"`).
  const input = await Apify.getInput();
  const { debug = false, mode = MODE.FULL } = input ?? {};

  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  const requestQueue = await Apify.openRequestQueue();
  await enqueueInitialRequest(mode, requestQueue);

  const proxyConfiguration = await Apify.createProxyConfiguration({
    groups: ["RESIDENTIAL"], // Not really sure which one to use
    // @ts-ignore
    useApifyProxy: true,
  });

  const cloudflareUnblocker = new CloudflareUnblocker({
    unblockUrl: BASE_URL,
    proxyConfiguration,
  });

  const globalContext = { mode, stats, requestQueue, processedIds };
  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    maxConcurrency: debug ? 1 : 3, // debug mode disables concurrency
    maxRequestRetries: 3,
    useSessionPool: true,
    sessionPoolOptions: {
      maxPoolSize: 100,
      // every new session is first unblocked against Cloudflare
      createSessionFunction:
        cloudflareUnblocker.createSessionFunction.bind(cloudflareUnblocker),
    },
    proxyConfiguration,
    persistCookiesPerSession: true,
    preNavigationHooks: [
      // Apply the unblocker's headers/proxy/TLS options to every request.
      async (crawlingContext, requestAsBrowserOptions) => {
        const { session } = crawlingContext;
        const requestOptions = cloudflareUnblocker.getRequestOptions(session);
        Object.assign(requestAsBrowserOptions, requestOptions);
      },
    ],
    async handlePageFunction(context) {
      const { request, response } = context;
      const { userData } = request;
      // @ts-ignore
      const { statusCode, body } = response;

      if (statusCode !== 200) return log.info(body.toString());

      switch (userData.label) {
        case LABELS.INDEX:
          return await handleIndex(context, globalContext);
        case LABELS.PRODUCTS:
          return await handleProducts(context, globalContext);
        default:
          // Surface requests that would otherwise be dropped without a trace.
          return log.warning(
            `No handler for label "${userData.label}" (${request.url})`
          );
      }
    },
    async handleFailedRequestFunction({ request }) {
      log.error(`Request ${request.url} failed multiple times`, request);
    },
  });

  await crawler.run();
  // Persist the final counters and stop the periodic stats logger.
  await stats.save(true);
  log.info("That's all folks!");
});
package.json
{ "name": "bike24-bike24-de-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "@thi.ng/atom": "*", "apify": "*", "browser-pool": "*", "playwright": "*" }, "apify": { "title": "Bike24 (bike24.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "isPublic": true, "isDeprecated": false, "issuesEnabled": true, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "bike24-bike24-de-scraper", "title": "Bike24 (bike24.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Bike24 (bike24.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "itemId", "itemName", "itemUrl", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Item ID", "field": "itemUrl", "format": "link", "textField": "itemId" }, { "label": "Item Name", "field": "itemName", "format": "text" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
_utils/CloudflareUnblocker.js
1import Apify from "apify"2import { Session } from "apify/build/session_pool/session.js"3import { BrowserPool, PlaywrightPlugin } from "browser-pool"4import playwright from "playwright"5
6const { utils: { log, requestAsBrowser, puppeteer } } = Apify7
/**
 * Unblocks SessionPool sessions against Cloudflare's browser challenge.
 *
 * A plain HTTP request is sent first. When the response looks like a
 * challenge page, the challenge is executed in a real browser page, the
 * resulting POST request is intercepted and replayed over plain HTTP, and
 * the cookies are stored on the session.
 */
export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * @param options {Object}
   * @param options.unblockUrl {String} - URL requested when unblocking a fresh session.
   * @param options.proxyConfiguration {Object} - Must expose `newUrl(sessionId)`.
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   * @param options
   * @param options.session {Session}
   * @param options.request - Object with at least `url`; its `retryCount` is reset in some paths.
   * @return {Promise<*>} - Response of the initial request, or of the solved challenge.
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    const oldShouldUseTLS = this._shouldUseCustomTLS
    const requestOptions = this.getRequestOptions(session)
    const response = await requestAsBrowser({
      ...requestOptions,
      url: request.url,
    })

    if (this._detectChallengeFunction(response)) {
      // Browser challenge detected starting the bypassing
      try {
        this._markSessionBeingRenewed(session)
        // `await` is essential: without it `finally` runs as soon as the
        // promise is created and the session is unlocked while the challenge
        // is still being solved.
        return await this._solveChallenge({
          response,
          session,
          request,
          requestOptions,
        })
      } finally {
        this._markSessionNotBeingRenewed(session)
      }
    } else if (response.statusCode === 403) {
      // Cloudflare captcha detected -> switching to slower TLS
      if (oldShouldUseTLS) {
        this.log.info("Captcha found even with the TLS hack")
        await Apify.setValue(
          `CAPTCHA-HTML-${Math.random() * 1000}`,
          response.body,
          { contentType: "text/html" },
        )
      } else {
        this.log.info(
          "Captcha found for the first time -> switching to custom TLS",
        )
        this._shouldUseCustomTLS = true
      }
    }

    this.log.info("Session OK")
    session.setCookiesFromResponse(response)
    return response
  }

  /**
   * Solves the challenge by starting the browser and saving the cookies to the session.
   * @param options
   * @param options.request
   * @param options.response
   * @param options.session
   * @param options.requestOptions
   * @return {Promise<*>} - Response of the replayed challenge request.
   * @throws {Error} When no challenge request was intercepted or the challenge was not accepted.
   * @private
   */
  async _solveChallenge (options) {
    const { request, response, session } = options
    const { body, headers } = response

    const receivedCookies = headers["set-cookie"]
    log.debug(`${this.name}: received cookies: ${receivedCookies}`)
    const requestOptions = this.getRequestOptions(session)

    if (!receivedCookies) {
      // Keep the page for later inspection
      await Apify.setValue(`NO-COOKIES-HTML-${Math.random() * 1000}`, body, {
        contentType: "text/html",
      })
    }

    session.setCookiesFromResponse(response)

    const cloudflareAuthReq = await this._getSolvedChallengeRequest({
      response,
      session,
      request,
    })

    if (!cloudflareAuthReq) {
      // The browser never issued the challenge POST within the wait window.
      this._throwError("Challenge request was not intercepted")
    }

    const browserHeaders = cloudflareAuthReq.headers()
    const authUrl = cloudflareAuthReq.url()
    const payload = cloudflareAuthReq.postData() ?? ""
    const finalRequestOpts = {
      ...requestOptions,
      url: authUrl,
      payload,
      headers: {
        ...requestOptions.headers,
        "Content-Type": browserHeaders["content-type"],
        Origin: browserHeaders.origin,
        Referer: browserHeaders.referer,
        Cookie: session.getCookieString(authUrl),
        // Content-Length counts bytes, not UTF-16 code units
        "Content-Length": Buffer.byteLength(payload),
      },
      method: "POST",
    }
    // Send the challenge response from requestAsBrowser
    const challengeResponse = await requestAsBrowser(finalRequestOpts)

    if (this._isChallengeSolvedFunction(challengeResponse)) {
      // Success
      request.retryCount = 0
      session.setCookiesFromResponse(challengeResponse)
      this.log.info("Successfully unblocked")
      return challengeResponse
    }

    // Something went wrong - challenge was not solved.
    session.retire()
    await Apify.setValue(
      `BLOCKED-HTML-${challengeResponse.statusCode}-${Math.random() * 1000}`,
      challengeResponse.body,
      { contentType: "text/html" },
    )
    this._throwError("Blocked")
  }

  /**
   * Locks session
   * @todo we could add this function to the Session natively
   * @param session {Session}
   * @private
   */
  _markSessionBeingRenewed (session) {
    session.userData.isBeingRenewed = true
  }

  /**
   * Unlocks session
   * @param session {Session}
   * @private
   */
  _markSessionNotBeingRenewed (session) {
    session.userData.isBeingRenewed = false
  }

  /**
   * Gets proxy URL
   * @param session {Session};
   * @return {String}
   * @private
   */
  _getProxyUrl (session) {
    return this.proxyConfiguration.newUrl(session.id)
  }

  /**
   * Tells whether the session is currently locked for renewal.
   * @param session {Session}
   * @return {boolean}
   * @private
   */
  _isSessionBeingRenewed (session) {
    // The flag is unset on fresh sessions; normalise to a real boolean.
    return Boolean(session.userData.isBeingRenewed)
  }

  /**
   * Throws prefixed error
   * @param message {String} - Error message
   * @private
   */
  _throwError (message) {
    throw new Error(`${this.name}: ${message}`)
  }

  /**
   * Opens new page, where solves the challenge and returns the auth request details.
   * @param response - Challenge response; replayed to the page as the initial navigation.
   * @param request - Request being unblocked (only `url` is read here)
   * @param session {Session}
   * @return {Promise<Object>} - Intercepted auth request, `undefined` when none was seen
   * @private
   */
  async _getSolvedChallengeRequest ({ response, request, session }) {
    const { headers, body } = response
    const page = await this.browserPool.newPage()
    let authRequest

    try {
      // Add request interceptor to get the authRequest for unlocking our session and forward other network requests.
      // NOTE(review): this is the Puppeteer interception helper applied to a
      // page from a Playwright plugin - confirm the two are compatible.
      await puppeteer.addInterceptRequestHandler(page, async req => {
        const reqUrl = req.url()
        const method = req.method()

        if (request.url === reqUrl && method === "GET") {
          this.log.info(`Mocking initial navigation request: ${req.url()}`)
          await req.respond({ status: 200, body, headers })
        } else if (this._detectChallengeRequestFunction(req)) {
          // This is the request we are after; it is replayed over HTTP later.
          authRequest = req
          await req.abort()
        } else if (reqUrl.includes("transparent.gif")) {
          const imageResponse = await this._sendImageRequest(req, session)
          await req.respond({
            status: 200,
            body: imageResponse.body,
            headers: imageResponse.headers,
          })
        } else {
          await req.abort()
        }
      })

      // Add debug to page log
      // @ts-ignore
      page.on("console", msg => this.log.debug(`${this.name}: ${msg.text()}`))

      // Navigate to the unblock url.
      await page.evaluate(url => {
        // @ts-ignore
        window.location.href = url
      }, request.url)

      await this._waitUntilChallengeFinishedFunction()
    } finally {
      // Release the page even when interception or navigation fails.
      // TODO: Probably just await page.close()
      // @ts-ignore
      this.browserPool.recyclePage(page).catch(() => {
      })
    }
    return authRequest
  }

  /**
   * Sends request with Cloudflare image like headers.
   * @param req {Request} - Intercepted browser request
   * @param session {Session} - Session instance
   * @return {Promise<import("stream").Readable | import("http").IncomingMessage>}
   * @private
   */
  async _sendImageRequest (req, session) {
    const browserHeaders = req.headers()
    const requestOptions = this.getRequestOptions(session)

    const imageHeaders = {
      Referer: browserHeaders.referer,
      Connection: "keep-alive",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept: "image/webp,image/apng,image/*,*/*;q=0.8",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "no-cors",
      "Sec-Fetch-Dest": "image",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: session.getCookieString(req.url()),
    }

    return requestAsBrowser({
      ...requestOptions,
      url: req.url(),
      abortFunction: () => false,
      headers: imageHeaders,
    })
  }

  /**
   * Gets the browser headers;
   * @param cookieString {String} - Cookie header string.
   * @return {Object} - Browser like headers.
   * @private
   */
  _getBrowserHeaders (cookieString) {
    return {
      Connection: "close",
      Pragma: "no-cache",
      "Cache-Control": "no-cache",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
      "Sec-Fetch-User": "?1",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Dest": "document",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
      Cookie: cookieString,
    }
  }

  /**
   * Detects the Cloudflare challenge page by 503 status code and the CDATA tag;
   * @param response
   * @return {boolean}
   * @private
   */
  _detectChallengeFunction (response) {
    return response.statusCode === 503 && response.body.includes("CDATA")
  }

  /**
   * Detects Challenge request
   * @param request {Request} - Intercepted browser request
   * @return {boolean} - true if challenge request detected (any POST request)
   * @private
   */
  _detectChallengeRequestFunction (request) {
    return request.method() === "POST"
  }

  /**
   * Waits until the challenge is computed (fixed 5 second sleep).
   * @return {Promise<void>}
   * @private
   */
  async _waitUntilChallengeFinishedFunction () {
    await Apify.utils.sleep(5000)
  }

  /**
   * Checks if challenge is successfully solved.
   * @param challengeResponse
   * @return {boolean}
   * @private
   */
  _isChallengeSolvedFunction (challengeResponse) {
    return challengeResponse.statusCode === 200
  }

  /**
   * Gets request options - these options are compatible with `Apify.utils.requestAsBrowser` and `@apify/http-request`
   * @param session {Session} - Session instance
   * @return {Object} - Contains `headers`, `proxyUrl` and, once the TLS hack is on, `https.ciphers`
   */
  getRequestOptions (session) {
    const proxyUrl = this._getProxyUrl(session)

    const requestOptions = {
      headers: this._getBrowserHeaders(
        session.getCookieString(this.unblockUrl),
      ),
      proxyUrl,
    }

    if (this._shouldUseCustomTLS) {
      // @ts-ignore
      requestOptions.https = { ciphers: "AES256-SHA" }
    }

    return requestOptions
  }

  /**
   * Creates new Session instance for the SessionPool.
   * Unblocking is best effort: a failure is logged and the session is
   * returned anyway.
   * @param sessionPool {SessionPool}.
   * @return {Promise<Session>}
   */
  async createSessionFunction (sessionPool) {
    const session = new Session({ sessionPool })
    try {
      await this.unblock({ session, request: { url: this.unblockUrl } })
    } catch (e) {
      log.warning(`${this.name}: Could not unblock session`)
      log.exception(e, 'Unblocker failed')
    }
    return session
  }

  /**
   * Gets options for the CheerioCrawler use interface.
   * @return {Object} - CheerioCrawler options
   */
  getCrawlerOptions () {
    const that = this
    return {
      useSessionPool: true,
      sessionPoolOptions: {
        maxPoolSize: 100,
        createSessionFunction: this.createSessionFunction.bind(this),
      },
      persistCookiesPerSession: true,
      useApifyProxy: true,
      async prepareRequestFunction ({ request, session }) {
        const newOptions = that.getRequestOptions(session)
        request.headers = newOptions.headers
        // getRequestOptions nests the ciphers under `https`; reading
        // `newOptions.ciphers` never matched, so the TLS hack was dropped.
        const ciphers = newOptions.https?.ciphers
        if (ciphers) {
          // `this` is the returned options object when invoked as its method.
          // NOTE(review): confirm the crawler reads `requestOptions.ciphers`.
          // @ts-ignore
          this.requestOptions = { ciphers }
        }
      },
    }
  }
}
_utils/common.js
1import { createHash } from 'crypto';2
// inspired by @drobnikj
/**
 * Builds a protocol-independent unique key for a URL: the SHA-256 hex digest
 * of the URL with its leading `scheme://` removed, so `http://` and
 * `https://` variants of the same address share one key.
 *
 * Only the leading scheme is stripped. Splitting on every `://` cut the URL
 * at a second occurrence (e.g. inside a query parameter) and threw on
 * scheme-less input.
 *
 * @param {string} url - Absolute URL; a scheme-less string is hashed as is.
 * @returns {string} 64-character lowercase hex digest.
 * @throws {TypeError} When `url` is not a string.
 */
const createUniqueKeyFromUrl = (url) => {
  if (typeof url !== 'string') {
    throw new TypeError(`Expected url to be a string, got ${typeof url}`);
  }
  const cleanUrl = url.replace(/^[a-z][a-z0-9+.-]*:\/\//i, ''); // Remove protocol
  return createHash('sha256').update(cleanUrl).digest('hex');
};
_utils/stats.js
1// inspired by hlidac-shopu2import Apify from "apify";3import { defAtom } from "@thi.ng/atom";4
// TODO: make this lowest common denominator
/** Counters used when neither persisted nor caller-supplied stats exist. */
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0
};

const inc = x => x + 1;
const dec = x => x - 1;

// TODO: stats should be in atom and updated via swap function atomically
/**
 * Run counters kept in an atom. The current values are logged every
 * 20 seconds until `save(true)` stops the timer.
 */
class Stats {
  /**
   * @param {Object} init - Initial counter values.
   */
  constructor(init) {
    this.stats = defAtom(init);
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /** Increments counter `key` by one. */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /** Decrements counter `key` by one. */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /** Adds `value` to counter `key`. */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /** @returns {Object} Current counter values. */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counters as JSON. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Stores the counters under the `STATS` key and logs them.
   * @param final {boolean} - If true, clearInterval apply
   * @returns {Promise<void>}
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.get();
    await Apify.setValue("STATS", stats);
    Apify.utils.log.info("STATS saved!");
    if (stats.ok) {
      // Parentheses matter: `??` binds looser than `/`, so the unparenthesised
      // form evaluated to `denied * 100` instead of a ratio.
      const deniedRatio = ((stats.denied ?? 0) / stats.ok) * 100;
      Apify.utils.log.info(`Denied ratio: ${deniedRatio} %`);
    }
    this.log();
  }
}

/**
 * Creates a `Stats` instance and saves it whenever the `persistState` or
 * `migrating` event fires.
 *
 * The starting counters are the value stored under `STATS` if there is one,
 * otherwise `init`, otherwise `defaultStats`; they are passed through `fn`
 * before the `Stats` instance is built.
 *
 * A failed save inside an event handler is logged instead of being left as
 * an unhandled promise rejection.
 *
 * @param {function} fn - Maps the starting counters to the initial state.
 * @param {*} init - Counters used when nothing has been persisted yet.
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const stats = (await Apify.getValue("STATS")) ?? init ?? defaultStats;
  const state = new Stats(fn(stats));
  const persistState = () =>
    state.save().catch(err => {
      Apify.utils.log.exception(err, "Failed to persist STATS");
    });

  Apify.events.on("persistState", persistState);
  Apify.events.on("migrating", persistState);

  return state;
}