
Bike Discount (bike-discount.de) scraper
Deprecated
Pricing
Pay per usage
Go to Store


Bike Discount (bike-discount.de) scraper
Deprecated
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
0.0 (0)
Pricing
Pay per usage
2
Total users
13
Monthly users
2
Last modified
3 years ago
Dockerfile
# Base image with Node.js 16 maintained by Apify.
FROM apify/actor-node:16

# Copy only the manifest first so the dependency layer is cached between builds.
COPY package.json ./

# Install production dependencies only; the backslash must be the last
# character on its line to act as a line continuation.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional

# Copy the remaining actor sources.
COPY . ./
INPUT_SCHEMA.json
{ "title": "Bike Discount (bike-discount.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "TEST", "prefill": "TEST", "enum": [ "TEST", "FULL" ], "enumTitles": [ "TEST", "FULL" ] }, "debug": { "title": "Debug", "description": "Debug mode prints more logs, disables concurrency and other optimizations.", "type": "boolean", "editor": "checkbox", "default": false }, "APIFY_DONT_STORE_IN_DATASET": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Don't store in dataset", "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", "type": "boolean", "default": false, "editor": "checkbox" }, "PG_CONNECTION_STRING_NORMALIZED": { "title": "Postgres connection string for normalized data", "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", "type": "string", "editor": "textfield" }, "PG_DATA_TABLE": { "title": "Postgres table name for product data", "description": "Table name for storing product name, url, image, ...", "type": "string", "editor": "textfield" }, "PG_DATA_PRICE_TABLE": { "title": "Postgres table name for price data", "description": "Table name for storing price, original price, stock status, ...", "type": "string", "editor": "textfield" } }, "required": [ "mode" ]}
apify.json
{ "name": "bike-discount-bike-discount-de-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
1import { URL } from "node:url";2import { Actor } from "apify3";3import { CheerioCrawler, createCheerioRouter } from "crawlee";4import { init, save, toNumberOrNull } from "./_utils/common.js";5
/** Router labels: which handler processes a given request. */
const LABELS = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};
/** Run modes accepted in the actor input (`mode`). */
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
};
/**
 * Seeds the crawler with the start request for the given mode.
 * FULL starts from the brands index; TEST starts from a single brand listing.
 * Any other mode enqueues nothing.
 *
 * @param {string} mode - one of the MODE values
 * @param {object} crawler - crawler exposing `addRequests`
 * @returns {Promise<void>}
 */
async function enqueueInitial(mode, crawler) {
  switch (mode) {
    case MODE.FULL: {
      const brandsIndex = {
        userData: { label: LABELS.INDEX },
        url: `https://www.bike-discount.de/en/brands`,
      };
      await crawler.addRequests([brandsIndex]);
      break;
    }
    case MODE.TEST: {
      const singleBrand = {
        userData: { label: LABELS.PRODUCTS },
        url: `https://www.bike-discount.de/en/brand/poc/`,
      };
      await crawler.addRequests([singleBrand]);
      break;
    }
    default:
      break;
  }
}
const router = createCheerioRouter();

/**
 * INDEX handler: reads the brands index page and enqueues one PRODUCTS
 * request per brand link, carrying the brand name as `userData.category`.
 */
router.addHandler(LABELS.INDEX, async ({ crawler, $ }) => {
  const requests = [];
  $(`.comer-product--info a.comer-supplier-detail`).each((i, el) => {
    const url = $(el).attr(`href`); // urls are absolute
    // An anchor without href cannot be turned into a request; skip it
    // instead of passing an invalid entry to the request queue.
    if (!url) return;
    const name = $(el).text().trim(); // there's extra space at the beginning and the end
    requests.push({
      userData: { label: LABELS.PRODUCTS, category: name },
      url,
    });
  });
  // Await so the handler does not finish before the requests are queued
  // and so that enqueueing failures surface as a failed request.
  if (requests.length > 0) await crawler.addRequests(requests);
});
/**
 * PRODUCTS handler: on the first page of a listing it enqueues the remaining
 * pages (`?p=2..N`), then extracts every product box on the current page and
 * hands the batch to `save`.
 */
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
  log.info(`handleCategory ${request.url}`);
  if (!request.url.includes(`?p=`)) {
    // on first page
    const totalPages = Number(
      $(`.listing--bottom-paging .paging--link[title="Last page"]`).text()
    ); // e.g. `6`; NaN/0 when there is no paging, which skips the loop
    const pageRequests = [];
    for (let page = 2; page <= totalPages; page++) {
      // skip first page, that is already handled
      const url = new URL(request.url);
      url.searchParams.set(`p`, page.toString());
      pageRequests.push({
        userData: { label: LABELS.PRODUCTS },
        url: url.toString(),
      });
    }
    // One awaited batch instead of a fire-and-forget call per page, so the
    // handler cannot complete before the follow-up pages are queued.
    if (pageRequests.length > 0) await crawler.addRequests(pageRequests);
  }

  const products = [];
  $(`.listing--container .product--box`).each((i, el) => {
    const id = $(el).attr(`data-ordernumber`); // e.g. 20080005-40200770
    const url = $(el).find(`a.product--title`).attr(`href`); // absolute url
    const title =
      $(el).find(`a.product--title strong`).text() + // brand
      ` ` +
      $(el).find(`a.product--title`).attr(`title`); // title itself
    const priceRaw = $(el)
      .find(`.product--price .is--discount, .product--price .price--default`)
      .text()
      .trim(); // e.g. `€19.95`
    const price = priceRaw.replace(/[^\d.]/g, ``); // keep dot as it's decimal separator
    const priceOrigRaw = $(el)
      .find(`.product--price .price--discount`)
      .text()
      .trim(); // `€39.95`
    const priceOrig = priceOrigRaw.replace(/[^\d.]/g, ``);
    // first candidate of the srcset list
    const img = $(el).find(`.image--media img`).attr(`srcset`)?.split(`,`)?.[0];
    // NOTE(review): availability is not read from the page; every product is reported as in stock.
    const inStock = true;
    products.push({
      pid: id,
      name: title,
      url: url,
      img: img,
      inStock,
      currentPrice: toNumberOrNull(price),
      originalPrice: toNumberOrNull(priceOrig),
      currency: `EUR`,
    });
  });
  await save(products);
});
// Actor entry point: read input, initialise shared storage, crawl.
void Actor.main(async () => {
  const input = (await Actor.getInput()) ?? {};
  // `mode` drives the start URL; every other input key is forwarded to init().
  const { mode = MODE.FULL, ...rest } = input;
  await init({ actorNameOverride: `bike-discount-de` }, rest);

  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
package.json
{ "name": "bike-discount-bike-discount-de-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify3": "npm:apify@^3.0.2", "crawlee": "*", "pg": "*", "pg-connection-string": "*", "dotenv": "*", "find-config": "*", "@elastic/elasticsearch": "*", "filenamify": "*" }, "apify": { "title": "Bike Discount (bike-discount.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "bike-discount-bike-discount-de-scraper", "title": "Bike Discount (bike-discount.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Bike Discount (bike-discount.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "pid", "name", "url", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Pid", "field": "pid", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'2import os from "os"3import path from "path"4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals5import fs from "fs"6import pg from "pg"7import pgConnectionString from 'pg-connection-string'8import { config } from 'dotenv'9import findConfig from "find-config"10import { Client as ElasticClient } from "@elastic/elasticsearch"11import filenamify from 'filenamify'12import { Dataset } from 'crawlee'13
// Load environment variables from the .env file located by find-config.
const envFilePath = findConfig(`.env`)
config({ path: envFilePath })

// Name of the Elasticsearch index used by init().
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every locally stored record; init() may add more.
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Best-effort lookup of the short (7 character) hash of the commit checked out
 * in the repository one level above the current working directory.
 *
 * @return {string|undefined} short hash, or undefined when it cannot be read
 */
function readGitCommitShort () {
  try {
    const gitDir = path.join(process.cwd(), `..`, `.git`)
    const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
    // A detached HEAD holds the commit hash itself rather than `ref: refs/heads/<branch>`
    if (!head.startsWith(`ref:`)) return head.substring(0, 7)
    const gitBranch = head.split(` `)[1].trim().replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(gitDir, `refs/heads/${gitBranch}`), `utf8`)
    return gitCommit.substring(0, 7)
  } catch (err) {
    // The commit hash is only diagnostic metadata; never fail startup because of it
    console.warn(`[init] Unable to resolve git commit: ${err.message}`)
    return undefined
  }
}

/**
 * Creates the shared Elastic index when missing and applies the float mappings
 * for the price fields.
 *
 * @param {ElasticClient} client
 * @return {Promise<void>}
 */
async function enforceElasticIndexMapping (client) {
  const doesIndexExist = await client.indices.exists({ index: elasticIndexName })
  if (!doesIndexExist) await client.indices.create({ index: elasticIndexName })
  await client.indices.putMapping({
    index: elasticIndexName,
    body: {
      properties: {
        _discount: { type: `float` },
        originalPrice: { type: `float` },
        currentPrice: { type: `float` },
      },
    },
  })
}

/**
 * Connects to Postgres and verifies that every required table exists in the
 * `public` schema. The connection is closed again when validation fails.
 *
 * @param {string} connectionString
 * @param {string[]} requiredTables - table names, checked in order
 * @return {Promise<pg.Client>} connected client
 * @throws {Error} when a required table is missing
 */
async function connectPostgres (connectionString, requiredTables) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()
  try {
    const { rows: tables } = await client.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)
    const existingTables = new Set(tables.map((row) => row.table_name))
    for (const tableName of requiredTables) {
      if (!existingTables.has(tableName)) {
        throw new Error(`Table ${tableName} does not exist in database ${pgConfig.database}`)
      }
    }
  } catch (err) {
    await client.end().catch(() => {}) // closing is best-effort; the original error is what matters
    throw err
  }
  return client
}

/**
 * Prepares module state used by `save`: exports upper-case input keys to
 * `process.env`, resolves the actor name and opens the optional Elastic and
 * Postgres connections configured through environment variables.
 *
 * @param {{ actorNameOverride?: string }} options
 * @param {object} restInput - remaining actor input, passed to parseEnvFromInput
 * @return {Promise<void>}
 * @throws {Error} when a configured Postgres table does not exist
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Honour the override on every platform; the branches below add fallbacks
  actorName = actorNameOverride

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitCommitShort = readGitCommitShort()
    if (gitCommitShort !== undefined) globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    try {
      await enforceElasticIndexMapping(elasticClient)
    } catch (err) {
      const message = String(err?.message ?? err)
      if (message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceElasticIndexMapping(elasticClient)
      } else {
        // Previously ignored without a trace; keep going but make the failure visible
        console.warn(`[init] Elastic index ${elasticIndexName} mapping could not be enforced: ${message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPostgres(process.env.PG_CONNECTION_STRING, [
      process.env.PG_DATA_TABLE,
    ])
    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPostgres(process.env.PG_CONNECTION_STRING_NORMALIZED, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])
    // TODO: Handle pgClient closing
  }
}
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a protocol-independent key for a URL: the SHA-256 hex digest of the
 * URL with its leading `scheme://` removed, so http and https variants of the
 * same address share one key.
 *
 * Only the leading scheme is stripped. A later `://` (e.g. a URL embedded in
 * the query string) stays part of the hashed text, and input without a scheme
 * is hashed as-is.
 *
 * @param {string} url
 * @return {string} 64 character hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
/**
 * Waits until the given point in time. Resolves right away when that moment
 * is not in the future.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // Negated comparison so that a NaN difference also resolves immediately
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
/**
 * Extracts the whole-unit amount from a price string.
 *
 * Everything except digits, commas and dots is dropped. A trailing separator
 * followed by exactly two digits is treated as the decimal part and discarded;
 * remaining commas/dots are treated as thousands separators.
 *
 * @param {string} string - e.g. `€19.95`, `1.234,56 Kč`, `1 299`
 * @return {{ amount: number, currency: undefined }} `amount` is an integer
 *   (NaN when the input has no digits); `currency` is never detected.
 */
export function parsePrice (string) {
  let currency // not detected yet; kept so the returned shape stays stable
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  // Cut the decimal part off first, so its digits are not merged into the amount
  const wholePart = decimals ? noText.slice(0, decimals.index) : noText
  const justNumbers = wholePart.replace(/[,.]/g, ``)
  // `.99` has decimals but no whole part: that is zero whole units
  const amount = decimals && justNumbers === `` ? 0 : Number.parseInt(justNumbers, 10)
  return { amount, currency }
}
/**
 * Converts a value to a number with `Number()`.
 *
 * @param {*} str
 * @return {number|null} null for undefined, null, the empty string and
 *   anything that converts to NaN
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  const isMissing = str === undefined || str === null || str === ``
  if (isMissing) return null
  const parsed = Number(str)
  return Number.isNaN(parsed) ? null : parsed
}
/**
 * Tells whether an environment flag is switched on. Unset, empty and `false`
 * (any letter case) count as off; every other value counts as on.
 *
 * @param {string|undefined} value
 * @return {boolean}
 */
function isEnvFlagEnabled (value) {
  if (value === undefined || value === ``) return false
  return String(value).toLowerCase() !== `false`
}

/**
 * Inserts rows into a table of the `public` schema with a single statement.
 * A unique-constraint violation is logged as a warning; other errors propagate.
 *
 * @param {pg.Client} client
 * @param {string} tableName
 * @param {object[]} rows - non-empty; column list is taken from the first row
 * @param {{ logLabel?: string, ignoreConflicts?: boolean }} [options]
 * @return {Promise<void>}
 */
async function insertRows (client, tableName, rows, { logLabel = ``, ignoreConflicts = false } = {}) {
  const queryString = `
    INSERT INTO public."${tableName}" (${getColumns(rows)})
    VALUES (${getValues(rows)})
    ${ignoreConflicts ? `ON CONFLICT DO NOTHING` : ``}
  `
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`[save] saved to database${logLabel}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped items to every configured destination: the Apify dataset
 * (when running on Apify), local JSON files (on macOS) and the Postgres
 * tables opened by `init`.
 *
 * @param {object|object[]} objs - one item or an array of items
 * @return {Promise<void>}
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return

  const isOnApify = Boolean(process.env.APIFY_IS_AT_HOME)

  // if run on Apify: store the raw items; awaited so failures are not lost
  if (isOnApify && !isEnvFlagEnabled(process.env.APIFY_DONT_STORE_IN_DATASET)) {
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const objsExtended = items.map((obj) => {
      const objExtended = {
        ...obj,
        actorName,
        ...globalLogsProps,
      }
      if (isOnApify) {
        objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
        objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
        objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
        objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
        objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
      }
      return objExtended
    })

    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = objExtended.id ?? objExtended.pid
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  // Date only (YYYY-MM-DD), shared by every row written in this call
  const scrapedAt = new Date().toISOString().split(`T`)[0]

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))
    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, {
      logLabel: ` (data)`,
      ignoreConflicts: true,
    })
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, {
      logLabel: ` (price)`,
      ignoreConflicts: true,
    })
  }

  if (elasticClient) {
    // TODO: Index documents into Elasticsearch (`elasticClient.index`); nothing is written yet.
  }
}
/**
 * Builds the column list of an INSERT statement from the keys of the first row.
 *
 * @param {object[]} objs - non-empty array of row objects
 * @return {string} double-quoted column names joined by `, `
 */
function getColumns (objs) {
  const quotedNames = []
  for (const columnName of Object.keys(objs[0])) {
    quotedNames.push(`"${columnName}"`)
  }
  return quotedNames.join(`, `)
}
/**
 * Renders rows as the inside of a multi-row VALUES clause. Rows are joined by
 * `), (`, so the caller supplies the outermost parentheses.
 *
 * @param {object[]} objs - row objects
 * @return {string}
 */
function getValues (objs) {
  const toSqlLiteral = (value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB specific null
    if (value === undefined || value === null) return `NULL`
    return value
  }

  const renderedRows = []
  for (const row of objs) {
    const literals = Object.values(row).map((value) => toSqlLiteral(value))
    renderedRows.push(literals.join(`, `))
  }
  return renderedRows.join(`), (`)
}
/**
 * Copies upper-case keys of the actor input into `process.env`.
 *
 * `process.env` stores every value as a string, so a boolean `false` would
 * become the truthy string `"false"`. To keep `if (process.env.FLAG)` checks
 * meaningful, `false` removes the variable and null/undefined are skipped.
 *
 * @param {object} [input] - actor input; keys that are not fully upper-case are ignored
 * @return {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    if (value === false) {
      // Input wins over a pre-existing variable: an explicit false switches the flag off
      delete process.env[key]
      continue
    }
    env[key] = value
  }
  // NOTE(review): this prints values verbatim, including connection strings — consider redacting.
  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
  Object.assign(process.env, env)
}