Bike Components (bike-components.de) scraper avatar
Bike Components (bike-components.de) scraper

Pricing

Pay per usage

Go to Store
Bike Components (bike-components.de) scraper

Bike Components (bike-components.de) scraper

Developed by

Pavel Dolecek

Pavel Dolecek

Maintained by Community

Scrapes product titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).

0.0 (0)

Pricing

Pay per usage

1

Total users

12

Monthly users

2

Runs succeeded

>99%

Last modified

2 years ago

Dockerfile

FROM apify/actor-node:18
COPY package.json ./
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
COPY . ./

INPUT_SCHEMA.json

{
"title": "Bike Components (bike-components.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enum": [
"TEST",
"FULL"
],
"enumTitles": [
"TEST",
"FULL"
]
},
"APIFY_USE_MEMORY_REQUEST_QUEUE": {
"sectionCaption": "Advanced",
"sectionDescription": "Advanced options, use only if you know what you're doing.",
"title": "Use in-memory request queue instead of the native one",
"description": "In-memory request queue can reduce costs, but it may case issues with longer runs due to non-persistence.",
"type": "boolean",
"default": false,
"editor": "checkbox"
},
"APIFY_DONT_STORE_IN_DATASET": {
"title": "Don't store in dataset",
"description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
"type": "boolean",
"default": false,
"editor": "checkbox"
},
"PG_CONNECTION_STRING_NORMALIZED": {
"title": "Postgres connection string for normalized data",
"description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
"type": "string",
"editor": "textfield"
},
"PG_DATA_TABLE": {
"title": "Postgres table name for product data",
"description": "Table name for storing product name, url, image, ...",
"type": "string",
"editor": "textfield"
},
"PG_DATA_PRICE_TABLE": {
"title": "Postgres table name for price data",
"description": "Table name for storing price, original price, stock status, ...",
"type": "string",
"editor": "textfield"
}
},
"required": [
"mode"
]
}

apify.json

{
"name": "bike-components-bike-components-de-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"defaultRunOptions": {
"build": "latest",
"timeoutSecs": 3600,
"memoryMbytes": 1024
}
}

main.js

1import { Actor } from "apify3";
2import { CheerioCrawler, createCheerioRouter } from "crawlee";
3import { init, parsePrice, save } from "./_utils/common.js";
4
// Request labels used to route requests to their handlers.
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

// Crawl modes. This was a transpiled TypeScript enum (`var MODE` + IIFE);
// a frozen plain object is the idiomatic JS equivalent and keeps the
// MODE.TEST / MODE.FULL lookups used below unchanged.
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});

const BASE_URL = `https://www.bike-components.de`;
18
/**
 * Seeds the crawler with the initial request for the given mode.
 * FULL starts from the brand index page; TEST starts from a single brand page.
 * Unknown modes enqueue nothing.
 * @param {string} mode - MODE.FULL or MODE.TEST
 * @param {import("crawlee").CheerioCrawler} crawler
 */
async function enqueueInitial(mode, crawler) {
  const seedByMode = {
    [MODE.FULL]: {
      userData: { label: LABELS.INDEX },
      url: `https://www.bike-components.de/en/brands/`,
    },
    [MODE.TEST]: {
      userData: { label: LABELS.PRODUCTS },
      url: `https://www.bike-components.de/en/100-/`,
    },
  };
  const seed = seedByMode[mode];
  if (seed) await crawler.addRequests([seed]);
}
36
// Router dispatching requests by their userData.label.
const router = createCheerioRouter();

// INDEX: the brand-list page — enqueue every brand link as a PRODUCTS request.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  await enqueueLinks({
    selector: `.container-manufacturer-list-for-letter .site-link`,
    userData: { label: LABELS.PRODUCTS },
  });
});
45
// PRODUCTS: one brand's listing page. The brand id is read from JSON embedded
// in the HTML, then the catalog API is paged until the last page is reached.
router.addHandler(LABELS.PRODUCTS, async ({ $, request, log }) => {
  // Get brand id from HTML, it's needed for the API
  const brandIdMatch = $(`body`)
    .text()
    .match(/"manufacturerId":(\d+)}/); // https://share.cleanshot.com/3YvVXs
  if (!brandIdMatch) {
    // Previously this crashed with an opaque TypeError on `[1]` when the
    // marker was missing (e.g. after a site layout change) — fail with an
    // actionable message instead.
    throw new Error(`[PRODUCTS] ${request.url}: manufacturerId not found in page HTML`);
  }
  const brandId = brandIdMatch[1];
  log.info(`[PRODUCTS] ${request.url}, brandId: ${brandId}`);

  // Paginate products via API
  let hasMorePages = true;
  let page = 0;
  while (hasMorePages) {
    const res = await fetch(
      `https://www.bike-components.de/en/api/v1/catalog/DE/property/?m%5B0%5D=${brandId}&page=${page}&productsPerPage=72`,
      {
        headers: {
          accept: `application/json`, // maybe not needed
          "cache-control": `no-cache`, // maybe not needed
        },
      }
    );

    if (!res.ok) {
      throw new Error(
        `[PRODUCTS] ${request.url}: API returned ${res.status} ${res.statusText}`
      );
    }

    const resJson = await res.json();

    log.info(
      `[PRODUCTS] ${request.url}: page: ${page}, products: ${resJson.initialData.products.length}`
    );

    // Parsing!
    const products = [];
    for (const el of resJson.initialData.products) {
      const currentPriceRaw = el.data.price; // `124.99€` or ` <span>from</span> 120.99€`
      const originalPriceRaw = el.data.strikeThroughPrice; // `118.99€`
      const product = {
        pid: el.data.productId.toString(),
        name: el.data.name,
        url: BASE_URL + el.data.link,
        img: BASE_URL + el.data.imageMedium.path, // jpeg
        inStock: el.data.stockQuantity > 0,
        currentPrice: parsePrice(currentPriceRaw)?.amount || null,
        originalPrice: parsePrice(originalPriceRaw)?.amount || null,
        currency: `EUR`,
      };
      products.push(product);
    }
    await save(products);

    // Pagination logic: advance while the API reports more pages ahead.
    if (resJson.initialData.paging.last > resJson.initialData.paging.current) {
      page++;
    } else {
      hasMorePages = false;
    }
  }
});
105
// Entry point: read actor input, initialize shared utilities, run the crawler.
void Actor.main(async () => {
  const input = await Actor.getInput();
  // Default to a FULL crawl when mode is missing from the input.
  const { mode = MODE.FULL, ...rest } = input ?? {};
  // `rest` may carry env-style overrides (UPPER_CASE keys, e.g. PG_* settings)
  // which init() copies into process.env — see _utils/common.js.
  await init({ actorNameOverride: `bike-components-de` }, rest);
  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

package.json

{
"name": "bike-components-bike-components-de-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify3": "npm:apify@^3.0.2",
"crawlee": "*",
"pg": "*",
"pg-connection-string": "*",
"dotenv": "*",
"find-config": "*",
"@elastic/elasticsearch": "*",
"filenamify": "*",
"@crawlee/memory-storage": "*"
},
"apify": {
"title": "Bike Components (bike-components.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"isPublic": true,
"isDeprecated": false,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "bike-components-bike-components-de-scraper",
"title": "Bike Components (bike-components.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "Bike Components (bike-components.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"pid",
"name",
"url",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Pid",
"field": "pid",
"format": "text"
},
{
"label": "Name",
"field": "name",
"format": "text"
},
{
"label": "Url",
"field": "url",
"format": "link"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}

.actor/logo.png

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Configuration, Dataset } from 'crawlee'
13import { MemoryStorage } from '@crawlee/memory-storage'
14
// Load .env from the nearest ancestor directory containing one.
config({ path: findConfig(`.env`) })

// Single shared Elasticsearch index for all shop actors in this monorepo.
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every saved record (see save()).
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

// Module-level singletons; assigned in init(), consumed by save().
let actorName
let pgClient
let pgClientNormalized
let elasticClient
/**
 * One-time actor bootstrap: resolves the actor name, optionally switches to an
 * in-memory request queue, and connects Elasticsearch / Postgres clients when
 * the corresponding env vars are set. Must be awaited before calling save().
 * @param {{ actorNameOverride?: string }} options
 * @param {Record<string, unknown>} restInput - leftover actor input; its
 *   UPPER_CASE keys are copied into process.env (see parseEnvFromInput)
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Local development (macOS): derive the actor name from the script filename
  // and record the current git commit for log correlation.
  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    // NOTE(review): assumes cwd is the `actors/` subdirectory so `.git` lives
    // one level up — confirm against the monorepo layout.
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  // On the Apify platform the actor id is the best available name source.
  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // const mapping = await elasticClient.indices.getMapping({ index: actorName })

    // Ensure the shared index exists and price fields are mapped as floats.
    // eslint-disable-next-line no-inner-declarations
    async function enforceIndexMapping () {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      // Existing field mappings cannot be changed in place — drop and recreate.
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
    // const pgPool = new pg.Pool(pgConfig)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    // Check if table exists and have proper columns
    const { rows: tables } = await pgClient.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    if (!tableExists) {
      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    }

    // TODO: Handle pgClient closing
  }

  // Second, separate connection for the normalized data/price tables.
  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    // Check if table exists and have proper columns
    const { rows: tables } = await pgClientNormalized.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    // eslint-disable-next-line camelcase
    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)

    // TODO: Handle pgClient closing
  }
}
136
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Deterministic sha256 hex digest of a URL, ignoring the protocol so that
 * http:// and https:// variants of the same address map to the same key.
 * @param {string} url
 * @return {string} 64-character hex string
 */
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  // Strip the protocol; fall back to the raw value for protocol-less input
  // (previously `split('://')[1]` was undefined there and hash.update threw).
  const cleanUrl = url.split(`://`)[1] ?? url
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}
145
/**
 * Resolves once the given wall-clock time has been reached.
 * Resolves immediately when the target time is already in the past.
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  if (remainingMs <= 0) return
  await new Promise((resolve) => setTimeout(resolve, remainingMs))
}
161
// TODO: Uff, nicer! But at least it's tested
/**
 * Extracts a numeric amount from a price string such as `124.99€`,
 * `1.299,00 €` or ` <span>from</span> 120.99€` (handles both `,` and `.` as
 * the decimal separator).
 * @param {string | null | undefined} string
 * @return {{ amount: number | null, currency: undefined }} amount is null for
 *   nullish input and NaN when no digits are present; currency is never
 *   detected yet (kept in the shape for forward compatibility)
 */
export function parsePrice (string) {
  let amount, currency
  // Guard: upstream price fields (e.g. strikeThroughPrice) may be missing —
  // previously this threw a TypeError on undefined/null input, and the
  // caller's `parsePrice(x)?.amount` could not protect against that.
  if (string == null) return { amount: null, currency }
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  if (decimals) {
    const decimalSeparator = decimals[1]
    const decimalAmount = decimals[2]
    // Everything before the decimal separator, digits only (drops thousands marks).
    const mainAmount = noText.split(decimalSeparator)[0].replace(/\D/g, ``)
    amount = parseFloat(mainAmount + `.` + decimalAmount)
  } else {
    // No trailing 2-digit decimals: treat every digit as part of an integer.
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers, 10) // explicit radix
  }
  return { amount, currency }
}
179
/**
 * Converts a value to a number, or returns null when it cannot be converted
 * (undefined, null, empty string, or non-numeric text).
 * @param {string | null | undefined} str
 * @return {number | null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str == null || str === ``) return null
  const parsed = Number(str)
  return Number.isNaN(parsed) ? null : parsed
}
189
/**
 * Persists scraped records to every configured sink:
 *  - Apify dataset (on platform, unless APIFY_DONT_STORE_IN_DATASET is "true")
 *  - local JSON files under `<actorName>.storage/data/` (macOS dev machines)
 *  - raw Postgres table (pgClient + PG_DATA_TABLE)
 *  - normalized Postgres data/price tables (pgClientNormalized)
 *  - Elasticsearch (currently commented out)
 * Requires init() to have been awaited first.
 * @param {object | object[]} objs - one record or an array of records
 */
export async function save (objs) {
  if (!Array.isArray(objs)) objs = [objs]
  if (objs.length === 0) return console.log(`No data to save.`)

  // Extend every record with actor/run metadata for the logging-oriented sinks.
  const objsExtended = await Promise.all(objs.map(async (obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if run on Apify
    if (process.env.APIFY_IS_AT_HOME) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
      if (process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
        // NOTE(review): pushes the plain `obj`, not `objExtended` — metadata
        // fields only reach the local-file sink; confirm this is intended.
        await Dataset.pushData(obj)
      }
    }
    return objExtended
  }))

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      // One file per record, keyed by id/pid.
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  // Raw dump into a single wide table, one INSERT for the whole batch.
  if (pgClient) {
    const objsPg = objs.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    const columns = getColumns(objsPg)
    const values = getValues(objsPg)
    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
      VALUES (${values})
    `
    try {
      const { rowCount } = await pgClient.query(queryString)
      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
    } catch (err) {
      // Duplicates are expected on re-runs — warn and continue.
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    // Normalized shape: stable product attributes ...
    const objsPgData = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    // ... and a per-day price/stock observation.
    const objsPgDataPrice = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
      VALUES (${getValues(objsPgData)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryString)
      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }

    const queryStringPrice = `
      INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
      VALUES (${getValues(objsPgDataPrice)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  // Elasticsearch sink is intentionally disabled for now.
  if (elasticClient) {
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}
316
// Builds a quoted, comma-separated SQL column list from the keys of the
// first row object (all rows are assumed to share the same shape).
function getColumns (objs) {
  const quotedKeys = Object.keys(objs[0]).map((key) => `"${key}"`)
  return quotedKeys.join(`, `)
}
320
// Serializes row objects into the interior of SQL VALUES tuples:
// strings are single-quoted with `'` doubled, null/undefined become NULL,
// all other values are interpolated as-is.
// NOTE: not a full escape — relies on inputs from controlled scrape fields.
function getValues (objs) {
  const serializeValue = (value) => {
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'` // escape strings to prevent SQL injection
    if (value === null || value === undefined) return `NULL` // DB-specific null
    return value
  }
  const tuples = objs.map((row) => Object.values(row).map(serializeValue).join(`, `))
  return tuples.join(`), (`)
}
330
/**
 * Copies UPPER_CASE keys from the actor input into process.env, so that input
 * fields like PG_CONNECTION_STRING behave like environment variables.
 * Keys containing any lowercase character are ignored.
 * @param {Record<string, unknown>} input
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const key in input) {
    if (key === key.toUpperCase()) env[key] = input[key]
  }
  // Log only the key names — values may contain secrets (DB connection
  // strings, API keys) and must not leak into actor logs.
  console.log(`[parseEnvFromInput] keys: ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
339
340export const isInspect =
341 process.execArgv.join().includes(`--inspect`) ||
342 // @ts-ignore
343 process?._preload_modules?.join(`|`)?.includes(`debug`)