
Bike Components (bike-components.de) scraper
Pricing
Pay per usage
Go to Store


Bike Components (bike-components.de) scraper
Scrapes product titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).
0.0 (0)
Pricing
Pay per usage
1
Total users
12
Monthly users
2
Runs succeeded
>99%
Last modified
2 years ago
Dockerfile
FROM apify/actor-node:18
COPY package.json ./
RUN npm --quiet set progress=false && npm install --only=prod --no-optional
COPY . ./
INPUT_SCHEMA.json
{ "title": "Bike Components (bike-components.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "TEST", "prefill": "TEST", "enum": [ "TEST", "FULL" ], "enumTitles": [ "TEST", "FULL" ] }, "APIFY_USE_MEMORY_REQUEST_QUEUE": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Use in-memory request queue instead of the native one", "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.", "type": "boolean", "default": false, "editor": "checkbox" }, "APIFY_DONT_STORE_IN_DATASET": { "title": "Don't store in dataset", "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", "type": "boolean", "default": false, "editor": "checkbox" }, "PG_CONNECTION_STRING_NORMALIZED": { "title": "Postgres connection string for normalized data", "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", "type": "string", "editor": "textfield" }, "PG_DATA_TABLE": { "title": "Postgres table name for product data", "description": "Table name for storing product name, url, image, ...", "type": "string", "editor": "textfield" }, "PG_DATA_PRICE_TABLE": { "title": "Postgres table name for price data", "description": "Table name for storing price, original price, stock status, ...", "type": "string", "editor": "textfield" } }, "required": [ "mode" ]}
apify.json
{ "name": "bike-components-bike-components-de-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
1import { Actor } from "apify3";2import { CheerioCrawler, createCheerioRouter } from "crawlee";3import { init, parsePrice, save } from "./_utils/common.js";4
// Request labels used to route requests to their router handlers.
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

// Run modes. TEST scrapes a single brand listing; FULL enqueues all brands.
// (Replaces the transpiled-TypeScript enum residue: `var MODE` + IIFE.)
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});

const BASE_URL = `https://www.bike-components.de`;
/**
 * Seeds the crawler with the initial request for the given run mode.
 * FULL starts from the brand index page; TEST starts from a single brand
 * listing page. Unknown modes enqueue nothing.
 * @param {string} mode - MODE.FULL or MODE.TEST
 * @param {import('crawlee').CheerioCrawler} crawler
 */
async function enqueueInitial(mode, crawler) {
  const initialRequestByMode = {
    [MODE.FULL]: {
      userData: { label: LABELS.INDEX },
      url: `https://www.bike-components.de/en/brands/`,
    },
    [MODE.TEST]: {
      userData: { label: LABELS.PRODUCTS },
      url: `https://www.bike-components.de/en/100-/`,
    },
  };
  const initialRequest = initialRequestByMode[mode];
  if (initialRequest) {
    await crawler.addRequests([initialRequest]);
  }
}
const router = createCheerioRouter();

// Brand index page: follow every brand link; each target page is handled
// as a PRODUCTS (brand listing) page.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  const brandLinkSelector = `.container-manufacturer-list-for-letter .site-link`;
  await enqueueLinks({
    selector: brandLinkSelector,
    userData: { label: LABELS.PRODUCTS },
  });
});
/**
 * Brand listing page handler. Extracts the brand (manufacturer) id from the
 * page HTML, then pages through the brand's products via the catalog API,
 * saving one record per product.
 */
router.addHandler(LABELS.PRODUCTS, async ({ $, request, log }) => {
  // Get brand id from HTML, it's needed for the API
  const brandIdMatch = $(`body`)
    .text()
    .match(/"manufacturerId":(\d+)}/); // https://share.cleanshot.com/3YvVXs
  if (!brandIdMatch) {
    // Fail loudly with context instead of an opaque TypeError on `[1]`
    // when the page layout changes or the page is a bot-block.
    throw new Error(`[PRODUCTS] ${request.url}: manufacturerId not found in HTML`);
  }
  const brandId = brandIdMatch[1];
  log.info(`[PRODUCTS] ${request.url}, brandId: ${brandId}`);

  // Paginate products via API
  let hasMorePages = true;
  let page = 0;
  while (hasMorePages) {
    const res = await fetch(
      `https://www.bike-components.de/en/api/v1/catalog/DE/property/?m%5B0%5D=${brandId}&page=${page}&productsPerPage=72`,
      {
        headers: {
          accept: `application/json`, // maybe not needed
          "cache-control": `no-cache`, // maybe not needed
        },
      }
    );

    if (!res.ok) {
      throw new Error(
        `[PRODUCTS] ${request.url}: API returned ${res.status} ${res.statusText}`
      );
    }

    const resJson = await res.json();

    log.info(
      `[PRODUCTS] ${request.url}: page: ${page}, products: ${resJson.initialData.products.length}`
    );

    // Parsing!
    const products = [];
    for (const el of resJson.initialData.products) {
      const currentPriceRaw = el.data.price; // `124.99€` or ` <span>from</span> 120.99€`
      const originalPriceRaw = el.data.strikeThroughPrice; // `118.99€`
      const product = {
        pid: el.data.productId.toString(),
        name: el.data.name,
        url: BASE_URL + el.data.link,
        img: BASE_URL + el.data.imageMedium.path, // jpeg
        inStock: el.data.stockQuantity > 0,
        currentPrice: parsePrice(currentPriceRaw)?.amount || null,
        originalPrice: parsePrice(originalPriceRaw)?.amount || null,
        currency: `EUR`,
      };
      products.push(product);
    }
    await save(products);

    // Pagination logic: API reports 1-based current/last page numbers.
    if (resJson.initialData.paging.last > resJson.initialData.paging.current) {
      page++;
    } else {
      hasMorePages = false;
    }
  }
});
// Entry point: read actor input, initialize shared services (env, storage,
// DB/Elastic sinks), seed the crawler per run mode, and run it.
void Actor.main(async () => {
  const input = (await Actor.getInput()) ?? {};
  const { mode = MODE.FULL, ...restInput } = input;
  await init({ actorNameOverride: `bike-components-de` }, restInput);
  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
package.json
{ "name": "bike-components-bike-components-de-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify3": "npm:apify@^3.0.2", "crawlee": "*", "pg": "*", "pg-connection-string": "*", "dotenv": "*", "find-config": "*", "@elastic/elasticsearch": "*", "filenamify": "*", "@crawlee/memory-storage": "*" }, "apify": { "title": "Bike Components (bike-components.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "bike-components-bike-components-de-scraper", "title": "Bike Components (bike-components.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Bike Components (bike-components.de) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "pid", "name", "url", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Pid", "field": "pid", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'2import os from "os"3import path from "path"4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals5import fs from "fs"6import pg from "pg"7import pgConnectionString from 'pg-connection-string'8import { config } from 'dotenv'9import findConfig from "find-config"10import { Client as ElasticClient } from "@elastic/elasticsearch"11import filenamify from 'filenamify'12import { Configuration, Dataset } from 'crawlee'13import { MemoryStorage } from '@crawlee/memory-storage'14
// Load .env from the nearest ancestor directory that has one, so local runs
// pick up the same env vars the platform provides.
config({ path: findConfig(`.env`) })

// Single shared Elasticsearch index used by all shop scrapers in this monorepo.
const elasticIndexName = `actors-monorepo-shops`

// Props merged into every saved object; populated once per process start
// (init() may add __GIT_COMMIT on macOS dev machines).
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}
// Module-level singletons set up by init() and consumed by save().
let actorName
let pgClient
let pgClientNormalized
let elasticClient

// Initializes the scraper environment: promotes UPPER_CASE input fields to
// env vars, derives the actor name, and conditionally connects storage sinks
// (in-memory request queue, Elasticsearch, one or two Postgres databases)
// based on env vars. Throws if a configured Postgres table is missing.
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Local development (macOS): derive actor name from the script filename
  // and attach the current git commit to every saved object.
  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // const mapping = await elasticClient.indices.getMapping({ index: actorName })

    // Ensure the shared index exists and price fields are mapped as floats.
    // eslint-disable-next-line no-inner-declarations
    async function enforceIndexMapping () {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      // Existing field mappings cannot be changed in place, so on a mapping
      // conflict the index is dropped and recreated with the right types.
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
    // const pgPool = new pg.Pool(pgConfig)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    // Check if table exists and have proper columns
    const { rows: tables } = await pgClient.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    if (!tableExists) {
      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    }

    // TODO: Handle pgClient closing
  }

  // Second, "normalized" Postgres target (separate product and price tables).
  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    // Check if table exists and have proper columns
    const { rows: tables } = await pgClientNormalized.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    // eslint-disable-next-line camelcase
    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)

    // TODO: Handle pgClient closing
  }
}
// inspired by @drobnikj
/**
 * Creates a stable unique key for a URL: the sha256 hex digest of the URL
 * without its protocol (so http/https variants map to the same key).
 * @param {string} url
 * @return {string} 64-char hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  // Remove protocol; fall back to the whole string for protocol-less input
  // (the original crashed with hash.update(undefined) in that case).
  const [, withoutProtocol] = url.split(`://`)
  hash.update(withoutProtocol ?? url)
  return hash.digest(`hex`)
}
/**
 * Resolves once the given wall-clock time has been reached.
 * Resolves immediately if the time is already in the past.
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  if (remainingMs <= 0) return
  await new Promise((resolve) => setTimeout(resolve, remainingMs))
}
// TODO: Uff, nicer! But at least it's tested
/**
 * Parses a numeric amount out of a localized price string.
 * Handles both `1.234,56 €` and `1,234.56 €` styles: a trailing
 * separator+two-digits group is treated as the decimal part.
 * @param {string} string - raw price text, e.g. `124.99€`
 * @return {{amount: (number|undefined), currency: (string|undefined)}|null}
 *   null for non-string input; `currency` is currently never detected.
 */
export function parsePrice (string) {
  // Robustness: the API may omit a price field (e.g. strikeThroughPrice is
  // often absent); callers use `parsePrice(x)?.amount`, so returning null
  // here matches their expectation instead of throwing on undefined.
  if (typeof string !== `string`) return null
  let amount, currency
  const noText = string.replace(/[^\d,.]/g, ``) // keep digits and separators only
  const decimals = noText.match(/([,.])(\d{2})$/)
  if (decimals) {
    const decimalSeparator = decimals[1]
    const decimalAmount = decimals[2]
    // Everything before the first decimal separator, digits only.
    const mainAmount = noText.split(decimalSeparator)[0].replace(/\D/g, ``)
    amount = parseFloat(mainAmount + `.` + decimalAmount)
  } else {
    // No decimal part -> strip thousand separators and parse as integer.
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers, 10)
  }
  return { amount, currency }
}
// Converts a string-ish value to a number; returns null for undefined,
// null, empty string, and anything that does not parse to a number.
// TODO: Handle better, but only after adding test
export function toNumberOrNull (str) {
  if (str === undefined || str === null || str === ``) return null
  const num = Number(str)
  return Number.isNaN(num) ? null : num
}
// Persists scraped object(s) to every configured sink: Apify dataset,
// local JSON files (macOS dev), raw Postgres, normalized Postgres.
// Accepts a single object or an array of objects.
export async function save (objs) {
  if (!Array.isArray(objs)) objs = [objs]
  if (objs.length === 0) return console.log(`No data to save.`)

  // Extend each object with run metadata (actor name, node start time,
  // git commit, Apify run ids). Note: the *plain* object is what is pushed
  // to the dataset; the extended one goes to local files.
  const objsExtended = await Promise.all(objs.map(async (obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if run on Apify
    if (process.env.APIFY_IS_AT_HOME) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
      if (process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
        await Dataset.pushData(obj)
      }
    }
    return objExtended
  }))

  // if runs on local machine (MacOS): one JSON file per object in
  // ./<actorName>.storage/data/, keyed by id/pid.
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  // Raw Postgres sink: one row per object in PG_DATA_TABLE.
  if (pgClient) {
    const objsPg = objs.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    const columns = getColumns(objsPg)
    const values = getValues(objsPg)
    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
      VALUES (${values})
    `
    try {
      const { rowCount } = await pgClient.query(queryString)
      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
    } catch (err) {
      // Duplicate rows are expected on re-runs; anything else is fatal.
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  // Only make sense for HlidacShopu
  // Normalized sink: product identity in PG_DATA_TABLE, daily price
  // snapshots in PG_DATA_PRICE_TABLE.
  if (pgClientNormalized) {
    const objsPgData = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
      VALUES (${getValues(objsPgData)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryString)
      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }

    const queryStringPrice = `
      INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
      VALUES (${getValues(objsPgDataPrice)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  // Elastic sink is currently disabled; code kept for reference.
  if (elasticClient) {
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}
// Builds a quoted, comma-separated SQL column list from the first object's
// keys (all objects are assumed to share the same shape).
function getColumns (objs) {
  const quotedKeys = Object.keys(objs[0]).map((key) => `"${key}"`)
  return quotedKeys.join(`, `)
}
// Renders objects as the interior of SQL VALUES tuples: strings are quoted
// with embedded `'` doubled (to prevent SQL injection), null/undefined
// become NULL, other values pass through unchanged.
function getValues (objs) {
  const renderValue = (value) => {
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    if (typeof value === `undefined` || value === null) return `NULL`
    return value
  }
  const tuples = objs.map((obj) => Object.values(obj).map(renderValue).join(`, `))
  return tuples.join(`), (`)
}
// Promotes UPPER_CASE keys from actor input into process.env, so platform
// input can configure the same settings as local .env vars. Logs the
// promoted subset for debugging.
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key === key.toUpperCase()) env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
  Object.assign(process.env, env)
}
// True when the process appears to run under a debugger: started with an
// --inspect flag, or with a preloaded module whose name contains `debug`.
export const isInspect =
  process.execArgv.join().includes(`--inspect`) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)