Bike Components (bike-components.de) scraper

No credit card required

Bike Components (bike-components.de) scraper

Bike Components (bike-components.de) scraper

strajk/bike-components-bike-components-de-scraper

No credit card required

Scrapes product titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).

Dockerfile

1FROM apify/actor-node:18 2 3COPY package.json ./ 4 5RUN npm --quiet set progress=false \ 6 && npm install --only=prod --no-optional 7 8COPY . ./

INPUT_SCHEMA.json

1{ 2 "title": "Bike Components (bike-components.de) scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 4 "type": "object", 5 "schemaVersion": 1, 6 "properties": { 7 "mode": { 8 "title": "Mode", 9 "description": "", 10 "type": "string", 11 "editor": "select", 12 "default": "TEST", 13 "prefill": "TEST", 14 "enum": [ 15 "TEST", 16 "FULL" 17 ], 18 "enumTitles": [ 19 "TEST", 20 "FULL" 21 ] 22 }, 23 "APIFY_USE_MEMORY_REQUEST_QUEUE": { 24 "sectionCaption": "Advanced", 25 "sectionDescription": "Advanced options, use only if you know what you're doing.", 26 "title": "Use in-memory request queue instead of the native one", 27 "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.", 28 "type": "boolean", 29 "default": false, 30 "editor": "checkbox" 31 }, 32 "APIFY_DONT_STORE_IN_DATASET": { 33 "title": "Don't store in dataset", 34 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, such as your own database", 35 "type": "boolean", 36 "default": false, 37 "editor": "checkbox" 38 }, 39 "PG_CONNECTION_STRING_NORMALIZED": { 40 "title": "Postgres connection string for normalized data", 41 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", 42 "type": "string", 43 "editor": "textfield" 44 }, 45 "PG_DATA_TABLE": { 46 "title": "Postgres table name for product data", 47 "description": "Table name for storing product name, url, image, ...", 48 "type": "string", 49 "editor": "textfield" 50 }, 51 "PG_DATA_PRICE_TABLE": { 52 "title": "Postgres table name for price data", 53 "description": "Table name for storing price, original price, stock status, ...", 54 "type": "string", 55 "editor": "textfield" 56 } 57 }, 58 "required": [ 59 "mode" 60 ] 61}

apify.json

1{ 2 "name": "bike-components-bike-components-de-scraper", 3 "version": "0.1", 4 "buildTag": "latest", 5 "env": null, 6 "defaultRunOptions": { 7 "build": "latest", 8 "timeoutSecs": 3600, 9 "memoryMbytes": 1024 10 } 11}

main.js

import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, parsePrice, save } from "./_utils/common.js";

/** Route labels used to dispatch each request to its handler. */
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

/** Run modes accepted in the actor input (`mode`). */
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});

const BASE_URL = `https://www.bike-components.de`;

/** Page size requested from the catalog API. */
const PRODUCTS_PER_PAGE = 72;

/**
 * Seeds the crawler with the start request for the given mode.
 *
 * FULL starts at the brand index (every brand is then enqueued by the INDEX
 * handler); TEST scrapes a single brand listing page.
 *
 * @param {string} mode - One of the `MODE` values.
 * @param {CheerioCrawler} crawler - Crawler to add the start request to.
 * @returns {Promise<void>}
 * @throws {Error} When `mode` is not a known mode (otherwise the run would
 *   finish "successfully" without scraping anything).
 */
async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `${BASE_URL}/en/brands/`,
      },
    ]);
    return;
  }
  if (mode === MODE.TEST) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `${BASE_URL}/en/100-/`,
      },
    ]);
    return;
  }
  throw new Error(
    `Unknown mode "${mode}", expected one of: ${Object.values(MODE).join(`, `)}`
  );
}

/**
 * Parses a raw price string from the API into a number.
 *
 * @param {unknown} raw - e.g. `124.99€` or ` <span>from</span> 120.99€`.
 *   NOTE(review): assumes the API may omit the field (null/undefined) for
 *   products without a strike-through price — confirm against real responses.
 * @returns {number|null} The amount, or null when the input is missing or
 *   does not parse to a truthy number (NaN and 0 both map to null, as before).
 */
function toAmountOrNull(raw) {
  // parsePrice calls String methods on its argument, so guard non-strings here
  if (typeof raw !== `string` || raw === ``) return null;
  return parsePrice(raw)?.amount || null;
}

/**
 * Maps one product entry of the catalog API response to the output shape.
 *
 * @param {{ data: object }} el - Entry of `initialData.products`.
 * @returns {object} Product with pid, name, url, img, inStock, prices, currency.
 */
function toProduct(el) {
  const { data } = el;
  return {
    pid: data.productId.toString(),
    name: data.name,
    url: `${BASE_URL}${data.link}`,
    img: `${BASE_URL}${data.imageMedium.path}`, // jpeg
    inStock: data.stockQuantity > 0,
    currentPrice: toAmountOrNull(data.price),
    originalPrice: toAmountOrNull(data.strikeThroughPrice),
    currency: `EUR`,
  };
}

/**
 * Builds the catalog API URL for one page of a brand's products.
 *
 * @param {string} brandId - Manufacturer id extracted from the brand page.
 * @param {number} page - Zero-based page index.
 * @returns {URL}
 */
function buildCatalogUrl(brandId, page) {
  const url = new URL(`/en/api/v1/catalog/DE/property/`, BASE_URL);
  url.searchParams.set(`m[0]`, brandId); // serialized as m%5B0%5D
  url.searchParams.set(`page`, String(page));
  url.searchParams.set(`productsPerPage`, String(PRODUCTS_PER_PAGE));
  return url;
}

const router = createCheerioRouter();

// Brand index: enqueue every brand listing page
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  await enqueueLinks({
    selector: `.container-manufacturer-list-for-letter .site-link`,
    userData: { label: LABELS.PRODUCTS },
  });
});

// Brand listing page: read the brand id, then page through the catalog API
router.addHandler(LABELS.PRODUCTS, async ({ $, request, log }) => {
  // Get brand id from HTML, it's needed for the API
  // https://share.cleanshot.com/3YvVXs
  const brandIdMatch = $(`body`)
    .text()
    .match(/"manufacturerId":(\d+)}/);
  if (!brandIdMatch) {
    // Fail with a readable message instead of a TypeError on `null[1]`
    throw new Error(
      `[PRODUCTS] ${request.url}: manufacturerId not found in page`
    );
  }
  const brandId = brandIdMatch[1];
  log.info(`[PRODUCTS] ${request.url}, brandId: ${brandId}`);

  // Paginate products via API
  let hasMorePages = true;
  let page = 0;
  while (hasMorePages) {
    const res = await fetch(buildCatalogUrl(brandId, page), {
      headers: {
        accept: `application/json`, // maybe not needed
        "cache-control": `no-cache`, // maybe not needed
      },
    });

    if (!res.ok) {
      throw new Error(
        `[PRODUCTS] ${request.url}: API returned ${res.status} ${res.statusText}`
      );
    }

    const resJson = await res.json();
    const rawProducts = resJson?.initialData?.products;
    const paging = resJson?.initialData?.paging;
    if (!Array.isArray(rawProducts) || !paging) {
      throw new Error(
        `[PRODUCTS] ${request.url}: unexpected API response shape on page ${page}`
      );
    }

    log.info(
      `[PRODUCTS] ${request.url}: page: ${page}, products: ${rawProducts.length}`
    );

    // Parsing!
    await save(rawProducts.map(toProduct));

    // Pagination logic: continue while the API reports a later last page
    if (paging.last > paging.current) {
      page++;
    } else {
      hasMorePages = false;
    }
  }
});

void Actor.main(async () => {
  const input = await Actor.getInput();
  // Upper-case input keys in `rest` are forwarded to init()
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `bike-components-de` }, rest);
  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

package.json

1{ 2 "name": "bike-components-bike-components-de-scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 4 "type": "module", 5 "scripts": { 6 "start": "node ./main.js", 7 "push-to-apify-platform": "npx apify push" 8 }, 9 "dependencies": { 10 "apify3": "npm:apify@^3.0.2", 11 "crawlee": "*", 12 "pg": "*", 13 "pg-connection-string": "*", 14 "dotenv": "*", 15 "find-config": "*", 16 "@elastic/elasticsearch": "*", 17 "filenamify": "*", 18 "@crawlee/memory-storage": "*" 19 }, 20 "apify": { 21 "title": "Bike Components (bike-components.de) scraper", 22 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 23 "isPublic": true, 24 "isDeprecated": false, 25 "isAnonymouslyRunnable": true, 26 "notice": "", 27 "pictureUrl": "", 28 "seoTitle": "", 29 "seoDescription": "", 30 "categories": [ 31 "ECOMMERCE" 32 ] 33 } 34}

.actor/actor.json

1{ 2 "actorSpecification": 1, 3 "name": "bike-components-bike-components-de-scraper", 4 "title": "Bike Components (bike-components.de) scraper", 5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 6 "version": "0.1.0", 7 "storages": { 8 "dataset": { 9 "actorSpecification": 1, 10 "title": "Bike Components (bike-components.de) scraper", 11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 12 "views": { 13 "overview": { 14 "title": "Overview", 15 "description": "Overview of the most important fields", 16 "transformation": { 17 "fields": [ 18 "pid", 19 "name", 20 "url", 21 "img", 22 "inStock", 23 "currentPrice", 24 "originalPrice", 25 "currency" 26 ] 27 }, 28 "display": { 29 "component": "table", 30 "columns": [ 31 { 32 "label": "Pid", 33 "field": "pid", 34 "format": "text" 35 }, 36 { 37 "label": "Name", 38 "field": "name", 39 "format": "text" 40 }, 41 { 42 "label": "Url", 43 "field": "url", 44 "format": "link" 45 }, 46 { 47 "label": "Img", 48 "field": "img", 49 "format": "image" 50 }, 51 { 52 "label": "In Stock", 53 "field": "inStock", 54 "format": "boolean" 55 }, 56 { 57 "label": "Current Price", 58 "field": "currentPrice", 59 "format": "number" 60 }, 61 { 62 "label": "Original Price", 63 "field": "originalPrice", 64 "format": "number" 65 }, 66 { 67 "label": "Currency", 68 "field": "currency", 69 "format": "text" 70 } 71 ] 72 } 73 } 74 } 75 } 76 } 77}

.actor/logo.png

_utils/common.js

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Configuration, Dataset } from 'crawlee'
import { MemoryStorage } from '@crawlee/memory-storage'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

// Properties attached to every locally stored item
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

// Module state, populated by init() and read by save()
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Reads the short (7 chars) hash of the checked-out commit from `../.git`.
 * This is best-effort metadata: any failure (no repository, packed refs, ...)
 * is logged and yields undefined instead of aborting the actor.
 *
 * @returns {string|undefined}
 */
function readGitCommitShort () {
  try {
    const gitDir = path.join(process.cwd(), `..`, `.git`)
    const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
    // Detached HEAD: the file holds the commit hash itself, not `ref: ...`
    if (!head.startsWith(`ref:`)) return head.substring(0, 7)
    const ref = head.replace(/^ref:\s*/, ``) // refs/heads/main
    return fs.readFileSync(path.join(gitDir, ref), `utf8`).substring(0, 7)
  } catch (err) {
    console.warn(`[init] Could not read git commit: ${err.message}`)
    return undefined
  }
}

/**
 * Makes sure the Elastic index exists and has float mappings for the price fields.
 *
 * @param {ElasticClient} client
 * @returns {Promise<void>}
 */
async function enforceElasticIndexMapping (client) {
  const doesIndexExist = await client.indices.exists({ index: elasticIndexName })
  if (!doesIndexExist) await client.indices.create({ index: elasticIndexName })
  await client.indices.putMapping({
    index: elasticIndexName,
    body: {
      properties: {
        _discount: { type: `float` },
        originalPrice: { type: `float` },
        currentPrice: { type: `float` },
      },
    },
  })
}

/**
 * Connects to Postgres and verifies that the required tables exist in the
 * `public` schema.
 *
 * @param {string} connectionString
 * @param {string[]} requiredTables - Table names that must exist.
 * @returns {Promise<pg.Client>} Connected client.
 * @throws {Error} When one of the tables is missing; the connection is closed first.
 */
async function connectPg (connectionString, requiredTables) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()
  try {
    const { rows } = await client.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)
    const existingTables = new Set(rows.map((row) => row.table_name))
    for (const tableName of requiredTables) {
      if (!existingTables.has(tableName)) {
        throw new Error(`Table ${tableName} does not exist in database ${pgConfig.database}`)
      }
    }
  } catch (err) {
    // Do not leave the connection open; the original error is the one that matters
    await client.end().catch(() => {})
    throw err
  }
  // TODO: Handle pgClient closing
  return client
}

/**
 * Initializes module state used by save(): actor name, optional in-memory
 * storage, and optional Elastic / Postgres clients. Everything is driven by
 * environment variables, which are first extended from `restInput`.
 *
 * @param {{ actorNameOverride?: string }} options
 * @param {object} [restInput] - Actor input; upper-case keys are copied to process.env.
 * @returns {Promise<void>}
 * @throws {Error} When a configured Postgres table does not exist.
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitCommitShort = readGitCommitShort()
    if (gitCommitShort) globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    try {
      await enforceElasticIndexMapping(elasticClient)
    } catch (err) {
      if (err?.message?.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceElasticIndexMapping(elasticClient)
      } else {
        // Still non-fatal (as before), but no longer silent
        console.warn(`[init] Elastic index mapping could not be enforced: ${err?.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPg(process.env.PG_CONNECTION_STRING, [
      process.env.PG_DATA_TABLE,
    ])
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPg(process.env.PG_CONNECTION_STRING_NORMALIZED, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])
  }
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Creates a stable key for a URL: sha256 (hex) of the URL without its protocol.
 *
 * Only a leading `scheme://` is stripped, so URLs that contain `://` later on
 * (e.g. in a query parameter) are hashed in full, and URLs without a protocol
 * are hashed as they are.
 *
 * @param {string} url
 * @returns {string} Hex digest.
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = String(url).replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}

/**
 * Resolves once the given point in time is reached; resolves right away when
 * it is already in the past.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const difference = datetime - Date.now()
  if (difference > 0) {
    await new Promise((resolve) => {
      setTimeout(resolve, difference)
    })
  }
}

// TODO: Uff, nicer! But at least it's tested
/**
 * Extracts a numeric amount from a price string such as `124.99€`,
 * `1.234,56 Kč` or ` <span>from</span> 120.99€`.
 *
 * A trailing `,`/`.` followed by exactly two digits is treated as the decimal
 * part; every other separator is treated as a thousands separator.
 *
 * @param {unknown} string - Raw price. null/undefined are treated as an empty
 *   string, other non-strings are stringified.
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when no
 *   digits are found. `currency` is never detected and is always undefined.
 */
export function parsePrice (string) {
  const currency = undefined // kept for the return shape only
  const text = string === undefined || string === null ? `` : String(string)
  const noText = text.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  let amount
  if (decimals) {
    // Everything before the *last* separator is the integer part, even when
    // the same character is also used as thousands separator
    const mainAmount = noText.slice(0, decimals.index).replace(/\D/g, ``)
    amount = Number.parseFloat(`${mainAmount}.${decimals[2]}`)
  } else {
    amount = Number.parseInt(noText.replace(/[,.]/g, ``), 10)
  }
  return { amount, currency }
}

/**
 * Converts a value to a number, or null when it is missing or not numeric.
 *
 * @param {unknown} str
 * @returns {number|null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined || str === null) return null
  // Number(`  `) is 0, so blank strings have to be rejected explicitly
  if (typeof str === `string` && str.trim() === ``) return null
  const num = Number(str)
  return Number.isNaN(num) ? null : num
}

/**
 * Quotes a Postgres identifier (table or column name).
 *
 * @param {string} name
 * @returns {string}
 */
function quoteIdentifier (name) {
  return `"${String(name).replace(/"/g, `""`)}"`
}

/**
 * Builds a parameterized multi-row INSERT.
 *
 * Columns are taken from the first row and every row is read by column name,
 * so rows with a different key order (or missing keys) stay aligned; missing
 * values are inserted as NULL.
 *
 * @param {string} tableName - Table in the `public` schema.
 * @param {object[]} rows - Non-empty list of rows.
 * @param {boolean} onConflictDoNothing - Append `ON CONFLICT DO NOTHING`.
 * @returns {{ text: string, values: unknown[] }}
 */
function buildInsertQuery (tableName, rows, onConflictDoNothing) {
  const columns = Object.keys(rows[0])
  const values = []
  const tuples = rows.map((row) => {
    const placeholders = columns.map((column) => {
      values.push(row[column] ?? null)
      return `$${values.length}`
    })
    return `(${placeholders.join(`, `)})`
  })
  const text = [
    `INSERT INTO public.${quoteIdentifier(tableName)} (${columns.map(quoteIdentifier).join(`, `)})`,
    `VALUES ${tuples.join(`, `)}`,
    onConflictDoNothing ? `ON CONFLICT DO NOTHING` : ``,
  ].filter(Boolean).join(`\n`)
  return { text, values }
}

/**
 * Inserts rows into a table and logs the inserted row count. Unique
 * constraint violations are logged as a warning; other errors are rethrown.
 *
 * @param {pg.Client} client
 * @param {string} tableName
 * @param {object[]} rows
 * @param {{ onConflictDoNothing?: boolean, logLabel: string }} options
 * @returns {Promise<void>}
 */
async function insertRows (client, tableName, rows, { onConflictDoNothing = false, logLabel }) {
  const { text, values } = buildInsertQuery(tableName, rows, onConflictDoNothing)
  try {
    const { rowCount } = await client.query(text, values)
    console.log(`[save] ${logLabel}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Stores scraped items in every storage that is configured:
 *  - Apify dataset (when running on Apify, unless APIFY_DONT_STORE_IN_DATASET is `true`)
 *  - JSON files in `<cwd>/<actorName>.storage/data` (when running on macOS)
 *  - Postgres, raw and/or normalized (when init() created the clients)
 *
 * @param {object|object[]} objs - One item or a list of items.
 * @returns {Promise<void>}
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) {
    console.log(`No data to save.`)
    return
  }

  // if run on Apify
  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)
  const apifyProps = isAtHome
    ? {
        __APIFY_ACTOR_ID: process.env.APIFY_ACTOR_ID,
        __APIFY_ACTOR_RUN_ID: process.env.APIFY_ACTOR_RUN_ID,
        __APIFY_ACTOR_BUILD_ID: process.env.APIFY_ACTOR_BUILD_ID,
        __APIFY_ACTOR_BUILD_NUMBER: process.env.APIFY_ACTOR_BUILD_NUMBER,
        __APIFY_ACTOR_TASK_ID: process.env.APIFY_ACTOR_TASK_ID,
      }
    : {}
  // Note: dotenv is not casting vars, so they are strings
  if (isAtHome && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) {
    // One batched call instead of one call per item
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    const dataDir = path.join(process.cwd(), `${actorName}.storage`, `data`)
    fs.mkdirSync(dataDir, { recursive: true })

    for (const item of items) {
      const objExtended = { ...item, actorName, ...globalLogsProps, ...apifyProps }
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const dataFilePath = path.join(dataDir, `${filenamify(id)}.json`)
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  const scrapedAt = new Date().toISOString().split(`T`)[0] // YYYY-MM-DD

  if (pgClient) {
    // TODO: This is becoming not nice, and not clear
    const objsPg = items.map((obj) => ({ ...obj, shop: actorName, scrapedAt }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg, {
      logLabel: `saved to database`,
    })
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, {
      onConflictDoNothing: true,
      logLabel: `saved to database (data)`,
    })
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, {
      onConflictDoNothing: true,
      logLabel: `saved to database (price)`,
    })
  }

  if (elasticClient) {
    // TODO: Indexing into Elastic is not implemented; the client is only
    // used by init() to enforce the index mapping.
  }
}

/**
 * Copies upper-case keys of the actor input into process.env, so that
 * settings can come either from the environment or from the input.
 *
 * null/undefined values are skipped: process.env stringifies everything, and
 * the resulting `"null"` / `"undefined"` strings would be truthy.
 * Only the key names are logged, as values may hold secrets such as
 * connection strings.
 *
 * @param {object} [input]
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}

/** Truthy when the process runs with `--inspect` or with a preloaded module whose name contains `debug`. */
export const isInspect =
  process.execArgv.join().includes(`--inspect`) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)
Developer
Maintained by Community
Actor stats
  • 5 users
  • 314 runs
  • Modified 10 months ago
Categories

You might also like these Actors