BikerBoarder (biker-boarder.de) scraper avatar

BikerBoarder (biker-boarder.de) scraper

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
BikerBoarder (biker-boarder.de) scraper

BikerBoarder (biker-boarder.de) scraper

strajk/bikerboarder-biker-boarder-de-scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6  && npm install --only=prod --no-optional
7
8COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "BikerBoarder (biker-boarder.de) scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "mode": {
8      "title": "Mode",
9      "description": "",
10      "type": "string",
11      "editor": "select",
12      "default": "TEST",
13      "prefill": "TEST",
14      "enum": [
15        "TEST",
16        "FULL"
17      ],
18      "enumTitles": [
19        "TEST mode (scrapes only \"Evoc\" & \"Fox\" brands)",
20        "FULL"
21      ]
22    },
23    "APIFY_DONT_STORE_IN_DATASET": {
24      "sectionCaption": "Advanced",
25      "sectionDescription": "Advanced options, use only if you know what you're doing.",
26      "title": "Don't store in dataset",
27      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
28      "type": "boolean",
29      "default": false,
30      "editor": "checkbox"
31    },
32    "PG_CONNECTION_STRING_NORMALIZED": {
33      "title": "Postgres connection string for normalized data",
34      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
35      "type": "string",
36      "editor": "textfield"
37    },
38    "PG_DATA_TABLE": {
39      "title": "Postgres table name for product data",
40      "description": "Table name for storing product name, url, image, ...",
41      "type": "string",
42      "editor": "textfield"
43    },
44    "PG_DATA_PRICE_TABLE": {
45      "title": "Postgres table name for price data",
46      "description": "Table name for storing price, original price, stock status, ...",
47      "type": "string",
48      "editor": "textfield"
49    }
50  },
51  "required": [
52    "mode"
53  ]
54}

apify.json

1{
2  "name": "bikerboarder-biker-boarder-de-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1/**
2 * Dev notes
3 * ===
4 * original price is not available in listing page,
5 * so we have to calculate it from the price and discount percentage
6 */
7
8import { URL } from "node:url";
9import { Actor } from "apify3";
10import { CheerioCrawler, createCheerioRouter } from "crawlee";
11import { init, save } from "./_utils/common.js";
12
13const LABELS = {
14  INDEX: `INDEX`,
15  PRODUCTS: `PRODUCTS`,
16};
17
18var MODE;
19
20(function (MODE) {
21  MODE["TEST"] = "TEST";
22  MODE["FULL"] = "FULL";
23})(MODE || (MODE = {}));
24
25const BASE_URL = `https://www.biker-boarder.de`;
26
27async function enqueueInitial(mode, crawler) {
28  if (mode === MODE.FULL) {
29    await crawler.addRequests([
30      {
31        userData: { label: LABELS.INDEX },
32        url: `${BASE_URL}/brands`,
33      },
34    ]);
35  } else if (mode === MODE.TEST) {
36    await crawler.addRequests([
37      {
38        userData: { label: LABELS.PRODUCTS },
39        url: `${BASE_URL}/evoc`,
40      },
41    ]);
42    await crawler.addRequests([
43      {
44        userData: { label: LABELS.PRODUCTS },
45        url: `${BASE_URL}/fox`,
46      },
47    ]);
48  }
49}
50
51async function handleIndex({ $, crawler }) {
52  const requests = [];
53  $(`.brand__list a.brand`).each((i, el) => {
54    const relUrl = $(el).attr(`href`); // urls are relative
55    const url = `${BASE_URL}${relUrl}`;
56    const name = $(el).attr(`title`);
57    requests.push({
58      userData: { label: LABELS.PRODUCTS, category: name },
59      url,
60    });
61  });
62  await crawler.addRequests(requests);
63}
64
65async function handleProducts({ $, request, crawler }) {
66  if (!request.url.includes(`?page=`)) {
67    // on first page
68    const totalPages = $(`.productlist__head .pagination .page`).last().text(); // e.g. `8`
69    for (let i = 2; i <= totalPages; i++) {
70      // skip first page, that is already handled
71      const url = new URL(request.url);
72      url.searchParams.set(`page`, i.toString()); // toString() to make TS happy
73      void crawler.addRequests([
74        {
75          userData: { label: LABELS.PRODUCTS },
76          url: url.toString(),
77        },
78      ]);
79    }
80  }
81
82  const products = [];
83  $(`.articlebox__list .articlebox`).each((i, el) => {
84    const id = $(el).attr(`id`).replace(`pr-`, ``); // e.g. 226707
85    const relUrl = $(el).children(`a`).attr(`href`); // relative url
86    const url = `${BASE_URL}${relUrl}`;
87    const title = $(el).find(`h4`).text().trim();
88    const priceRaw = $(el).find(`.price .number`).text().trim(); // e.g. `ab €28,90`
89    const price = priceRaw.replace(/[^\d,]/g, ``).replace(`,`, `.`); // keep comma as it's decimal separator
90    let priceOrig = price;
91    let discount = 0;
92    const discountRaw = $(el).find(`.price .discount`).text().trim(); // e.g. `-42%`
93    if (discountRaw) {
94      discount = discountRaw.replace(/[^\d,]/g, ``); // keep comma as it's decimal separator
95      priceOrig = (price / (1 - discount / 100)).toFixed(2);
96    }
97    const img = $(el).find(`img.defaultimg`).attr(`src`);
98    const inStock = true;
99    const product = {
100      pid: id,
101      name: title,
102      url: url,
103      img: img,
104      inStock,
105      currentPrice: price,
106      originalPrice: priceOrig,
107      currency: `EUR`,
108    };
109    products.push(product);
110  });
111  void save(products);
112}
113
114const router = createCheerioRouter();
115router.addHandler(LABELS.INDEX, handleIndex);
116router.addHandler(LABELS.PRODUCTS, handleProducts);
117
118void Actor.main(async () => {
119  const input = await Actor.getInput();
120  const { mode = MODE.FULL, ...rest } = input ?? {};
121  await init({ actorNameOverride: `biker-boarder-de` }, rest);
122  const crawler = new CheerioCrawler({ requestHandler: router });
123  await enqueueInitial(mode, crawler);
124  await crawler.run();
125});

package.json

1{
2  "name": "bikerboarder-biker-boarder-de-scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify3": "npm:apify@^3.0.2",
11    "crawlee": "*",
12    "pg": "*",
13    "pg-connection-string": "*",
14    "dotenv": "*",
15    "find-config": "*",
16    "@elastic/elasticsearch": "*",
17    "filenamify": "*"
18  },
19  "apify": {
20    "title": "BikerBoarder (biker-boarder.de) scraper",
21    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
22    "isPublic": true,
23    "isDeprecated": false,
24    "isAnonymouslyRunnable": true,
25    "notice": "",
26    "pictureUrl": "",
27    "seoTitle": "",
28    "seoDescription": "",
29    "categories": [
30      "ECOMMERCE"
31    ]
32  }
33}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "bikerboarder-biker-boarder-de-scraper",
4  "title": "BikerBoarder (biker-boarder.de) scraper",
5  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "BikerBoarder (biker-boarder.de) scraper",
11      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "pid",
19              "name",
20              "url",
21              "img",
22              "inStock",
23              "currentPrice",
24              "originalPrice",
25              "currency"
26            ]
27          },
28          "display": {
29            "component": "table",
30            "columns": [
31              {
32                "label": "Pid",
33                "field": "pid",
34                "format": "text"
35              },
36              {
37                "label": "Name",
38                "field": "name",
39                "format": "text"
40              },
41              {
42                "label": "Url",
43                "field": "url",
44                "format": "link"
45              },
46              {
47                "label": "Img",
48                "field": "img",
49                "format": "image"
50              },
51              {
52                "label": "In Stock",
53                "field": "inStock",
54                "format": "boolean"
55              },
56              {
57                "label": "Current Price",
58                "field": "currentPrice",
59                "format": "number"
60              },
61              {
62                "label": "Original Price",
63                "field": "originalPrice",
64                "format": "number"
65              },
66              {
67                "label": "Currency",
68                "field": "currency",
69                "format": "text"
70              }
71            ]
72          }
73        }
74      }
75    }
76  }
77}

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Dataset } from 'crawlee'
13
14config({ path: findConfig(`.env`) })
15
16const elasticIndexName = `actors-monorepo-shops`
17
18const globalLogsProps = {
19  __NODE_STARTED: new Date().toISOString(),
20}
21
22let actorName
23let pgClient
24let pgClientNormalized
25let elasticClient
26export async function init ({ actorNameOverride }, restInput) {
27  parseEnvFromInput(restInput)
28
29  if (os.platform() === `darwin`) {
30    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
31    const basename = path.basename(filePath) // foo.ts
32    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
33    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
34      .split(` `)[1]
35      .trim()
36      .replace(`refs/heads/`, ``)
37    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
38    const gitCommitShort = gitCommit.substring(0, 7)
39    globalLogsProps.__GIT_COMMIT = gitCommitShort
40  }
41
42  if (process.env.APIFY_IS_AT_HOME) {
43    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
44  }
45
46  /* ELASTIC */
47  /* ======= */
48  if (process.env.ELASTIC_CLOUD_ID) {
49    elasticClient = new ElasticClient({
50      cloud: { id: process.env.ELASTIC_CLOUD_ID },
51      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
52    })
53
54    // const mapping = await elasticClient.indices.getMapping({ index: actorName })
55
56    // eslint-disable-next-line no-inner-declarations
57    async function enforceIndexMapping () {
58      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
59      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
60      await elasticClient.indices.putMapping({
61        index: elasticIndexName,
62        body: {
63          properties: {
64            _discount: { type: `float` },
65            originalPrice: { type: `float` },
66            currentPrice: { type: `float` },
67          },
68        },
69      })
70    }
71
72    try {
73      await enforceIndexMapping()
74    } catch (err) {
75      if (err.message.includes(`cannot be changed from type`)) {
76        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
77        await elasticClient.indices.delete({ index: elasticIndexName })
78        await enforceIndexMapping()
79      }
80    }
81  }
82
83  /* POSTGRESQL */
84  /* ========== */
85  if (process.env.PG_CONNECTION_STRING) {
86    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
87    // const pgPool = new pg.Pool(pgConfig)
88
89    pgClient = new pg.Client(pgConfig)
90    await pgClient.connect()
91
92    // Check if table exists and have proper columns
93    const { rows: tables } = await pgClient.query(`
94    SELECT table_name
95    FROM information_schema.tables
96    WHERE table_schema = 'public'
97  `)
98
99    // eslint-disable-next-line camelcase
100    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
101    if (!tableExists) {
102      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
103    }
104
105  // TODO: Handle pgClient closing
106  }
107
108  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
109    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)
110
111    pgClientNormalized = new pg.Client(pgConfig)
112    await pgClientNormalized.connect()
113
114    // Check if table exists and have proper columns
115    const { rows: tables } = await pgClientNormalized.query(`
116    SELECT table_name
117    FROM information_schema.tables
118    WHERE table_schema = 'public'
119  `)
120
121    // eslint-disable-next-line camelcase
122    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
123    // eslint-disable-next-line camelcase
124    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
125    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
126    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)
127
128  // TODO: Handle pgClient closing
129  }
130}
131
132// inspired by @drobnikj
133// TODO: Similar, but less obfuscated for easier debugging
134export const createUniqueKeyFromUrl = (url) => {
135  const hash = createHash(`sha256`)
136  const cleanUrl = url.split(`://`)[1] // Remove protocol
137  hash.update(cleanUrl)
138  return hash.digest(`hex`)
139}
140
141/**
142 *
143 * @param {Date} datetime
144 * @return {Promise<void>}
145 */
146export const sleepUntil = async (datetime) => {
147  const now = new Date()
148  const difference = datetime - now
149  if (difference > 0) {
150    return new Promise((resolve) => {
151      setTimeout(resolve, difference)
152    })
153  }
154  return Promise.resolve()
155}
156
157export function parsePrice (string) {
158  let amount, currency
159  const noText = string.replace(/[^\d,.]/g, ``)
160  const decimals = noText.match(/([,.])(\d{2})$/)
161  if (decimals) {
162    const decimalSeparator = decimals[1]
163    // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
164    const decimalAmount = decimals[2]
165    amount = parseInt(noText.split(decimalSeparator)[0])
166  } {
167    const justNumbers = noText.replace(/[,.]/g, ``)
168    amount = parseInt(justNumbers)
169  }
170  return { amount, currency }
171}
172
173export function toNumberOrNull (str) {
174  // TODO: Handle better, but only after adding test
175  if (str === undefined) return null
176  if (str === null) return null
177  if (str === ``) return null
178  const num = Number(str)
179  if (Number.isNaN(num)) return null
180  return num
181}
182
183export async function save (objs) {
184  if (!Array.isArray(objs)) objs = [objs]
185  if (objs.length === 0) return
186
187  const objsExtended = objs.map((obj) => {
188    const objExtended = {
189      ...obj,
190      actorName,
191      ...globalLogsProps,
192      // __NODE_VERSION: global.process.versions.node,
193      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
194    }
195    // if run on Apify
196    if (process.env.APIFY_IS_AT_HOME) {
197      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
198      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
199      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
200      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
201      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
202      if (!process.env.APIFY_DONT_STORE_IN_DATASET) void Dataset.pushData(obj)
203    }
204    return objExtended
205  })
206  // if runs on local machine (MacOS)
207  if (os.platform() === `darwin`) {
208    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
209    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
210    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
211    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
212    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
213    for (const objExtended of objsExtended) {
214      const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
215      const fileName = `${filenamify(id)}.json`
216      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
217      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
218    }
219  }
220
221  if (pgClient) {
222    const objsPg = objs.map((obj) => ({
223      ...obj,
224      // TODO: This is becoming not nice, and not clear
225      shop: actorName,
226      scrapedAt: new Date().toISOString().split(`T`)[0],
227    }))
228
229    const columns = getColumns(objsPg)
230    const values = getValues(objsPg)
231    const queryString = `
232        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
233        VALUES (${values})
234    `
235    try {
236      const { rowCount } = await pgClient.query(queryString)
237      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
238    } catch (err) {
239      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
240      else throw err
241    }
242  }
243
244  // Only make sense for HlidacShopu
245  if (pgClientNormalized) {
246    const objsPgData = objs.map((obj) => ({
247      shop: actorName,
248      pid: obj.pid,
249      name: obj.name,
250      url: obj.url,
251      img: obj.img,
252    }))
253
254    const objsPgDataPrice = objs.map((obj) => ({
255      shop: actorName,
256      pid: obj.pid,
257      scrapedAt: new Date().toISOString().split(`T`)[0],
258      currentPrice: obj.currentPrice,
259      originalPrice: obj.originalPrice,
260      inStock: obj.inStock,
261    }))
262
263    const queryString = `
264        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
265        VALUES (${getValues(objsPgData)})
266        ON CONFLICT DO NOTHING
267    `
268    try {
269      const { rowCount } = await pgClientNormalized.query(queryString)
270      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
271    } catch (err) {
272      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
273      else throw err
274    }
275
276    const queryStringPrice = `
277        INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
278        VALUES (${getValues(objsPgDataPrice)})
279        ON CONFLICT DO NOTHING
280    `
281    try {
282      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
283      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
284    } catch (err) {
285      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
286      else throw err
287    }
288  }
289
290  if (elasticClient) {
291    // .index creates or updates the document
292    // .create creates a new document if it doesn't exist, 409 if it does
293    // try {
294    //   const res = await elasticClient.index({
295    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
296    //     id, // foo-bar
297    //     document: objExtended, // {...}
298    //   })
299    // } catch (err) {
300    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
301    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
302    //   else throw err
303    // }
304  }
305}
306
307function getColumns (objs) {
308  return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
309}
310
311function getValues (objs) {
312  return objs.map(objPg => Object.values(objPg).map((value) => {
313    // escape strings to prevent SQL injection
314    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
315    // convert to DB specific null
316    if (typeof value === `undefined` || value === null) return `NULL`
317    return value
318  }).join(`, `)).join(`), (`)
319}
320
321export function parseEnvFromInput (input) {
322  const env = {}
323  for (const key in input) {
324    if (key === key.toUpperCase()) env[key] = input[key]
325  }
326  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
327  Object.assign(process.env, env)
328}
Developer
Maintained by Community