
BikerBoarder (biker-boarder.de) scraper

  • strajk/bikerboarder-biker-boarder-de-scraper
  • Users: 4
  • Runs: 439
  • Created by Pavel Dolecek

Scrapes product titles, prices, images and availability. Does NOT scrape product details.


Dockerfile

FROM apify/actor-node:16

COPY package.json ./

RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

COPY . ./

INPUT_SCHEMA.json

{
  "title": "BikerBoarder (biker-boarder.de) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL"
      ],
      "enumTitles": [
        "TEST mode (scrapes only \"Evoc\" & \"Fox\" brands)",
        "FULL"
      ]
    },
    "APIFY_DONT_STORE_IN_DATASET": {
      "sectionCaption": "Advanced",
      "sectionDescription": "Advanced options, use only if you know what you're doing.",
      "title": "Don't store in dataset",
      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "PG_CONNECTION_STRING_NORMALIZED": {
      "title": "Postgres connection string for normalized data",
      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_TABLE": {
      "title": "Postgres table name for product data",
      "description": "Table name for storing product name, url, image, ...",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_PRICE_TABLE": {
      "title": "Postgres table name for price data",
      "description": "Table name for storing price, original price, stock status, ...",
      "type": "string",
      "editor": "textfield"
    }
  },
  "required": [
    "mode"
  ]
}
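
An example input that exercises this schema might look like the following (the connection string and table names below are placeholders, not values taken from the project):

{
  "mode": "TEST",
  "APIFY_DONT_STORE_IN_DATASET": false,
  "PG_CONNECTION_STRING_NORMALIZED": "postgresql://user:password@host:5432/shops",
  "PG_DATA_TABLE": "products",
  "PG_DATA_PRICE_TABLE": "product_prices"
}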

README.md

# BikerBoarder (biker-boarder.de) scraper

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

## Output example

* **pid** `string` e.g. *226707*
* **name** `string` e.g. *Evoc Line 28l, heather ruby*
* **url** `string` e.g. *https://www.biker-boarder.de/evoc/1942902_pa.html*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *115.90*
* **originalPrice** `number` e.g. *145.00*
* **currency** `string` e.g. *EUR*
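
Putting the example values together, a single dataset record looks roughly like this (the `img` URL is a made-up placeholder, since the README does not give one):

{
  "pid": "226707",
  "name": "Evoc Line 28l, heather ruby",
  "url": "https://www.biker-boarder.de/evoc/1942902_pa.html",
  "img": "https://www.biker-boarder.de/example-image.jpg",
  "inStock": true,
  "currentPrice": 115.90,
  "originalPrice": 145.00,
  "currency": "EUR"
}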

apify.json

{
  "name": "bikerboarder-biker-boarder-de-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 1024
  }
}

main.js

This file is 125 lines long. Only the first 50 are shown.

/**
 * Dev notes
 * ===
 * The original price is not available on the listing page,
 * so we have to calculate it from the current price and the discount percentage.
 */

import { URL } from "node:url";
import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, save } from "./_utils/common.js";

const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

const MODE = {
  TEST: `TEST`,
  FULL: `FULL`,
};

const BASE_URL = `https://www.biker-boarder.de`;

async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `${BASE_URL}/brands`,
      },
    ]);
  } else if (mode === MODE.TEST) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `${BASE_URL}/evoc`,
      },
    ]);
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `${BASE_URL}/fox`,
      },
    ]);
  }
}
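
The dev note at the top of main.js says the original price has to be derived from the current price and a discount percentage, but the handler that does this falls in the part of the file not shown above. A minimal sketch of that calculation, assuming the listing renders a discount badge such as "-20%" (the badge format and the function name are assumptions, not taken from the actual code):

// Sketch only: assumes a discount badge text like `-20%`; the real main.js may parse it differently.
function deriveOriginalPrice(currentPrice, discountText) {
  const match = /(\d+(?:[.,]\d+)?)\s*%/.exec(discountText ?? ``);
  if (!match) return currentPrice; // no discount badge → current price is the original price
  const discountPct = Number(match[1].replace(`,`, `.`));
  // currentPrice = originalPrice * (1 - discountPct / 100)
  // ⇒ originalPrice = currentPrice / (1 - discountPct / 100)
  return Math.round((currentPrice / (1 - discountPct / 100)) * 100) / 100;
}

// e.g. deriveOriginalPrice(115.90, `-20%`) → 144.88, close to the 145.00 in the README example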

package.json

{
  "name": "bikerboarder-biker-boarder-de-scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify3": "npm:apify@^3.0.2",
    "crawlee": "*",
    "pg": "*",
    "pg-connection-string": "*",
    "dotenv": "*",
    "find-config": "*",
    "@elastic/elasticsearch": "*",
    "filenamify": "*"
  },
  "apify": {
    "title": "BikerBoarder (biker-boarder.de) scraper",
    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "bikerboarder-biker-boarder-de-scraper",
  "title": "BikerBoarder (biker-boarder.de) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "BikerBoarder (biker-boarder.de) scraper",
      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "pid",
              "name",
              "url",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Pid",
                "field": "pid",
                "format": "text"
              },
              {
                "label": "Name",
                "field": "name",
                "format": "text"
              },
              {
                "label": "Url",
                "field": "url",
                "format": "link"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

_utils/common.js

This file is 328 lines long. Only the first 50 are shown.

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Dataset } from 'crawlee'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

let actorName
let pgClient
let pgClientNormalized
let elasticClient
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },