BikerBoarder (biker-boarder.de) scraper
- strajk/bikerboarder-biker-boarder-de-scraper
- Modified
- Users 4
- Runs 439
- Created by
Pavel Dolecek
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
# Apify base image: Node.js 16 with the Actor runtime preinstalled.
FROM apify/actor-node:16
# Copy only package.json first so the npm install layer is cached by Docker
# and is not re-run when just the actor source changes.
COPY package.json ./
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
# Copy the actor source last to keep the dependency layer cacheable.
COPY . ./
INPUT_SCHEMA.json
{
"title": "BikerBoarder (biker-boarder.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "TEST mode scrapes only the \"Evoc\" and \"Fox\" brands; FULL mode scrapes all brands from the brand index.",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enum": [
"TEST",
"FULL"
],
"enumTitles": [
"TEST mode (scrapes only \"Evoc\" & \"Fox\" brands)",
"FULL"
]
},
"APIFY_DONT_STORE_IN_DATASET": {
"sectionCaption": "Advanced",
"sectionDescription": "Advanced options, use only if you know what you're doing.",
"title": "Don't store in dataset",
"description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
"type": "boolean",
"default": false,
"editor": "checkbox"
},
"PG_CONNECTION_STRING_NORMALIZED": {
"title": "Postgres connection string for normalized data",
"description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
"type": "string",
"editor": "textfield"
},
"PG_DATA_TABLE": {
"title": "Postgres table name for product data",
"description": "Table name for storing product name, url, image, ...",
"type": "string",
"editor": "textfield"
},
"PG_DATA_PRICE_TABLE": {
"title": "Postgres table name for price data",
"description": "Table name for storing price, original price, stock status, ...",
"type": "string",
"editor": "textfield"
}
},
"required": [
"mode"
]
}
README.md
# BikerBoarder (biker-boarder.de) scraper
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
## Output example
* **pid** `string` e.g. *226707*
* **name** `string` e.g. *Evoc Line 28l, heather ruby*
* **url** `string` e.g. *https://www.biker-boarder.de/evoc/1942902_pa.html*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *115.90*
* **originalPrice** `number` e.g. *145.00*
* **currency** `string` e.g. *EUR*
apify.json
{
"name": "bikerboarder-biker-boarder-de-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"defaultRunOptions": {
"build": "latest",
"timeoutSecs": 3600,
"memoryMbytes": 1024
}
}
main.js
This file is 125 lines long. Only the first 50 are shown. Show all
/**
* Dev notes
* ===
* original price is not available in listing page,
* so we have to calculate it from the price and discount percentage
*/
import { URL } from "node:url";
import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, save } from "./_utils/common.js";
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};
// Run modes. TEST limits the crawl to two known brand pages; FULL starts from
// the brand index. Frozen plain object replaces the TypeScript-compiled
// var/IIFE enum — MODE.TEST / MODE.FULL keep the same string values.
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});
const BASE_URL = `https://www.biker-boarder.de`;
/**
 * Seeds the crawler with the initial request(s) for the given run mode.
 *
 * - FULL: enqueue the brand index page (label INDEX); the router discovers
 *   every brand listing from there.
 * - TEST: enqueue only the "evoc" and "fox" brand listing pages
 *   (label PRODUCTS), batched into a single addRequests call instead of
 *   the original two sequential calls.
 *
 * @param {string} mode - One of MODE.TEST / MODE.FULL.
 * @param {import("crawlee").CheerioCrawler} crawler - Crawler to seed.
 * @throws {Error} On an unrecognized mode — the original silently enqueued
 *   nothing, which made a misconfigured input hard to diagnose.
 */
async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `${BASE_URL}/brands`,
      },
    ]);
  } else if (mode === MODE.TEST) {
    // One batched call: addRequests accepts an array, so two round-trips
    // to the request queue are unnecessary.
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `${BASE_URL}/evoc`,
      },
      {
        userData: { label: LABELS.PRODUCTS },
        url: `${BASE_URL}/fox`,
      },
    ]);
  } else {
    throw new Error(`Unknown mode: ${mode}`);
  }
}
package.json
{
"name": "bikerboarder-biker-boarder-de-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify3": "npm:apify@^3.0.2",
"crawlee": "*",
"pg": "*",
"pg-connection-string": "*",
"dotenv": "*",
"find-config": "*",
"@elastic/elasticsearch": "*",
"filenamify": "*"
},
"apify": {
"title": "BikerBoarder (biker-boarder.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"isPublic": true,
"isDeprecated": false,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}
.actor/actor.json
{
"actorSpecification": 1,
"name": "bikerboarder-biker-boarder-de-scraper",
"title": "BikerBoarder (biker-boarder.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "BikerBoarder (biker-boarder.de) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"pid",
"name",
"url",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Pid",
"field": "pid",
"format": "text"
},
{
"label": "Name",
"field": "name",
"format": "text"
},
{
"label": "Url",
"field": "url",
"format": "link"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}
_utils/common.js
This file is 328 lines long. Only the first 50 are shown. Show all
import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Dataset } from 'crawlee'
// Load the nearest .env found walking up from the CWD (monorepo root) so
// local runs pick up credentials without exporting them manually.
config({ path: findConfig(`.env`) })
// Elasticsearch index shared by the shop scrapers in this monorepo.
const elasticIndexName = `actors-monorepo-shops`
// Properties attached to every shipped log line; extended at runtime
// (e.g. __GIT_COMMIT is added in init() on macOS dev machines).
const globalLogsProps = {
__NODE_STARTED: new Date().toISOString(),
}
// Module-level singletons, lazily assigned in init(); each stays undefined
// until its backing service is configured via environment variables.
let actorName // resolved from override, script filename, or APIFY_ACTOR_ID
let pgClient // Postgres client — presumably for raw data; init() not fully visible here, confirm
let pgClientNormalized // Postgres client for normalized data — TODO confirm against full init()
let elasticClient // created only when ELASTIC_CLOUD_ID is set
export async function init ({ actorNameOverride }, restInput) {
parseEnvFromInput(restInput)
if (os.platform() === `darwin`) {
const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
const basename = path.basename(filePath) // foo.ts
actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
.split(` `)[1]
.trim()
.replace(`refs/heads/`, ``)
const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
const gitCommitShort = gitCommit.substring(0, 7)
globalLogsProps.__GIT_COMMIT = gitCommitShort
}
if (process.env.APIFY_IS_AT_HOME) {
actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
}
/* ELASTIC */
/* ======= */
if (process.env.ELASTIC_CLOUD_ID) {
elasticClient = new ElasticClient({
cloud: { id: process.env.ELASTIC_CLOUD_ID },