Actor picture

FDF Bike Shop (fdfbikeshop.cz) scraper

strajk/fdf-bike-shop-fdfbikeshop-cz-scraper

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

No credit card required

Author: Pavel Dolecek
  • Modified
  • Users: 4
  • Runs: 297
Actor picture
FDF Bike Shop (fdfbikeshop.cz) scraper

Dockerfile

# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16

# Copy only the manifest first so the dependency-install layer below can be
# cached independently of changes to the rest of the source.
COPY package.json ./

# Install production dependencies only, skipping optional packages;
# npm's progress bar is disabled and its output is kept quiet.
RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

# Copy the remaining actor source into the image.
COPY . ./

INPUT_SCHEMA.json

{
  "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "TEST scrapes a few sample brand pages; FULL starts from the shop's sitemap.",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL"
      ],
      "enumTitles": [
        "TEST",
        "FULL"
      ]
    },
    "APIFY_DONT_STORE_IN_DATASET": {
      "sectionCaption": "Advanced",
      "sectionDescription": "Advanced options, use only if you know what you're doing.",
      "title": "Don't store in dataset",
      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, such as your own database.",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "PG_CONNECTION_STRING_NORMALIZED": {
      "title": "Postgres connection string for normalized data",
      "description": "If set, the actor will store normalized data in a Postgres database, in the PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables.",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_TABLE": {
      "title": "Postgres table name for product data",
      "description": "Table name for storing product name, url, image, ...",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_PRICE_TABLE": {
      "title": "Postgres table name for price data",
      "description": "Table name for storing price, original price, stock status, ...",
      "type": "string",
      "editor": "textfield"
    }
  },
  "required": [
    "mode"
  ]
}

README.md

# FDF Bike Shop (fdfbikeshop.cz) scraper

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

## Output example

* **pid** `string` e.g. *16678*
* **name** `string` e.g. *SHIMANO pedály PD-M520*
* **url** `string` e.g. *https://www.fdfbikeshop.cz/naslapne/shimano-pedaly-pd-m520/*
* **img** `string` e.g. *https://cdn.myshoptet.com/usr/www.fdfbikeshop.cz/user/shop/detail_small/16678_shimano-pedaly-pd-m520-cerne-v.jpg?622b077c*
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *599*
* **originalPrice** `number` e.g. *799*
* **currency** `string` e.g. *CZK*

apify.json

{
  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 1024
  }
}

main.js

This file is 148 lines long. Only the first 50 are shown. Show all

import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, save, toNumberOrNull } from "./_utils/common.js";

// Request labels, attached to requests through `userData.label`.
const LABELS = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};

/**
 * Run modes accepted by the actor; the values match the `mode` enum
 * declared in INPUT_SCHEMA.json.
 *
 * Declared as a frozen `const` object rather than a `var` filled in by an
 * IIFE, so the set of modes cannot be reassigned or mutated at runtime.
 */
const MODE = Object.freeze({
  TEST: "TEST",
  FULL: "FULL",
});

// Origin of the shop; request URLs in this file are built on top of it.
const baseUrl = "https://www.fdfbikeshop.cz";

/**
 * Seeds the crawler with its initial requests for the selected run mode.
 *
 * - `MODE.FULL`: enqueues the shop's sitemap, labelled `LABELS.INDEX`.
 * - `MODE.TEST`: enqueues a small fixed set of brand pages, labelled
 *   `LABELS.PRODUCTS`, in a single batched `addRequests` call.
 * - Any other value: enqueues nothing.
 *
 * @param {string} mode - One of the `MODE` values.
 * @param {{ addRequests: (requests: object[]) => Promise<unknown> }} crawler
 *   Crawler whose request queue should be seeded.
 * @returns {Promise<void>} Resolves once the requests have been handed to the crawler.
 */
async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      { userData: { label: LABELS.INDEX }, url: `${baseUrl}/sitemap.xml` },
    ]);
    return;
  }

  if (mode === MODE.TEST) {
    // Brand listing pages used as a quick sample of the shop.
    const sampleBrands = [`crankbrothers`, `dt-swiss`, `galfer`];
    // One batched call instead of one awaited call per brand page.
    await crawler.addRequests(
      sampleBrands.map((brand) => ({
        userData: { label: LABELS.PRODUCTS },
        url: `${baseUrl}/znacka/${brand}/`,
      })),
    );
  }
}

// Cheerio router from crawlee; its route handlers are presumably registered
// further down in this file (not shown in this excerpt) — verify there.
const router = createCheerioRouter();

package.json

{
  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify3": "npm:apify@^3.0.2",
    "crawlee": "*",
    "pg": "*",
    "pg-connection-string": "*",
    "dotenv": "*",
    "find-config": "*",
    "@elastic/elasticsearch": "*",
    "filenamify": "*"
  },
  "apify": {
    "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
    "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
  "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
      "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "pid",
              "name",
              "url",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Pid",
                "field": "pid",
                "format": "text"
              },
              {
                "label": "Name",
                "field": "name",
                "format": "text"
              },
              {
                "label": "Url",
                "field": "url",
                "format": "link"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

.actor/logo.png

_utils/common.js

This file is 328 lines long. Only the first 50 are shown. Show all

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Dataset } from 'crawlee'

// Load environment variables via dotenv from the `.env` file located by find-config.
config({ path: findConfig(`.env`) })

// Elasticsearch index name; its usage is outside this excerpt
// (presumably the index that log documents are written to — TODO confirm).
const elasticIndexName = `actors-monorepo-shops`

// Properties presumably attached to log entries (TODO confirm against the logging code).
// `init` adds `__GIT_COMMIT` to this object when running on macOS.
const globalLogsProps = {
  // ISO timestamp taken when this module is first evaluated.
  __NODE_STARTED: new Date().toISOString(),
}

// Module-level state, populated by `init`.
// Actor identifier: the override if given, else the script basename (macOS) or APIFY_ACTOR_ID (on the platform).
let actorName
// Presumably a Postgres client — its assignment is not visible in this excerpt.
let pgClient
// Presumably a Postgres client for the normalized data — its assignment is not visible in this excerpt.
let pgClientNormalized
// Elasticsearch client, created in `init` when ELASTIC_CLOUD_ID is set.
let elasticClient
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },