Český ráj (ceskyraj.com) scraper

  • strajk-old/cesky-raj-ceskyraj-com-scraper
  • Modified
  • Users 2
  • Runs 298
  • Created by Author's avatarPavel Dolecek

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

Český ráj (ceskyraj.com) scraper

Dockerfile

# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16

# Copy only the manifest before installing, separately from the rest of the
# sources (presumably so the dependency layer can be cached — confirm).
COPY package.json ./

# Install production dependencies only, skipping optional ones, with the npm
# progress bar disabled and quiet output.
RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

# Copy the remaining actor sources into the image.
COPY . ./

INPUT_SCHEMA.json

{
  "title": "Český ráj (ceskyraj.com) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enumTitles": [
        "TEST",
        "FULL",
        "SINGLE"
      ],
      "enum": [
        "TEST",
        "FULL",
        "SINGLE"
      ]
    },
    "country": {
      "title": "Country",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "CZ",
      "prefill": "CZ",
      "enumTitles": [
        "CZ",
        "SK",
        "UK",
        "DE",
        "AT",
        "HU"
      ],
      "enum": [
        "CZ",
        "SK",
        "UK",
        "DE",
        "AT",
        "HU"
      ]
    },
    "debug": {
      "title": "Debug",
      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
      "type": "boolean",
      "editor": "checkbox",
      "default": false
    }
  },
  "required": [
    "mode",
    "country"
  ]
}

README.md

# Český ráj (ceskyraj.com) scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

## Output example

* **itemId** `string` e.g. *p1390167*
* **itemName** `string` e.g. *Pedály Crankbrothers Stamp 7 orange*
* **itemUrl** `string` e.g. *https://www.ceskyraj.com/pedaly-crankbrothers-stamp-7-orange-p1141669/*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *3896*
* **originalPrice** `number` e.g. *3999*
* **currency** `string` e.g. *CZK*

apify.json

{
  "name": "cesky-raj-ceskyraj-com-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "template": "project_cheerio_crawler"
}

main.js

This file is 293 lines long. Only the first 50 are shown. Show all

/**
 * TODO:
 * - consider proxies
 * - DRY price parsing
 * - different countries to input
 *
 * Beware that same product can have multiple valid urls
 * - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
 * - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
 * - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
 *
 * Variants can affect price
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
 *
 * Pagination:
 * Beware: only the next 3 pages are listed in the pager,
 * -> the total number of pages must be calculated from the total number of products and the number of products per page
 */

import Apify from "apify";
import cheerio from "cheerio";
import { gotScraping } from "got-scraping";
import { withPersistedStats } from "./_utils/stats.js";

const { log } = Apify.utils;

// String enum of the labels known to this file: each key maps to its own name.
var LABEL = LABEL || {};
LABEL.INDEX = "INDEX";
LABEL.PRODUCTS = "PRODUCTS";
// String enum of run modes; the values match the `mode` options listed in
// INPUT_SCHEMA.json (TEST, FULL, SINGLE).
var MODE = Object.assign(MODE || {}, {
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
});
var Country;

// TODO: Maybe unify with Country enum
(function (Country) {
  Country["CZ"] = "CZ";
  Country["SK"] = "SK";
  Country["UK"] = "UK";
  Country["DE"] = "DE";
  Country["AT"] = "AT";
  Country["HU"] = "HU";

package.json

{
  "name": "cesky-raj-ceskyraj-com-scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify": "*",
    "@thi.ng/atom": "*",
    "cheerio": "*",
    "got-scraping": "*"
  },
  "apify": {
    "title": "Český ráj (ceskyraj.com) scraper",
    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "issuesEnabled": true,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "cesky-raj-ceskyraj-com-scraper",
  "title": "Český ráj (ceskyraj.com) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Český ráj (ceskyraj.com) scraper",
      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "itemId",
              "itemName",
              "itemUrl",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Item ID",
                "field": "itemUrl",
                "format": "link",
                "textField": "itemId"
              },
              {
                "label": "Item Name",
                "field": "itemName",
                "format": "text"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

_utils/CloudflareUnblocker.js

This file is 428 lines long. Only the first 50 are shown. Show all

import Apify from "apify"
import { Session } from "apify/build/session_pool/session.js"
import { BrowserPool, PlaywrightPlugin } from "browser-pool"
import playwright from "playwright"

const { utils: { log, requestAsBrowser, puppeteer } } = Apify

export default class CloudflareUnblocker {
  // unblockUrl: string
  // proxyConfiguration: any
  // browserPool: BrowserPool
  // _shouldUseCustomTLS: boolean
  // log: any
  // name: string

  /**
   * Stores the unblocker options on the instance, creates the browser pool
   * and sets up a prefixed child logger.
   *
   * @param {object} options
   * @param {*} options.unblockUrl - stored as `this.unblockUrl`
   *   (the field notes above the constructor list it as a string)
   * @param {*} options.proxyConfiguration - stored as `this.proxyConfiguration`
   */
  constructor (options) {
    // Store options
    this.unblockUrl = options.unblockUrl
    this.proxyConfiguration = options.proxyConfiguration

    // Create browser pool
    // A single Playwright/Firefox plugin is registered; Firefox is launched
    // with headless mode disabled.
    this.browserPool = new BrowserPool({
      retireBrowserAfterPageCount: 50, // TODO: explain
      maxOpenPagesPerBrowser: 10, // TODO: explain
      browserPlugins: [
        new PlaywrightPlugin(playwright.firefox, {
          launchOptions: { headless: false },
        }),
      ],
    })

    // Starts disabled; `unblock` reads this flag (its later use is not
    // visible in this excerpt).
    this._shouldUseCustomTLS = false

    // Extending CrawlerExtension caused error:
    // Class extends value #<Object> is not a constructor or null
    // But all it should do is to add log method, so I'm adding it here
    this.name = this.constructor.name
    // Child logger whose prefix is the class name.
    this.log = Apify.utils.log.child({ prefix: this.constructor.name })
  }

  /**
   * Main function that unblocks your session.
   */
  async unblock ({ session, request }) {
    if (this._isSessionBeingRenewed(session)) {
      request.retryCount = 0
      this._throwError("Session is being renewed")
    }

    const oldShouldUseTLS = this._shouldUseCustomTLS

_utils/common.js

import { createHash } from 'crypto';

// inspired by @drobnikj
/**
 * Builds a unique key for a URL: the hex-encoded SHA-256 digest of the URL
 * with its leading protocol (`scheme://`) removed, so that e.g. the `http`
 * and `https` variants of the same address share one key.
 *
 * Only the leading protocol is stripped. A `://` occurring later in the URL
 * (for example a redirect target inside the query string) is kept, so such
 * URLs do not collapse onto the same key. A URL without any protocol is
 * hashed as-is.
 *
 * @param {string} url - URL to derive the key from
 * @returns {string} 64-character lowercase hex SHA-256 digest
 * @throws {TypeError} when `url` is not a string
 */
const createUniqueKeyFromUrl = (url) => {
  if (typeof url !== 'string') {
    throw new TypeError(`Expected url to be a string, got ${typeof url}`);
  }
  // Anchored at the start so only the real protocol prefix is removed.
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, '');
  return createHash('sha256').update(cleanUrl).digest('hex');
};

_utils/stats.js

// inspired by hlidac-shopu
import Apify from "apify";
import { defAtom } from "@thi.ng/atom";

// TODO: make this the lowest common denominator
// Fallback counter values: withPersistedStats uses this object when there is
// neither a persisted "STATS" value nor an explicit `init` argument.
// `denied` and `ok` are the two counters read by the denied-ratio log line in
// Stats.save().
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0
};

/** Updater used with `swapIn`: returns the given value plus one. */
const inc = (value) => value + 1;

/** Updater used with `swapIn`: returns the given value minus one. */
const dec = (value) => value - 1;

// TODO: stats should be in atom and updated via swap function atomically
/**
 * Counter container backed by an atom.
 *
 * The counters live in `this.stats` (created with `defAtom`) and are updated
 * through `swapIn`. Constructing an instance also starts a timer that logs the
 * current counters every 20 seconds; call `save(true)` to stop that timer.
 */
class Stats {
  /**
   * @param {object} init - initial counters, keyed by counter name
   */
  constructor(init) {
    this.stats = defAtom(init);
    // Periodic progress log; cleared by `save(true)`.
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /**
   * Increments the counter stored under `key` by one.
   * @param {string} key
   */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /**
   * Decrements the counter stored under `key` by one.
   * @param {string} key
   */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /**
   * Adds `value` to the counter stored under `key`.
   * @param {string} key
   * @param {number} value
   */
  add(key, value) {
    this.stats.swapIn(key, x => x + value);
  }

  /**
   * @returns {object} the current counters
   */
  get() {
    return this.stats.deref();
  }

  /** Logs the current counters as a JSON string. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Persists the counters under the "STATS" key and logs a summary.
   *
   * @param final {boolean} - If true, clearInterval apply
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.stats.deref();
    await Apify.setValue("STATS", this.get());
    Apify.utils.log.info("STATS saved!");
    // Skipped when `ok` is 0/missing, which also avoids dividing by zero.
    if (stats.ok) {
      // Parenthesised on purpose: `/` binds tighter than `??`, so without the
      // parentheses the expression is `denied ?? (0 / ok)` and logs
      // `denied * 100` instead of a ratio.
      const deniedRatio = (stats.denied ?? 0) / stats.ok;
      Apify.utils.log.info(
        `Denied ratio: ${deniedRatio * 100} %`
      );
    }
    this.log();
  }
}

/**
 *
 * @param {function} fn
 * @param {*} init
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  // Precedence of the starting counters: persisted "STATS" value first, then
  // the caller-supplied `init`, then the module defaults.
  const persisted = await Apify.getValue("STATS");
  const initial = persisted ?? init ?? defaultStats;

  // `fn` may reshape the starting counters before they are wrapped.
  const state = new Stats(fn(initial));

  // Save (without stopping the periodic log) on both platform events.
  function persistState() {
    return state.save();
  }
  Apify.events.on("persistState", persistState);
  Apify.events.on("migrating", persistState);

  return state;
}