Alert

Actor is under maintenance

This actor may be unreliable while under maintenance. Would you like to try a similar actor instead?

Actor picture

Alza (alza.cz/sk/co.uk/de/at/hu) scraper

strajk/alza-alza-cz-sk-co-uk-de-at-hu-scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

No credit card required

Author's avatarPavel Dolecek
  • Modified
  • Users17
  • Runs233
Actor picture
Alza (alza.cz/sk/co.uk/de/at/hu) scraper

Dockerfile

# Build on top of the apify/actor-node image, tag 16.
FROM apify/actor-node:16

# Copy only the package manifest before installing
# (presumably so the dependency layer can be cached independently of source changes).
COPY package.json ./

# Quiet npm and disable its progress bar, then install production dependencies only,
# skipping optional ones.
RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

# Copy the rest of the build context into the image.
COPY . ./

INPUT_SCHEMA.json

{
  "title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL",
        "SINGLE"
      ],
      "enumTitles": [
        "TEST mode (scrapes only a few categories)",
        "FULL",
        "SINGLE"
      ]
    },
    "country": {
      "title": "Country",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "CZ",
      "prefill": "CZ",
      "enum": [
        "CZ",
        "SK",
        "UK",
        "DE",
        "AT",
        "HU"
      ],
      "enumTitles": [
        "CZ",
        "SK",
        "UK",
        "DE",
        "AT",
        "HU"
      ]
    },
    "debug": {
      "title": "Debug",
      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
      "type": "boolean",
      "editor": "checkbox",
      "default": false
    }
  },
  "required": [
    "mode",
    "country"
  ]
}

README.md

# Alza (alza.cz/sk/co.uk/de/at/hu) scraper

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

## Output example

* **itemId** `string` e.g. *6731144*
* **itemName** `string` e.g. *iPhone 13 Pro 128GB*
* **itemUrl** `string` e.g. *https://alza.cz/iphone-13-pro-128gb-grafitovo-siva-d6731144.htm*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *12990*
* **originalPrice** `number` e.g. *14990*
* **currency** `string` e.g. *CZK*

apify.json

{
  "name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 1024
  }
}

main.js

This file is 307 lines long. Only the first 50 are shown. Show all

/**
 * TODO:
 * - consider proxies
 * - DRY price parsing
 * - different countries to input
 *
 * Beware that same product can have multiple valid urls
 * - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
 * - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
 * - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
 *
 * Variants can affect price
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
 * - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
 *
 * Pagination:
 * beware: only the next 3 pages are listed
 * -> need to use total amount of products & product per page to calculate total amount of pages
 */

import Apify from "apify";
import cheerio from "cheerio";
import _ from "lodash";
import { gotScraping } from "got-scraping";
import { createUniqueKeyFromUrl } from "./_utils/common.js";

const { log } = Apify.utils;

// String enum of labels; every member maps its own name to the same string.
// `var` plus the `LABEL || {}` fallback keeps the original merge-with-existing semantics.
var LABEL = LABEL || {};
LABEL.INDEX = "INDEX";
LABEL.PRODUCTS = "PRODUCTS";
// String enum of modes; the members match the `mode` values listed in INPUT_SCHEMA.json.
// `var` plus the `MODE || {}` fallback keeps the original merge-with-existing semantics.
var MODE = MODE || {};
MODE.TEST = "TEST";
MODE.FULL = "FULL";
MODE.SINGLE = "SINGLE";
var Country;

// TODO: Maybe unify with Country enum
(function (Country) {
  Country["CZ"] = "CZ";
  Country["SK"] = "SK";
  Country["UK"] = "UK";
  Country["DE"] = "DE";
  Country["AT"] = "AT";

package.json

{
  "name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify": "*",
    "cheerio": "*",
    "lodash": "*",
    "got-scraping": "*"
  },
  "apify": {
    "title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
  "title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "itemId",
              "itemName",
              "itemUrl",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Item ID",
                "field": "itemUrl",
                "format": "link",
                "textField": "itemId"
              },
              {
                "label": "Item Name",
                "field": "itemName",
                "format": "text"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

.actor/logo.png

_utils/common.js

import { createHash } from 'crypto'

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a unique key for a URL: the hex-encoded SHA-256 digest of the URL
 * with its leading `scheme://` removed, so `http://` and `https://` variants
 * of the same address yield the same key.
 *
 * Only a scheme at the very start of the string is stripped. Any later `://`
 * (e.g. a URL embedded in a query parameter) is kept, so such URLs do not
 * collide. A URL without a scheme is hashed as-is instead of throwing.
 *
 * @param {string} url
 * @return {string} 64-character lowercase hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  // Scheme grammar per RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``)
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}

/**
 * Returns a promise that settles once the given moment has been reached.
 * When the moment is not in the future, the promise is already resolved.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = (datetime) => {
  const remainingMs = datetime - new Date()
  // Written as a negated `> 0` so a NaN difference (invalid date) also resolves immediately.
  if (!(remainingMs > 0)) return Promise.resolve()
  return new Promise((wake) => {
    setTimeout(wake, remainingMs)
  })
}

/**
 * Extracts the whole-unit amount from a human-readable price string.
 *
 * Everything except digits, commas and dots is discarded. If what remains
 * ends with a separator followed by exactly two digits, that tail is treated
 * as the decimal part and dropped (the amount is truncated, not rounded).
 * All remaining commas and dots are treated as thousands separators.
 *
 * @param {string} string e.g. `1 299,00 Kč`, `12.990,-`, `€12.99`
 * @return {{ amount: number, currency: undefined }} `amount` is an integer,
 *   or NaN when the string holds no whole-unit digits. `currency` is not
 *   detected and is always undefined; the key is kept so the return shape is stable.
 */
export function parsePrice (string) {
  const currency = undefined
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  // Cut the decimal tail by position rather than splitting on the separator character,
  // so a thousands separator of the same kind cannot truncate the whole part.
  const wholePart = decimals ? noText.slice(0, decimals.index) : noText
  const amount = Number.parseInt(wholePart.replace(/[,.]/g, ``), 10)
  return { amount, currency }
}