Actor is under maintenance
This actor may be unreliable while under maintenance. Would you like to try a similar actor instead?
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
- Modified
- Users: 17
- Runs: 233
Dockerfile
# Build on the Apify Node.js 16 actor base image.
FROM apify/actor-node:16
# Copy only package.json first so the dependency-install layer below can be
# reused from cache when just the source files change.
COPY package.json ./
# Turn off npm's progress output, then install production dependencies only,
# skipping optional ones.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
# Copy the rest of the actor source into the image.
COPY . ./
INPUT_SCHEMA.json
{
"title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enum": [
"TEST",
"FULL",
"SINGLE"
],
"enumTitles": [
"TEST mode (scrapes only a few categories)",
"FULL",
"SINGLE"
]
},
"country": {
"title": "Country",
"description": "",
"type": "string",
"editor": "select",
"default": "CZ",
"prefill": "CZ",
"enum": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
],
"enumTitles": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
]
},
"debug": {
"title": "Debug",
"description": "Debug mode prints more logs, disables concurrency and other optimizations.",
"type": "boolean",
"editor": "checkbox",
"default": false
}
},
"required": [
"mode",
"country"
]
}
README.md
# Alza (alza.cz/sk/co.uk/de/at/hu) scraper
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
## Output example
* **itemId** `string` e.g. *6731144*
* **itemName** `string` e.g. *iPhone 13 Pro 128GB*
* **itemUrl** `string` e.g. *https://alza.cz/iphone-13-pro-128gb-grafitovo-siva-d6731144.htm*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *12990*
* **originalPrice** `number` e.g. *14990*
* **currency** `string` e.g. *CZK*
apify.json
{
"name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"defaultRunOptions": {
"build": "latest",
"timeoutSecs": 3600,
"memoryMbytes": 1024
}
}
main.js
This file is 307 lines long. Only the first 50 are shown. Show all
/**
* TODO:
* - consider proxies
* - DRY price parsing
* - different countries to input
*
* Beware that same product can have multiple valid urls
* - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
* - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
* - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
*
* Variants can affect price
* - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
* - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
*
* Pagination:
* beware: only the next 3 pages are listed
* -> need to use total amount of products & product per page to calculate total amount of pages
*/
import Apify from "apify";
import cheerio from "cheerio";
import _ from "lodash";
import { gotScraping } from "got-scraping";
import { createUniqueKeyFromUrl } from "./_utils/common.js";
const { log } = Apify.utils;
// String enum of labels; each member's value equals its own name.
const LABEL = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};
// String enum of modes; the values match the "mode" enum in INPUT_SCHEMA.json.
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
};
var Country;
// TODO: Maybe unify with Country enum
(function (Country) {
Country["CZ"] = "CZ";
Country["SK"] = "SK";
Country["UK"] = "UK";
Country["DE"] = "DE";
Country["AT"] = "AT";
package.json
{
"name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify": "*",
"cheerio": "*",
"lodash": "*",
"got-scraping": "*"
},
"apify": {
"title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"isPublic": true,
"isDeprecated": false,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}
.actor/actor.json
{
"actorSpecification": 1,
"name": "alza-alza-cz-sk-co-uk-de-at-hu-scraper",
"title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "Alza (alza.cz/sk/co.uk/de/at/hu) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"itemId",
"itemName",
"itemUrl",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Item ID",
"field": "itemUrl",
"format": "link",
"textField": "itemId"
},
{
"label": "Item Name",
"field": "itemName",
"format": "text"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}
.actor/logo.png
_utils/common.js
import { createHash } from 'crypto'
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a deterministic key for a URL: the SHA-256 hex digest of the URL
 * with its leading protocol removed, so the `http://` and `https://` forms of
 * the same address produce the same key.
 *
 * @param {string} url
 * @return {string} hex-encoded SHA-256 digest
 */
export const createUniqueKeyFromUrl = (url) => {
  // Strip only the leading `scheme://`. Splitting on every `://` would cut off
  // URLs that embed another URL (e.g. in a query parameter) and make distinct
  // URLs collide; it would also leave nothing to hash for protocol-less URLs.
  const cleanUrl = url.replace(/^[^:/?#]*:\/\//, ``)
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
/**
 * Resolves once the given point in time has been reached. Resolves without
 * waiting when the time is already in the past (or the date is invalid).
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  // setTimeout treats delays above 2^31 - 1 ms as 1 ms, so long waits are
  // split into chunks that stay within the limit.
  const maxTimeoutMs = 2 ** 31 - 1
  let remaining = datetime - Date.now()
  // Re-check the clock after every timer: covers both chunked waits and
  // timers that fire marginally early.
  while (remaining > 0) {
    const delay = Math.min(remaining, maxTimeoutMs)
    await new Promise((resolve) => {
      setTimeout(resolve, delay)
    })
    remaining = datetime - Date.now()
  }
}
/**
 * Extracts the whole-number amount from a price string, ignoring currency
 * symbols, spaces and other text. A trailing separator followed by exactly
 * two digits (e.g. `,00` or `.99`) is treated as the decimal part and dropped;
 * every other `,` or `.` is treated as a thousands separator.
 *
 * @param {string} string price text, e.g. `12 990,-` or `1.299,00 €`
 * @return {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text contains no digits before the decimal part; `currency` is not
 *   detected and is always undefined.
 */
export function parsePrice (string) {
  // Currency detection is not implemented; the key is kept in the result so
  // the returned shape stays the same for callers.
  const currency = undefined
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  // Cut at the position of the decimal separator itself rather than splitting
  // on its character, which may also appear earlier as a thousands separator.
  const wholePart = decimals ? noText.slice(0, decimals.index) : noText
  const amount = Number.parseInt(wholePart.replace(/[,.]/g, ``), 10)
  return { amount, currency }
}