
Český ráj (ceskyraj.com) scraper
- strajk-old/cesky-raj-ceskyraj-com-scraper
- Modified
- Users 2
- Runs 298
- Created by
Pavel Dolecek
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16
# Copy only the package manifest before installing, so the install step
# below does not depend on the rest of the actor sources.
COPY package.json ./
# Install production dependencies only (skips devDependencies and optional
# packages), with npm's progress output turned off.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
# Copy the remaining actor files into the image.
COPY . ./
INPUT_SCHEMA.json
{
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enumTitles": [
"TEST",
"FULL",
"SINGLE"
],
"enum": [
"TEST",
"FULL",
"SINGLE"
]
},
"country": {
"title": "Country",
"description": "",
"type": "string",
"editor": "select",
"default": "CZ",
"prefill": "CZ",
"enumTitles": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
],
"enum": [
"CZ",
"SK",
"UK",
"DE",
"AT",
"HU"
]
},
"debug": {
"title": "Debug",
"description": "Debug mode prints more logs, disables concurrency and other optimizations.",
"type": "boolean",
"editor": "checkbox",
"default": false
}
},
"required": [
"mode",
"country"
]
}
README.md
# Český ráj (ceskyraj.com) scraper
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
## Output example
* **itemId** `string` e.g. *p1390167*
* **itemName** `string` e.g. *Pedály Crankbrothers Stamp 7 orange*
* **itemUrl** `string` e.g. *https://www.ceskyraj.com/pedaly-crankbrothers-stamp-7-orange-p1141669/*
* **img** `string`
* **inStock** `boolean` e.g. *true*
* **currentPrice** `number` e.g. *3896*
* **originalPrice** `number` e.g. *3999*
* **currency** `string` e.g. *CZK*
apify.json
{
"name": "cesky-raj-ceskyraj-com-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"template": "project_cheerio_crawler"
}
main.js
This file is 293 lines long. Only the first 50 are shown. Show all
/**
* TODO:
* - consider proxies
* - DRY price parsing
* - different countries to input
*
* Beware that same product can have multiple valid urls
* - https://www.alza.cz/iphone-13-512gb-cervena-levne-d6839524.htm
* - https://www.alza.cz/sport/victorias-secret-st-11128877-cc-4vmq-cerna
* - https://www.alza.cz/sport/victorias-secret-st-11156655-cc-38h2-bezova?dq=6920061
*
* Variants can affect price
* - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6804643 38k
* - https://www.alza.cz/asus-rog-zephyrus-g14-ga401?dq=6771118 39k
*
* Pagination:
* beware: only the next 3 pages are listed
* -> need to use total amount of products & product per page to calculate total amount of pages
*/
import Apify from "apify";
import cheerio from "cheerio";
import { gotScraping } from "got-scraping";
import { withPersistedStats } from "./_utils/stats.js";
const { log } = Apify.utils;
// String enum whose keys equal their values: INDEX and PRODUCTS.
// NOTE(review): presumably used as request labels by the crawler — confirm
// against the rest of main.js.
// `var` is kept on purpose so this declaration tolerates an earlier `LABEL`
// binding, exactly like the compiled-TypeScript form it replaces.
var LABEL = LABEL || {};
Object.assign(LABEL, {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
});
// String enum whose keys equal their values: TEST, FULL and SINGLE.
// The members match the "mode" options declared in INPUT_SCHEMA.json.
// `var` is kept on purpose so this declaration tolerates an earlier `MODE`
// binding, exactly like the compiled-TypeScript form it replaces.
var MODE = MODE || {};
Object.assign(MODE, {
  TEST: "TEST",
  FULL: "FULL",
  SINGLE: "SINGLE",
});
var Country;
// TODO: Maybe unify with Country enum
(function (Country) {
Country["CZ"] = "CZ";
Country["SK"] = "SK";
Country["UK"] = "UK";
Country["DE"] = "DE";
Country["AT"] = "AT";
Country["HU"] = "HU";
package.json
{
"name": "cesky-raj-ceskyraj-com-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify": "*",
"@thi.ng/atom": "*",
"cheerio": "*",
"got-scraping": "*"
},
"apify": {
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"isPublic": true,
"isDeprecated": false,
"issuesEnabled": true,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}
.actor/actor.json
{
"actorSpecification": 1,
"name": "cesky-raj-ceskyraj-com-scraper",
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "Český ráj (ceskyraj.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"itemId",
"itemName",
"itemUrl",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Item ID",
"field": "itemUrl",
"format": "link",
"textField": "itemId"
},
{
"label": "Item Name",
"field": "itemName",
"format": "text"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}
_utils/CloudflareUnblocker.js
This file is 428 lines long. Only the first 50 are shown. Show all
import Apify from "apify"
import { Session } from "apify/build/session_pool/session.js"
import { BrowserPool, PlaywrightPlugin } from "browser-pool"
import playwright from "playwright"
const { utils: { log, requestAsBrowser, puppeteer } } = Apify
export default class CloudflareUnblocker {
// unblockUrl: string
// proxyConfiguration: any
// browserPool: BrowserPool
// _shouldUseCustomTLS: boolean
// log: any
// name: string
constructor (options) {
// Store options
this.unblockUrl = options.unblockUrl
this.proxyConfiguration = options.proxyConfiguration
// Create browser pool
this.browserPool = new BrowserPool({
retireBrowserAfterPageCount: 50, // TODO: explain
maxOpenPagesPerBrowser: 10, // TODO: explain
browserPlugins: [
new PlaywrightPlugin(playwright.firefox, {
launchOptions: { headless: false },
}),
],
})
this._shouldUseCustomTLS = false
// Extending CrawlerExtension caused error:
// Class extends value #<Object> is not a constructor or null
// But all it should do is to add log method, so I'm adding it here
this.name = this.constructor.name
this.log = Apify.utils.log.child({ prefix: this.constructor.name })
}
/**
* Main function that unblocks your session.
*/
async unblock ({ session, request }) {
if (this._isSessionBeingRenewed(session)) {
request.retryCount = 0
this._throwError("Session is being renewed")
}
const oldShouldUseTLS = this._shouldUseCustomTLS
_utils/common.js
import { createHash } from 'crypto';
// inspired by @drobnikj
/**
 * Builds a protocol-independent unique key for a URL.
 *
 * Everything up to and including the first "://" is dropped, so the http and
 * https variants of one address share a key. The remainder is hashed with
 * SHA-256.
 *
 * Only the FIRST "://" is treated as the protocol separator: a URL embedded
 * later (e.g. in a query string) stays part of the hashed text, so addresses
 * that differ only in such an embedded URL get different keys. A URL with no
 * "://" at all is hashed as-is instead of throwing.
 *
 * @param {string} url - URL to derive the key from.
 * @returns {string} Hex-encoded SHA-256 digest of the URL without its protocol.
 */
const createUniqueKeyFromUrl = (url) => {
  const protocolSeparator = '://';
  const separatorIndex = url.indexOf(protocolSeparator);
  const cleanUrl = separatorIndex === -1
    ? url
    : url.slice(separatorIndex + protocolSeparator.length);
  return createHash('sha256').update(cleanUrl).digest('hex');
};
_utils/stats.js
// inspired by hlidac-shopu
import Apify from "apify";
import { defAtom } from "@thi.ng/atom";
// TODO: make this the lowest common denominator
// Counter set used when neither a persisted value nor an explicit initial
// value is available; every counter starts at zero.
const defaultStats = {
  urls: 0,
  items: 0,
  itemsDuplicate: 0,
  totalItems: 0,
  denied: 0,
  ok: 0,
};

// Unit step helpers applied to a single counter value.
const inc = (value) => value + 1;
const dec = (value) => value - 1;
// TODO: stats should be in atom and updated via swap function atomically
/**
 * Run counters kept in an atom, logged periodically and persisted to the
 * key-value store under the "STATS" key.
 */
class Stats {
  /**
   * @param {object} init - Initial counter values, keyed by counter name.
   */
  constructor(init) {
    this.stats = defAtom(init);
    // Log the counters every 20 seconds until `save(true)` clears the timer.
    this.interval = setInterval(() => this.log(), 20 * 1000);
  }

  /**
   * Increments the counter stored under `key` by one.
   * @param {string} key
   */
  inc(key) {
    this.stats.swapIn(key, inc);
  }

  /**
   * Decrements the counter stored under `key` by one.
   * @param {string} key
   */
  dec(key) {
    this.stats.swapIn(key, dec);
  }

  /**
   * Adds `value` to the counter stored under `key`.
   * @param {string} key
   * @param {number} value
   */
  add(key, value) {
    this.stats.swapIn(key, (current) => current + value);
  }

  /**
   * @returns {object} Current counter values.
   */
  get() {
    return this.stats.deref();
  }

  /** Writes the current counters to the log as JSON. */
  log() {
    const stats = this.stats.deref();
    Apify.utils.log.info(`stats: ${JSON.stringify(stats)}`);
  }

  /**
   * Persists the counters under the "STATS" key and logs a summary.
   *
   * @param final {boolean} - If true, the periodic logging timer is cleared.
   * @returns {Promise<void>}
   */
  async save(final = false) {
    if (final) {
      clearInterval(this.interval);
    }
    const stats = this.stats.deref();
    await Apify.setValue("STATS", stats);
    Apify.utils.log.info("STATS saved!");
    // Skipped when `ok` is zero or missing, which also avoids dividing by zero.
    if (stats.ok) {
      // Parentheses are required: `??` binds looser than `/`, so the
      // unparenthesized form would evaluate `denied ?? (0 / ok)`.
      const deniedRatio = ((stats.denied ?? 0) / stats.ok) * 100;
      Apify.utils.log.info(`Denied ratio: ${deniedRatio} %`);
    }
    this.log();
  }
}
/**
 * Creates a Stats instance that resumes from previously persisted counters.
 *
 * The starting value is the stored "STATS" value if present, otherwise
 * `init`, otherwise `defaultStats`. That value is passed through `fn`, and
 * the result seeds the Stats instance. The instance is saved again whenever
 * a "persistState" or "migrating" event fires.
 *
 * @param {function} fn - Maps the starting counters to the counters Stats is created with.
 * @param {*} init - Counters to start from when nothing has been persisted.
 * @returns {Promise<Stats>}
 */
export async function withPersistedStats(fn, init) {
  const persisted = await Apify.getValue("STATS");
  const initial = persisted ?? init ?? defaultStats;
  const state = new Stats(fn(initial));
  const persistState = () => state.save();
  for (const eventName of ["persistState", "migrating"]) {
    Apify.events.on(eventName, persistState);
  }
  return state;
}
}