Český ráj (ceskyraj.com) scraper avatar

Český ráj (ceskyraj.com) scraper

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Český ráj (ceskyraj.com) scraper

Český ráj (ceskyraj.com) scraper

strajk/cesky-raj-ceskyraj-com-scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6  && npm install --only=prod --no-optional
7
8COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "Český ráj (ceskyraj.com) scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "mode": {
8      "title": "Mode",
9      "description": "",
10      "type": "string",
11      "editor": "select",
12      "default": "TEST",
13      "prefill": "TEST",
14      "enum": [
15        "TEST",
16        "FULL"
17      ],
18      "enumTitles": [
19        "TEST",
20        "FULL"
21      ]
22    },
23    "APIFY_DONT_STORE_IN_DATASET": {
24      "sectionCaption": "Advanced",
25      "sectionDescription": "Advanced options, use only if you know what you're doing.",
26      "title": "Don't store in dataset",
27      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
28      "type": "boolean",
29      "default": false,
30      "editor": "checkbox"
31    },
32    "PG_CONNECTION_STRING_NORMALIZED": {
33      "title": "Postgres connection string for normalized data",
34      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
35      "type": "string",
36      "editor": "textfield"
37    },
38    "PG_DATA_TABLE": {
39      "title": "Postgres table name for product data",
40      "description": "Table name for storing product name, url, image, ...",
41      "type": "string",
42      "editor": "textfield"
43    },
44    "PG_DATA_PRICE_TABLE": {
45      "title": "Postgres table name for price data",
46      "description": "Table name for storing price, original price, stock status, ...",
47      "type": "string",
48      "editor": "textfield"
49    }
50  },
51  "required": [
52    "mode"
53  ]
54}

apify.json

1{
2  "name": "cesky-raj-ceskyraj-com-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1import { URL } from "url";
2import { Actor } from "apify3";
3import { CheerioCrawler, createCheerioRouter } from "crawlee";
4import { init, save, toNumberOrNull } from "./_utils/common.js";
5
6var LABEL;
7
8(function (LABEL) {
9  LABEL["INDEX"] = "INDEX";
10  LABEL["PRODUCTS"] = "PRODUCTS";
11})(LABEL || (LABEL = {}));
12var MODE;
13
14(function (MODE) {
15  MODE["TEST"] = "TEST";
16  MODE["FULL"] = "FULL";
17})(MODE || (MODE = {}));
18
19async function enqueueInitial(type, crawler) {
20  if (type === MODE.FULL) {
21    await crawler.addRequests([
22      {
23        userData: { label: LABEL.INDEX },
24        url: `https://www.ceskyraj.com/nase-znacky/`,
25      },
26    ]);
27  } else if (type === MODE.TEST) {
28    const requests = [
29      {
30        userData: { label: LABEL.PRODUCTS },
31        url: `https://www.ceskyraj.com/hydrapak/`,
32      },
33      {
34        userData: { label: LABEL.PRODUCTS },
35        url: `https://www.ceskyraj.com/crankbrothers/`,
36      },
37      {
38        userData: { label: LABEL.PRODUCTS },
39        url: `https://www.ceskyraj.com/camelbak/`,
40      },
41    ];
42    await crawler.addRequests(requests);
43  }
44}
45
46const router = createCheerioRouter();
47
48router.addHandler(LABEL.INDEX, async ({ crawler, $ }) => {
49  const requests = [];
50  $(`#SubCategories ul.root li.leaf a.name`).each((i, el) => {
51    const url = $(el).attr(`href`);
52    const name = $(el).text();
53    requests.push({
54      url: `https://www.ceskyraj.com` + url,
55      userData: { label: LABEL.PRODUCTS, category: name },
56    });
57  });
58  await crawler.addRequests(requests);
59});
60
61router.addHandler(LABEL.PRODUCTS, async ({ crawler, $, request }) => {
62  console.log(`handleProducts`, request.url);
63  const PER_PAGE = 24;
64  if (!request.url.includes(`?f=`)) {
65    // on first page
66    const hasMore = !!$(`#CompoundPagingBottom a.nextProducts`).length;
67    const totalRaw = $(`#CompoundPagingBottom .displayedProducts`).text(); // e.g. `Zobrazeno 1-24 ze 52 produktů`
68    const total = parseInt(totalRaw.match(/ze (\d+) produktů/)[1]); // e.g. 52
69    let offset = PER_PAGE; // eg. 24, 48, 72, ...
70    while (offset < total) {
71      console.log(
72        `handleProducts`,
73        request.url,
74        `offset`,
75        offset,
76        `...enqueuing`
77      );
78      const paginatedUrl = new URL(request.url);
79      paginatedUrl.searchParams.set(`f`, offset.toString());
80      void crawler.addRequests([
81        {
82          userData: { label: LABEL.PRODUCTS },
83          url: paginatedUrl.toString(),
84        },
85      ]);
86      offset += PER_PAGE;
87    }
88  }
89
90  const products = [];
91  $(`#ProductsHost .ProductView`).each((i, el) => {
92    const id = $(el).attr(`id`);
93    const url = $(el).find(`h2 a`).attr(`href`);
94    const fullUrl = `https://www.ceskyraj.com` + url;
95    const title = $(el).find(`h2`).text();
96    const priceRaw = $(el).find(`.price.user`).text(); // `640&nbsp;Kč`
97    const price = priceRaw.replace(/\D/g, ``);
98    const priceOrigRaw = $(el).find(`.price.retail`).text(); // `640&nbsp;Kč`
99    const priceOrig = priceOrigRaw.replace(/\D/g, ``);
100    const img = $(el).find(`.crImages img.thumbnail`).attr(`data-src`);
101    const inStock = $(el).find(`.AvailabilityView`).text().includes(`skladem`);
102
103    const product = {
104      pid: id,
105      name: title,
106      url: fullUrl,
107      img: img,
108      inStock,
109      currentPrice: toNumberOrNull(price),
110      originalPrice: toNumberOrNull(priceOrig),
111      currency: `CZK`,
112    };
113    products.push(product);
114  });
115  await save(products);
116});
117
118void Actor.main(async () => {
119  const input = await Actor.getInput();
120  const { mode = MODE.FULL, ...rest } = input ?? {};
121  await init({ actorNameOverride: `ceskyraj-com` }, rest);
122  const crawler = new CheerioCrawler({ requestHandler: router });
123  await enqueueInitial(mode, crawler);
124  await crawler.run();
125});

package.json

1{
2  "name": "cesky-raj-ceskyraj-com-scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify3": "npm:apify@^3.0.2",
11    "crawlee": "*",
12    "pg": "*",
13    "pg-connection-string": "*",
14    "dotenv": "*",
15    "find-config": "*",
16    "@elastic/elasticsearch": "*",
17    "filenamify": "*"
18  },
19  "apify": {
20    "title": "Český ráj (ceskyraj.com) scraper",
21    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
22    "isPublic": true,
23    "isDeprecated": false,
24    "isAnonymouslyRunnable": true,
25    "notice": "",
26    "pictureUrl": "",
27    "seoTitle": "",
28    "seoDescription": "",
29    "categories": [
30      "ECOMMERCE"
31    ]
32  }
33}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "cesky-raj-ceskyraj-com-scraper",
4  "title": "Český ráj (ceskyraj.com) scraper",
5  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "Český ráj (ceskyraj.com) scraper",
11      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "pid",
19              "name",
20              "url",
21              "img",
22              "inStock",
23              "currentPrice",
24              "originalPrice",
25              "currency"
26            ]
27          },
28          "display": {
29            "component": "table",
30            "columns": [
31              {
32                "label": "Pid",
33                "field": "pid",
34                "format": "text"
35              },
36              {
37                "label": "Name",
38                "field": "name",
39                "format": "text"
40              },
41              {
42                "label": "Url",
43                "field": "url",
44                "format": "link"
45              },
46              {
47                "label": "Img",
48                "field": "img",
49                "format": "image"
50              },
51              {
52                "label": "In Stock",
53                "field": "inStock",
54                "format": "boolean"
55              },
56              {
57                "label": "Current Price",
58                "field": "currentPrice",
59                "format": "number"
60              },
61              {
62                "label": "Original Price",
63                "field": "originalPrice",
64                "format": "number"
65              },
66              {
67                "label": "Currency",
68                "field": "currency",
69                "format": "text"
70              }
71            ]
72          }
73        }
74      }
75    }
76  }
77}

.actor/logo.png

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Dataset } from 'crawlee'
13
14config({ path: findConfig(`.env`) })
15
16const elasticIndexName = `actors-monorepo-shops`
17
18const globalLogsProps = {
19  __NODE_STARTED: new Date().toISOString(),
20}
21
22let actorName
23let pgClient
24let pgClientNormalized
25let elasticClient
26export async function init ({ actorNameOverride }, restInput) {
27  parseEnvFromInput(restInput)
28
29  if (os.platform() === `darwin`) {
30    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
31    const basename = path.basename(filePath) // foo.ts
32    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
33    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
34      .split(` `)[1]
35      .trim()
36      .replace(`refs/heads/`, ``)
37    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
38    const gitCommitShort = gitCommit.substring(0, 7)
39    globalLogsProps.__GIT_COMMIT = gitCommitShort
40  }
41
42  if (process.env.APIFY_IS_AT_HOME) {
43    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
44  }
45
46  /* ELASTIC */
47  /* ======= */
48  if (process.env.ELASTIC_CLOUD_ID) {
49    elasticClient = new ElasticClient({
50      cloud: { id: process.env.ELASTIC_CLOUD_ID },
51      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
52    })
53
54    // const mapping = await elasticClient.indices.getMapping({ index: actorName })
55
56    // eslint-disable-next-line no-inner-declarations
57    async function enforceIndexMapping () {
58      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
59      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
60      await elasticClient.indices.putMapping({
61        index: elasticIndexName,
62        body: {
63          properties: {
64            _discount: { type: `float` },
65            originalPrice: { type: `float` },
66            currentPrice: { type: `float` },
67          },
68        },
69      })
70    }
71
72    try {
73      await enforceIndexMapping()
74    } catch (err) {
75      if (err.message.includes(`cannot be changed from type`)) {
76        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
77        await elasticClient.indices.delete({ index: elasticIndexName })
78        await enforceIndexMapping()
79      }
80    }
81  }
82
83  /* POSTGRESQL */
84  /* ========== */
85  if (process.env.PG_CONNECTION_STRING) {
86    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
87    // const pgPool = new pg.Pool(pgConfig)
88
89    pgClient = new pg.Client(pgConfig)
90    await pgClient.connect()
91
92    // Check if table exists and have proper columns
93    const { rows: tables } = await pgClient.query(`
94    SELECT table_name
95    FROM information_schema.tables
96    WHERE table_schema = 'public'
97  `)
98
99    // eslint-disable-next-line camelcase
100    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
101    if (!tableExists) {
102      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
103    }
104
105  // TODO: Handle pgClient closing
106  }
107
108  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
109    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)
110
111    pgClientNormalized = new pg.Client(pgConfig)
112    await pgClientNormalized.connect()
113
114    // Check if table exists and have proper columns
115    const { rows: tables } = await pgClientNormalized.query(`
116    SELECT table_name
117    FROM information_schema.tables
118    WHERE table_schema = 'public'
119  `)
120
121    // eslint-disable-next-line camelcase
122    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
123    // eslint-disable-next-line camelcase
124    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
125    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
126    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)
127
128  // TODO: Handle pgClient closing
129  }
130}
131
132// inspired by @drobnikj
133// TODO: Similar, but less obfuscated for easier debugging
134export const createUniqueKeyFromUrl = (url) => {
135  const hash = createHash(`sha256`)
136  const cleanUrl = url.split(`://`)[1] // Remove protocol
137  hash.update(cleanUrl)
138  return hash.digest(`hex`)
139}
140
141/**
142 *
143 * @param {Date} datetime
144 * @return {Promise<void>}
145 */
146export const sleepUntil = async (datetime) => {
147  const now = new Date()
148  const difference = datetime - now
149  if (difference > 0) {
150    return new Promise((resolve) => {
151      setTimeout(resolve, difference)
152    })
153  }
154  return Promise.resolve()
155}
156
157export function parsePrice (string) {
158  let amount, currency
159  const noText = string.replace(/[^\d,.]/g, ``)
160  const decimals = noText.match(/([,.])(\d{2})$/)
161  if (decimals) {
162    const decimalSeparator = decimals[1]
163    // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
164    const decimalAmount = decimals[2]
165    amount = parseInt(noText.split(decimalSeparator)[0])
166  } {
167    const justNumbers = noText.replace(/[,.]/g, ``)
168    amount = parseInt(justNumbers)
169  }
170  return { amount, currency }
171}
172
173export function toNumberOrNull (str) {
174  // TODO: Handle better, but only after adding test
175  if (str === undefined) return null
176  if (str === null) return null
177  if (str === ``) return null
178  const num = Number(str)
179  if (Number.isNaN(num)) return null
180  return num
181}
182
183export async function save (objs) {
184  if (!Array.isArray(objs)) objs = [objs]
185  if (objs.length === 0) return
186
187  const objsExtended = objs.map((obj) => {
188    const objExtended = {
189      ...obj,
190      actorName,
191      ...globalLogsProps,
192      // __NODE_VERSION: global.process.versions.node,
193      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
194    }
195    // if run on Apify
196    if (process.env.APIFY_IS_AT_HOME) {
197      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
198      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
199      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
200      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
201      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
202      if (!process.env.APIFY_DONT_STORE_IN_DATASET) void Dataset.pushData(obj)
203    }
204    return objExtended
205  })
206  // if runs on local machine (MacOS)
207  if (os.platform() === `darwin`) {
208    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
209    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
210    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
211    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
212    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
213    for (const objExtended of objsExtended) {
214      const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
215      const fileName = `${filenamify(id)}.json`
216      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
217      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
218    }
219  }
220
221  if (pgClient) {
222    const objsPg = objs.map((obj) => ({
223      ...obj,
224      // TODO: This is becoming not nice, and not clear
225      shop: actorName,
226      scrapedAt: new Date().toISOString().split(`T`)[0],
227    }))
228
229    const columns = getColumns(objsPg)
230    const values = getValues(objsPg)
231    const queryString = `
232        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
233        VALUES (${values})
234    `
235    try {
236      const { rowCount } = await pgClient.query(queryString)
237      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
238    } catch (err) {
239      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
240      else throw err
241    }
242  }
243
244  // Only make sense for HlidacShopu
245  if (pgClientNormalized) {
246    const objsPgData = objs.map((obj) => ({
247      shop: actorName,
248      pid: obj.pid,
249      name: obj.name,
250      url: obj.url,
251      img: obj.img,
252    }))
253
254    const objsPgDataPrice = objs.map((obj) => ({
255      shop: actorName,
256      pid: obj.pid,
257      scrapedAt: new Date().toISOString().split(`T`)[0],
258      currentPrice: obj.currentPrice,
259      originalPrice: obj.originalPrice,
260      inStock: obj.inStock,
261    }))
262
263    const queryString = `
264        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
265        VALUES (${getValues(objsPgData)})
266        ON CONFLICT DO NOTHING
267    `
268    try {
269      const { rowCount } = await pgClientNormalized.query(queryString)
270      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
271    } catch (err) {
272      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
273      else throw err
274    }
275
276    const queryStringPrice = `
277        INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
278        VALUES (${getValues(objsPgDataPrice)})
279        ON CONFLICT DO NOTHING
280    `
281    try {
282      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
283      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
284    } catch (err) {
285      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
286      else throw err
287    }
288  }
289
290  if (elasticClient) {
291    // .index creates or updates the document
292    // .create creates a new document if it doesn't exist, 409 if it does
293    // try {
294    //   const res = await elasticClient.index({
295    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
296    //     id, // foo-bar
297    //     document: objExtended, // {...}
298    //   })
299    // } catch (err) {
300    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
301    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
302    //   else throw err
303    // }
304  }
305}
306
307function getColumns (objs) {
308  return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
309}
310
311function getValues (objs) {
312  return objs.map(objPg => Object.values(objPg).map((value) => {
313    // escape strings to prevent SQL injection
314    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
315    // convert to DB specific null
316    if (typeof value === `undefined` || value === null) return `NULL`
317    return value
318  }).join(`, `)).join(`), (`)
319}
320
321export function parseEnvFromInput (input) {
322  const env = {}
323  for (const key in input) {
324    if (key === key.toUpperCase()) env[key] = input[key]
325  }
326  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
327  Object.assign(process.env, env)
328}
Developer
Maintained by Community
Categories