
Koloshop (koloshop.cz) scraper
Deprecated
Pricing
Pay per usage
Go to Store


Koloshop (koloshop.cz) scraper
Deprecated
Scrapes product titles, prices, images and availability. Does NOT scrape product details.
0.0 (0)
Pricing
Pay per usage
1
Total users
5
Monthly users
1
Last modified
3 years ago
Dockerfile
# Build on the apify/actor-node image, tag 16.
FROM apify/actor-node:16

# Copy the manifest on its own first so the install layer below can be cached
# for as long as package.json does not change.
COPY package.json ./

# Install dependencies without dev and optional packages.
# The backslash must end the line: written inline (`\ &&`) it escapes a space
# and passes it to `npm set` as an extra argument.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional

# Copy the remaining actor sources.
COPY . ./
INPUT_SCHEMA.json
{ "title": "Koloshop (koloshop.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "TEST", "prefill": "TEST", "enum": [ "TEST", "FULL" ], "enumTitles": [ "TEST", "FULL" ] }, "APIFY_DONT_STORE_IN_DATASET": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Don't store in dataset", "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", "type": "boolean", "default": false, "editor": "checkbox" }, "PG_CONNECTION_STRING_NORMALIZED": { "title": "Postgres connection string for normalized data", "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", "type": "string", "editor": "textfield" }, "PG_DATA_TABLE": { "title": "Postgres table name for product data", "description": "Table name for storing product name, url, image, ...", "type": "string", "editor": "textfield" }, "PG_DATA_PRICE_TABLE": { "title": "Postgres table name for price data", "description": "Table name for storing price, original price, stock status, ...", "type": "string", "editor": "textfield" } }, "required": [ "mode" ]}
apify.json
{ "name": "koloshop-koloshop-cz-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
1import { Actor } from "apify3";2import { BasicCrawler, createBasicRouter } from "crawlee";3import cheerio from "cheerio";4import { gotScraping } from "got-scraping";5import { init, save } from "./_utils/common.js";6
/** Request labels; each one is registered with the router as a handler key. */
const LABEL = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};

/** Run modes understood by `enqueueInitial`. */
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
};
/**
 * Adds the starting requests to the crawler for the given run mode.
 *
 * - `MODE.FULL`: a single INDEX request.
 * - `MODE.TEST`: one PRODUCTS request for each of two hard-coded brands.
 * - anything else: nothing is enqueued.
 *
 * The request URL is a placeholder; handlers fetch the real pages themselves
 * and requests are told apart by `uniqueKey`.
 *
 * @param {string} type - One of the `MODE` values.
 * @param {object} crawler - Object exposing `addRequests(requests)`.
 * @returns {Promise<void>}
 */
async function enqueueInitial(type, crawler) {
  switch (type) {
    case MODE.FULL:
      await crawler.addRequests([
        {
          url: `https://fake.xyz`,
          uniqueKey: `index`,
          userData: { label: LABEL.INDEX },
        },
      ]);
      break;

    case MODE.TEST: {
      const testBrands = [`bikeworkx`, `crankbrothers`];
      for (const brand of testBrands) {
        const brandRequest = {
          url: `https://fake.xyz`,
          uniqueKey: `brand:${brand}`,
          userData: { label: LABEL.PRODUCTS, brand },
        };
        await crawler.addRequests([brandRequest]);
      }
      break;
    }

    default:
      // Unrecognized mode: enqueue nothing.
      break;
  }
}
const router = createBasicRouter();

// INDEX handler: downloads the brand listing page and enqueues one PRODUCTS
// request per brand link found on it.
router.addHandler(LABEL.INDEX, async ({ crawler }) => {
  const { body } = await gotScraping({
    url: `https://www.koloshop.cz/znacky/`,
  });
  const $ = cheerio.load(body);

  const brandRequests = $(`#brands .brand a`)
    .toArray()
    .map((anchor) => {
      // Drop the trailing slash from the link target to get the brand slug.
      const slug = $(anchor).attr(`href`).replace(/\/$/, ``);
      return {
        url: `https://fake.xyz`,
        uniqueKey: `brand:${slug}`,
        userData: { label: LABEL.PRODUCTS, brand: slug },
      };
    });

  await crawler.addRequests(brandRequests);
});
// PRODUCTS handler: requests the full product list of one brand from the
// shop's AJAX endpoint, parses the returned HTML snippet and saves one record
// per product.
router.addHandler(LABEL.PRODUCTS, async ({ request }) => {
  const { brand } = request.userData;

  const form = new URLSearchParams({
    page_url: brand,
    "params_data[stranka][]": `all`, // all pages at once, no pagination needed
    "params_data[paging]": `normal`, // not sure what this does
  });

  const payload = await gotScraping({
    url: `https://www.koloshop.cz/ajax/parametryNew.php`,
    method: `POST`,
    body: form.toString(),
    headers: {
      accept: `*/*`,
      "accept-language": `en-US,en;q=0.9,cs;q=0.8,sk;q=0.7`,
      "content-type": `application/x-www-form-urlencoded; charset=UTF-8`,
    },
  }).json();

  const $ = cheerio.load(payload?.snippets?.productsList);

  // Keeps only the digits of a price label, e.g. `640 Kč` -> `640`.
  const digitsOnly = (text) => text.replace(/\D/g, ``);
  // Trimmed text content of the descendants matching `selector`.
  const textOf = ($node, selector) => $node.find(selector).text().trim();

  const products = $(`.thumb.relative a`)
    .toArray()
    .map((el) => {
      const $el = $(el);
      const price = digitsOnly(
        textOf($el, `.price-info .price, .price-info .price-new`)
      );
      const priceOrig = digitsOnly(textOf($el, `.price-info .price-previous`));

      return {
        pid: textOf($el, `.product-id`),
        name: textOf($el, `.product-name`),
        url: `https://www.koloshop.cz/${$el.attr(`href`)}`,
        img: $el.find(`img`).attr(`data-src`),
        inStock: $el.find(`.labels .in-stock`).length > 0,
        currentPrice: Number(price),
        originalPrice: priceOrig ? Number(priceOrig) : null, // TODO: Unify
        currency: `CZK`,
      };
    });

  await save(products);
});
// Entry point: reads the actor input, initializes the shared utilities with
// every input field except `mode`, then seeds and runs the crawler.
void Actor.main(async () => {
  const input = (await Actor.getInput()) ?? {};
  const { mode = MODE.FULL, ...rest } = input;

  await init({ actorNameOverride: `koloshop-cz` }, rest);

  const crawler = new BasicCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
package.json
{ "name": "koloshop-koloshop-cz-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify3": "npm:apify@^3.0.2", "crawlee": "*", "cheerio": "*", "got-scraping": "*", "pg": "*", "pg-connection-string": "*", "dotenv": "*", "find-config": "*", "@elastic/elasticsearch": "*", "filenamify": "*" }, "apify": { "title": "Koloshop (koloshop.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "koloshop-koloshop-cz-scraper", "title": "Koloshop (koloshop.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Koloshop (koloshop.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "pid", "name", "url", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Pid", "field": "pid", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'2import os from "os"3import path from "path"4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals5import fs from "fs"6import pg from "pg"7import pgConnectionString from 'pg-connection-string'8import { config } from 'dotenv'9import findConfig from "find-config"10import { Client as ElasticClient } from "@elastic/elasticsearch"11import filenamify from 'filenamify'12import { Dataset } from 'crawlee'13
// Load environment variables from the `.env` file located by find-config.
const dotenvPath = findConfig(`.env`)
config({ path: dotenvPath })

// Elasticsearch index whose mapping is enforced during init().
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every record extended by save().
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}
// Module state populated by init() and read by save().
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Reads the short (7 character) hash of the checked-out commit from the
 * `.git` directory one level above the current working directory.
 *
 * Handles both an attached HEAD (`ref: refs/heads/<branch>`) and a detached
 * HEAD (the file holds the hash itself).
 *
 * @returns {string} First 7 characters of the commit hash.
 * @throws {Error} When the git files cannot be read.
 */
function readGitCommitShort () {
  const gitDir = path.join(process.cwd(), `..`, `.git`)
  const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
  const refPrefix = `ref: `
  const commit = head.startsWith(refPrefix)
    ? fs.readFileSync(path.join(gitDir, head.slice(refPrefix.length)), `utf8`)
    : head
  return commit.substring(0, 7)
}

/**
 * Verifies that every given table exists in the `public` schema.
 *
 * @param {object} client - Connected pg client.
 * @param {string} database - Database name, used only in the error message.
 * @param {Array<string|undefined>} tableNames - Table names to look for.
 * @returns {Promise<void>}
 * @throws {Error} For the first table that is missing.
 */
async function assertTablesExist (client, database, tableNames) {
  const { rows: tables } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  const existing = new Set(tables.map((row) => row.table_name))
  for (const tableName of tableNames) {
    if (!existing.has(tableName)) {
      throw new Error(`Table ${tableName} does not exist in database ${database}`)
    }
  }
}

/**
 * Initializes the shared state used by save().
 *
 * Copies upper-case input keys into process.env, resolves the actor name and
 * connects to Elasticsearch / PostgreSQL when the corresponding environment
 * variables are set.
 *
 * @param {{ actorNameOverride?: string }} options - Optional explicit actor name.
 * @param {object} restInput - Actor input fields forwarded to parseEnvFromInput.
 * @returns {Promise<void>}
 * @throws {Error} When a configured PostgreSQL table does not exist, or when
 *   connecting to a configured database fails.
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    // The commit hash is only attached to saved records as metadata, so a
    // missing or unusual .git layout must not abort the run.
    try {
      globalLogsProps.__GIT_COMMIT = readGitCommitShort()
    } catch (err) {
      console.warn(`[init] could not determine git commit: ${err.message}`)
    }
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // Creates the index when missing and (re)applies the numeric field mappings.
    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Still non-fatal, but no longer swallowed silently.
        console.warn(`[init] Elastic index setup failed: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    await assertTablesExist(pgClient, pgConfig.database, [process.env.PG_DATA_TABLE])

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    await assertTablesExist(pgClientNormalized, pgConfig.database, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])

    // TODO: Handle pgClient closing
  }
}
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a unique key for a URL: the hex SHA-256 of the URL without its
 * leading scheme, so `http://` and `https://` variants map to the same key.
 *
 * Only the scheme at the very start is removed; a `://` occurring later in
 * the URL (e.g. inside a query parameter) is kept, so such URLs do not
 * collide. A string without a scheme is hashed as is.
 *
 * @param {string} url
 * @return {string} 64-character hex digest.
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
/**
 * Waits until the given point in time. Resolves right away when that time is
 * not in the future.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // Negated `>` so a NaN difference also resolves immediately.
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
/**
 * Extracts the whole-number amount from a price label such as `1 234,50 Kč`.
 *
 * A trailing separator followed by exactly two digits (`,50` / `.99`) is
 * treated as the decimal part and dropped; every other `,` or `.` is treated
 * as a thousands separator and removed.
 *
 * @param {string} string - Raw price text.
 * @return {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text contains no digits; `currency` is not detected and is always undefined.
 */
export function parsePrice (string) {
  let currency
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  // Cut at the decimal separator's position rather than splitting on its
  // character, which may also occur earlier as a thousands separator.
  const wholePart = decimals ? noText.slice(0, decimals.index) : noText
  const amount = parseInt(wholePart.replace(/[,.]/g, ``), 10)
  return { amount, currency }
}
/**
 * Converts a value with `Number()`, mapping "no value" and unparsable input
 * to null.
 *
 * @param {*} str - Value to convert.
 * @return {number|null} null for undefined, null, the empty string, or
 *   anything `Number()` turns into NaN.
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  const isMissing = str === undefined || str === null || str === ``
  if (isMissing) return null
  const parsed = Number(str)
  return Number.isNaN(parsed) ? null : parsed
}
/**
 * Runs one multi-row INSERT into `public."<tableName>"` and logs the row count.
 * A unique-constraint violation is downgraded to a warning; any other error
 * is rethrown.
 *
 * @param {object} client - Connected pg client.
 * @param {string} tableName - Target table in the `public` schema.
 * @param {object[]} rows - Non-empty list of rows sharing the same keys.
 * @param {string} logLabel - Suffix placed after "saved to database" in the log line.
 * @param {string} [conflictClause] - Optional clause appended after VALUES.
 * @returns {Promise<void>}
 */
async function insertRows (client, tableName, rows, logLabel, conflictClause = ``) {
  const queryString = `
    INSERT INTO public."${tableName}" (${getColumns(rows)})
    VALUES (${getValues(rows)})
    ${conflictClause}
  `
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`[save] saved to database${logLabel}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every configured destination:
 * the Apify dataset (when running on the platform), JSON files in
 * `<cwd>/<actorName>.storage/data` (on macOS), and the PostgreSQL clients
 * opened by init().
 *
 * @param {object|object[]} objs - One record or a list of records.
 * @returns {Promise<void>} Resolves once all writes, including the dataset
 *   push, have finished.
 * @throws {Error} On database errors other than unique-constraint violations,
 *   or when the dataset push fails.
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)

  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if run on Apify
    if (isAtHome) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  if (isAtHome) {
    // process.env values are always strings, so a boolean `false` from the
    // actor input arrives as "false", which is truthy. Compare the text.
    const flag = process.env.APIFY_DONT_STORE_IN_DATASET
    const disabledValues = [``, `false`, `0`, `null`, `undefined`]
    const skipDataset = flag !== undefined && !disabledValues.includes(flag.trim().toLowerCase())
    // Awaited so the records are stored before the caller moves on.
    // The original (non-extended) records are pushed.
    if (!skipDataset) await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  // Date part (YYYY-MM-DD) of the current UTC timestamp, shared by all rows of this call.
  const scrapedAt = new Date().toISOString().split(`T`)[0]

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg, ``)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, ` (data)`, `ON CONFLICT DO NOTHING`)
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, ` (price)`, `ON CONFLICT DO NOTHING`)
  }

  if (elasticClient) {
    // Indexing into Elasticsearch is currently disabled; previous implementation kept for reference.
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}
/**
 * Builds the column list of an INSERT statement from the keys of the first
 * object: each key wrapped in double quotes, joined by ", ".
 *
 * @param {object[]} objs - Non-empty list of rows; only the first is inspected.
 * @returns {string} e.g. `"pid", "name"`.
 */
function getColumns (objs) {
  const quotedNames = []
  for (const columnName of Object.keys(objs[0])) {
    quotedNames.push(`"${columnName}"`)
  }
  return quotedNames.join(`, `)
}
/**
 * Serializes rows into the inside of a multi-row SQL `VALUES (...)` clause.
 * Values within a row are joined by ", " and rows by "), (", so the caller
 * supplies the outermost parentheses.
 *
 * @param {object[]} objs - Rows to serialize.
 * @returns {string} e.g. `'a', 1), ('b', NULL`.
 */
function getValues (objs) {
  // Renders a single value as an SQL literal.
  const toSqlLiteral = (value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) {
      const escaped = value.replace(/'/g, `''`)
      return `'${escaped}'`
    }
    // convert to DB specific null
    if (value === null || typeof value === `undefined`) return `NULL`
    return value
  }

  const tuples = []
  for (const row of objs) {
    const literals = Object.values(row).map((value) => toSqlLiteral(value))
    tuples.push(literals.join(`, `))
  }
  return tuples.join(`), (`)
}
/**
 * Copies every actor input field whose key is entirely upper-case into
 * `process.env`.
 *
 * Fields with a null or undefined value are skipped: `process.env` stringifies
 * everything, so they would otherwise become the truthy strings "null" /
 * "undefined" and switch on features that only check whether a variable is set.
 *
 * Only the names of the copied keys are logged, as values may contain
 * credentials (e.g. database connection strings).
 *
 * @param {object|null|undefined} input - Actor input (without `mode`).
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}