
Axit (axit.cz) scraper
Deprecated
Pricing
Pay per usage
Go to Store


Axit (axit.cz) scraper
Deprecated
Scrapes product titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).
0.0 (0)
Pricing
Pay per usage
1
Total users
2
Monthly users
1
Last modified
3 years ago
Dockerfile
FROM apify/actor-node:16
COPY package.json ./
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional
COPY . ./
INPUT_SCHEMA.json
{ "title": "Axit (axit.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "TEST", "prefill": "TEST", "enum": [ "TEST", "FULL" ], "enumTitles": [ "TEST", "FULL" ] }, "APIFY_DONT_STORE_IN_DATASET": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Don't store in dataset", "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", "type": "boolean", "default": false, "editor": "checkbox" }, "PG_CONNECTION_STRING_NORMALIZED": { "title": "Postgres connection string for normalized data", "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", "type": "string", "editor": "textfield" }, "PG_DATA_TABLE": { "title": "Postgres table name for product data", "description": "Table name for storing product name, url, image, ...", "type": "string", "editor": "textfield" }, "PG_DATA_PRICE_TABLE": { "title": "Postgres table name for price data", "description": "Table name for storing price, original price, stock status, ...", "type": "string", "editor": "textfield" } }, "required": [ "mode" ]}
apify.json
{ "name": "axit-axit-cz-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
1import { Actor } from "apify3";2import { CheerioCrawler, createCheerioRouter } from "crawlee";3import { gotScraping } from "got-scraping";4import cheerio from "cheerio";5import { init, save, toNumberOrNull } from "./_utils/common.js";6
/** Request labels; each one selects the router handler that processes the request. */
const LABEL = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};

/** Run modes accepted through the actor input (`mode`). */
const MODE = {
  TEST: "TEST",
  FULL: "FULL",
};
/**
 * Seeds the crawler's request queue according to the run mode.
 *
 * - `MODE.FULL`: enqueues the homepage, labelled `LABEL.INDEX`.
 * - `MODE.TEST`: enqueues two fixed brand pages, labelled `LABEL.PRODUCTS`.
 * - anything else: enqueues nothing.
 *
 * @param {string} type - one of the `MODE` values
 * @param {{ addRequests: Function }} crawler - crawler whose queue is filled
 * @returns {Promise<void>}
 */
async function enqueueInitial(type, crawler) {
  switch (type) {
    case MODE.FULL: {
      const homepage = {
        url: `https://www.axit.cz`,
        userData: { label: LABEL.INDEX },
      };
      await crawler.addRequests([homepage]);
      break;
    }
    case MODE.TEST: {
      const brandRequests = [];
      for (const brand of [`shimano`, `crankbrothers`]) {
        brandRequests.push({
          url: `https://www.axit.cz/vyrobci/${brand}`,
          userData: { label: LABEL.PRODUCTS, brand },
        });
      }
      await crawler.addRequests(brandRequests);
      break;
    }
    default:
      break;
  }
}
/**
 * Extracts every product tile (`.products .item`) from a listing document and
 * passes the resulting records to `save()`.
 *
 * Each record has: pid, name, url, img, inStock, currentPrice, originalPrice, currency.
 * Tiles without a `.compare` element id are skipped with a warning, since that id is
 * the only product identifier read here.
 *
 * @param {Function} $ - Cheerio root of the listing (page or AJAX fragment)
 * @returns {Promise<void>}
 */
async function parseAndSaveProducts($) {
  const BASE_URL = `https://www.axit.cz`;
  // `2 999,00 Kč` -> `2999`: cut the decimal part first so its digits are not glued onto the amount
  const wholeAmount = (raw) => raw.split(`,`)[0].replace(/\D/g, ``);

  const products = [];
  // The callback is deliberately synchronous: `.each()` does not await, so an async
  // callback would turn any parsing error into an unhandled promise rejection.
  $(`.products .item`).each((i, el) => {
    const $link = $(`a.image`, el);
    const relUrl = $link.attr(`href`);

    // BEWARE: not every item has an id in its url, so the id comes from the compare widget
    const compareId = $(el).find(`.compare`).attr(`id`); // e.g. compare_add_172702
    if (!compareId) {
      console.warn(`[parseAndSaveProducts] Skipping item without product id: ${relUrl}`);
      return;
    }
    const pid = compareId.replace(`compare_add_`, ``); // compare_add_172702 -> 172702

    const priceRaw = $(el).find(`.price`).text().trim(); // `640,00 Kč`
    const priceOrigRaw = $(el).find(`.original-price`).text().trim(); // `3 990 Kč`, may be empty
    const imgRel = $(`a.image img`, el).attr(`src`);

    products.push({
      pid,
      name: $link.attr(`title`),
      url: `${BASE_URL}${relUrl}`,
      img: `${BASE_URL}${imgRel}`,
      inStock: $(el).find(`.availability.instock`).length > 0,
      currentPrice: toNumberOrNull(wholeAmount(priceRaw)),
      // Parsed like the current price so a `3 990,00 Kč` value does not become 399000
      originalPrice: toNumberOrNull(wholeAmount(priceOrigRaw)),
      currency: `CZK`,
    });
  });
  await save(products);
}
/**
 * Requests one page of a brand's product listing from the shop's AJAX endpoint
 * and returns the response body loaded into Cheerio.
 *
 * @param {object} params
 * @param {string} params.brandId - value of the page's `list_brand` input
 * @param {string} params.PHPSESSID - session id sent back as a cookie
 * @param {string} params.csrfToken - value of the page's `CSRFtoken` input
 * @param {number} params.page - page number (sent as `cpage`)
 * @param {string} [params.referer] - Referer header; defaults to the Shimano brand
 *   page, which was previously hard-coded for every brand
 * @returns {Promise<Function>} Cheerio root of the returned HTML fragment
 */
async function fetchProducts({
  brandId,
  PHPSESSID,
  csrfToken,
  page,
  referer = `https://www.axit.cz/vyrobci/shimano/`,
}) {
  const ajaxRes = await gotScraping({
    url: `https://www.axit.cz/ajax/products_content.php`,
    method: `POST`,
    headers: {
      "Content-Type": `application/x-www-form-urlencoded; charset=UTF-8`,
      "x-requested-with": `XMLHttpRequest`,
      referer,
      cookie: `PHPSESSID=${PHPSESSID}`,
      origin: `https://www.axit.cz`,
      pragma: `no-cache`,
      "cache-control": `no-cache`,
      authority: `www.axit.cz`,
      accept: `*/*`,
      "accept-language": `en-US,en;q=0.9,cs;q=0.8,sk;q=0.7`,
    },
    form: {
      list_brand: brandId,
      cpage: page, // = page number
      epage: 80, // = per page
      razeni: `p_name`, // = sort by name
      CSRFtoken: csrfToken,

      nextpage: 0,
      categoryid: 0,
      subcategoryid: 0,
      subsubcategoryid: 0,
      subsubsubcategoryid: 0,
      cphrase: ``,
      list_label: 0,
      base_products: ``,
      // eprice: 0 - 13150,
      // eprice2: 0 - 13150,
      initialize: 1,
    },
  });
  // The body is an HTML fragment rather than a full document
  return cheerio.load(ajaxRes.body);
}
const router = createCheerioRouter();

/**
 * Returns the PHPSESSID value from a `set-cookie` response header, or
 * `undefined` when the header is absent or carries no such cookie.
 * Accepts a single header string or an array of them.
 */
function extractPhpSessionId(setCookieHeader) {
  const cookies = [setCookieHeader ?? []].flat();
  for (const cookie of cookies) {
    // PHPSESSID=abc133; expires=...; path=/; secure -> PHPSESSID=abc133
    const pair = String(cookie).split(`;`)[0].trim();
    if (pair.startsWith(`PHPSESSID=`)) return pair.slice(`PHPSESSID=`.length);
  }
  return undefined;
}

// INDEX: collect every brand link from the header and enqueue its listing page.
router.addHandler(LABEL.INDEX, async function ({ $, crawler }) {
  const requests = [];
  $(`#header .maker a[href^="/vyrobci"]`).each((i, el) => {
    // /vyrobci/shimano/ -> shimano
    const slug = $(el).attr(`href`).replace(`/vyrobci/`, ``).replace(/\/$/, ``); // replace trailing slash
    requests.push({
      url: `https://www.axit.cz/vyrobci/${slug}/#razeni=p_name`,
      userData: { label: LABEL.PRODUCTS, brand: slug },
    });
  });
  await crawler.addRequests(requests);
});

// PRODUCTS: read the brand id, session id and CSRF token from the brand page,
// then page through the AJAX listing and save every product found.
router.addHandler(LABEL.PRODUCTS, async function ({ request, response, $ }) {
  const { userData } = request;

  const brandId = $(`input[name="list_brand"]`).attr(`value`);
  if (!brandId) {
    console.log(
      `No brandId found for ${userData.brand} – that does not mean anything, just FYI`
    );
    console.log(`Url: ${request.url}`);
    return;
  }

  // Looked up by cookie name so a missing header or a different first cookie
  // produces the explicit error below instead of a TypeError or a wrong value.
  const PHPSESSID = extractPhpSessionId(response.headers[`set-cookie`]);
  if (!PHPSESSID) throw new Error(`PHPSESSID not found`);

  const csrfToken = $(`input[name="CSRFtoken"]`).attr(`value`);
  if (!csrfToken) throw new Error(`CSRFtoken not found`);

  // Pagination – Axit does not show the last page, only that there are more pages
  let hasMorePages = true;
  let page = 0;
  while (hasMorePages) {
    const $ajax = await fetchProducts({
      brandId,
      PHPSESSID,
      csrfToken,
      page,
      // Brand page the listing belongs to; an extra key is harmless if fetchProducts does not read it
      referer: `https://www.axit.cz/vyrobci/${userData.brand}/`,
    });
    const itemsOnPage = $ajax(`.products .item`).length;
    console.log(`fetchProducts`, {
      brand: userData.brand,
      brandId,
      page,
      products: itemsOnPage,
    });
    await parseAndSaveProducts($ajax);

    const hasNextLink = $ajax(`.pagination [rel=next]`).length > 0;
    // An empty page ends the loop even if a "next" link is present, so a
    // misbehaving response cannot keep the crawler paging forever.
    hasMorePages = hasNextLink && itemsOnPage > 0;
    if (hasMorePages) page++;
  }
});
// Entry point: reads the actor input, initialises shared storage clients,
// seeds the queue for the selected mode and runs the crawler to completion.
void Actor.main(async function runAxitScraper() {
  // `mode` picks the seed set; every other input field is handed to init()
  const { mode = MODE.FULL, ...rest } = (await Actor.getInput()) ?? {};
  await init({ actorNameOverride: `axit-cz` }, rest);

  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
package.json
{ "name": "axit-axit-cz-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify3": "npm:apify@^3.0.2", "crawlee": "*", "got-scraping": "*", "cheerio": "*", "pg": "*", "pg-connection-string": "*", "dotenv": "*", "find-config": "*", "@elastic/elasticsearch": "*", "filenamify": "*" }, "apify": { "title": "Axit (axit.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "axit-axit-cz-scraper", "title": "Axit (axit.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Axit (axit.cz) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "pid", "name", "url", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Pid", "field": "pid", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'2import os from "os"3import path from "path"4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals5import fs from "fs"6import pg from "pg"7import pgConnectionString from 'pg-connection-string'8import { config } from 'dotenv'9import findConfig from "find-config"10import { Client as ElasticClient } from "@elastic/elasticsearch"11import filenamify from 'filenamify'12import { Dataset } from 'crawlee'13
// Load environment variables from the `.env` file located by find-config
const dotenvPath = findConfig(`.env`)
config({ path: dotenvPath })

// Single Elasticsearch index used by init() when enforcing the price mappings
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every saved record; the start time is captured once at module load
const globalLogsProps = { __NODE_STARTED: new Date().toISOString() }
// Module-level state set by init() and read by save()
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Connects to Postgres with the given connection string and verifies that all
 * required tables exist in the `public` schema. The connection is closed again
 * if the check fails, so a failed init does not leave it open.
 *
 * @param {string} connectionString
 * @param {string[]} requiredTables
 * @returns {Promise<object>} connected pg client
 * @throws {Error} when a required table is missing
 */
async function connectPgAndRequireTables (connectionString, requiredTables) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()
  try {
    const { rows } = await client.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)
    const existingTables = new Set(rows.map((row) => row.table_name))
    for (const table of requiredTables) {
      if (!existingTables.has(table)) {
        throw new Error(`Table ${table} does not exist in database ${pgConfig.database}`)
      }
    }
  } catch (err) {
    // Closing is best-effort; the validation/query error is the one worth reporting
    await client.end().catch(() => {})
    throw err
  }
  return client
}

/**
 * Prepares module state for save(): copies upper-case input keys into
 * `process.env`, resolves the actor name, and sets up the Elasticsearch and
 * Postgres clients when their environment variables are present.
 *
 * @param {object} [options]
 * @param {string} [options.actorNameOverride] - name stored with every record
 * @param {object} [restInput] - actor input; upper-case keys become env variables
 * @returns {Promise<void>}
 * @throws {Error} when a configured Postgres table does not exist
 */
export async function init ({ actorNameOverride } = {}, restInput) {
  parseEnvFromInput(restInput)

  // Baseline for every platform; previously the name was only set on macOS or on Apify
  actorName = actorNameOverride ?? actorName

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    try {
      const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
        .split(` `)[1]
        .trim()
        .replace(`refs/heads/`, ``)
      const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
      globalLogsProps.__GIT_COMMIT = gitCommit.substring(0, 7)
    } catch (err) {
      // The commit hash is only log metadata; a detached HEAD or missing ref file must not abort the run
      console.warn(`[init] Could not read git commit: ${err.message}`)
    }
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // Creates the index if needed and forces the price fields to `float`
    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Still non-fatal, as before, but no longer silent
        console.warn(`Elastic index ${elasticIndexName} mapping could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPgAndRequireTables(
      process.env.PG_CONNECTION_STRING,
      [process.env.PG_DATA_TABLE]
    )
    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPgAndRequireTables(
      process.env.PG_CONNECTION_STRING_NORMALIZED,
      [process.env.PG_DATA_TABLE, process.env.PG_DATA_PRICE_TABLE]
    )
    // TODO: Handle pgClient closing
  }
}
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a protocol-independent unique key for a URL: the SHA-256 hex digest
 * of the URL without its leading `scheme://`.
 *
 * Only a leading scheme is removed, so a `://` later in the URL (for example
 * inside a query parameter) no longer truncates the hashed text, and a URL
 * without any scheme is hashed as-is.
 *
 * @param {string} url
 * @returns {string} 64-character hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
/**
 * Resolves once the given point in time has been reached; resolves
 * immediately when it is already in the past (or is an invalid date).
 *
 * The wait is split into chunks because Node's `setTimeout` treats delays
 * above 2^31-1 ms as 1 ms, which would end a long sleep almost instantly.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const maxTimeoutMs = 2 ** 31 - 1
  let remainingMs = datetime - new Date()
  while (remainingMs > 0) {
    const delayMs = Math.min(remainingMs, maxTimeoutMs)
    await new Promise((resolve) => {
      setTimeout(resolve, delayMs)
    })
    // Re-check the clock: covers both the next chunk and a timer that fired early
    remainingMs = datetime - new Date()
  }
}
/**
 * Parses the whole-unit amount out of a price string such as `2 999,00 Kč`.
 *
 * A trailing separator followed by exactly two digits is treated as the
 * decimal part and dropped; every other `,`/`.` is treated as a thousands
 * separator. Fixes the missing `else` that made the original always take the
 * "no decimals" path (`640,00` -> 64000).
 *
 * @param {string} string - raw price text
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text has no digits; `currency` is not detected and is always undefined
 */
export function parsePrice (string) {
  const currency = undefined
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  // Keep everything before the decimal separator, or the whole text when there is none
  const integerPart = decimals ? noText.slice(0, decimals.index) : noText
  // Strip thousands separators before parsing; parseInt would stop at the first one
  const amount = Number.parseInt(integerPart.replace(/\D/g, ``), 10)
  return { amount, currency }
}
/**
 * Converts a value to a number, returning null when there is nothing numeric
 * to convert: undefined, null, empty or whitespace-only strings, and anything
 * `Number()` turns into NaN.
 *
 * @param {*} str
 * @returns {number|null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined || str === null) return null
  // Number(`  `) is 0, which would report a blank price as zero
  if (typeof str === `string` && str.trim() === ``) return null
  const num = Number(str)
  return Number.isNaN(num) ? null : num
}
/**
 * Interprets an environment variable as a boolean flag. Environment values are
 * always strings, so `false` assigned to `process.env` arrives as the truthy
 * string "false"; this treats the usual "off" spellings as disabled.
 */
function isEnvFlagEnabled (value) {
  if (value === undefined || value === null) return false
  const normalized = String(value).trim().toLowerCase()
  return ![``, `0`, `false`, `no`, `off`, `undefined`, `null`].includes(normalized)
}

/**
 * Runs one multi-row INSERT built with getColumns()/getValues() and logs the
 * affected row count. Unique-constraint violations are downgraded to a warning.
 */
async function insertRows (client, table, rows, logPrefix, ignoreConflicts = false) {
  const queryString = `
    INSERT INTO public."${table}" (${getColumns(rows)})
    VALUES (${getValues(rows)})
    ${ignoreConflicts ? `ON CONFLICT DO NOTHING` : ``}
  `
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`${logPrefix}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every sink configured by init():
 * - the Apify default dataset (when running on Apify and not disabled by
 *   APIFY_DONT_STORE_IN_DATASET),
 * - JSON files under `<actorName>.storage/data` (macOS only),
 * - the Postgres table PG_DATA_TABLE (when `pgClient` is set),
 * - the normalized PG_DATA_TABLE / PG_DATA_PRICE_TABLE pair (when
 *   `pgClientNormalized` is set).
 * Elasticsearch indexing is currently not performed even when a client exists.
 *
 * @param {object|object[]} objs - one record or an array of records
 * @returns {Promise<void>}
 * @throws rethrows dataset and database errors other than unique-constraint violations
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return

  const isAtApify = Boolean(process.env.APIFY_IS_AT_HOME)

  // Records enriched with run metadata; used for the local JSON files
  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
    }
    if (isAtApify) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // One awaited batch push of the plain records, so failures surface and
  // nothing is left in flight when the run ends
  if (isAtApify && !isEnvFlagEnabled(process.env.APIFY_DONT_STORE_IN_DATASET)) {
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    const dataDir = path.join(process.cwd(), `${actorName}.storage`, `data`)
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = objExtended.id ?? objExtended.pid
      const dataFilePath = path.join(dataDir, `${filenamify(id)}.json`)
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  const scrapedAt = new Date().toISOString().split(`T`)[0] // date part only

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg, `[save] saved to database`)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))
    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))
    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, `[save] saved to database (data)`, true)
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, `[save] saved to database (price)`, true)
  }
}
/**
 * Builds the column list of an INSERT statement from the keys of the first
 * row: each key wrapped in double quotes, joined by `, `.
 *
 * @param {object[]} objs - rows to insert; only the first one is inspected
 * @returns {string}
 */
function getColumns (objs) {
  const [firstRow] = objs
  const quotedNames = []
  for (const columnName of Object.keys(firstRow)) {
    quotedNames.push(`"${columnName}"`)
  }
  return quotedNames.join(`, `)
}
/**
 * Renders rows as the value part of a multi-row INSERT: values of one row are
 * joined by `, ` and rows by `), (` (the caller supplies the outer parentheses).
 *
 * - strings are single-quoted with embedded quotes doubled,
 * - null/undefined and non-finite numbers (NaN, Infinity) become NULL,
 * - Dates become quoted ISO strings (NULL when invalid),
 * - other objects become quoted JSON,
 * - everything else (finite numbers, booleans) is emitted as-is.
 *
 * @param {object[]} objs
 * @returns {string}
 */
function getValues (objs) {
  // escape strings to prevent SQL injection
  const quote = (text) => `'${text.replace(/'/g, `''`)}'`

  const toSqlLiteral = (value) => {
    // convert to DB specific null
    if (value === undefined || value === null) return `NULL`
    if (typeof value === `string`) return quote(value)
    // A bare NaN/Infinity token is not valid SQL
    if (typeof value === `number`) return Number.isFinite(value) ? value : `NULL`
    if (value instanceof Date) return Number.isNaN(value.getTime()) ? `NULL` : quote(value.toISOString())
    // Without this, objects would be emitted as the unquoted text [object Object]
    if (typeof value === `object`) return quote(JSON.stringify(value))
    return value
  }

  return objs
    .map((row) => Object.values(row).map(toSqlLiteral).join(`, `))
    .join(`), (`)
}
/**
 * Copies input fields whose key is entirely upper-case into `process.env`.
 *
 * Values of `false`, `null` and `undefined` are skipped: `process.env` stores
 * strings only, so they would arrive as the truthy strings "false"/"null"/
 * "undefined" and switch on features that are checked by truthiness.
 * Only the key names are logged, because values may contain credentials
 * such as database connection strings.
 *
 * @param {object} [input] - actor input; nullish input is treated as empty
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null || value === false) continue
    env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}