Fitanu (fitanu.com) scraper
Deprecated
Pricing
Pay per usage
Go to Store
Fitanu (fitanu.com) scraper
Deprecated
Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).
0.0 (0)
Pricing
Pay per usage
1
Total users
3
Monthly users
2
Last modified
3 years ago
Dockerfile
1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Fitanu (fitanu.com) scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "mode": {
8 "title": "Mode",
9 "description": "",
10 "type": "string",
11 "editor": "select",
12 "default": "TEST",
13 "prefill": "TEST",
14 "enum": [
15 "TEST",
16 "FULL"
17 ],
18 "enumTitles": [
19 "TEST",
20 "FULL"
21 ]
22 },
23 "APIFY_DONT_STORE_IN_DATASET": {
24 "sectionCaption": "Advanced",
25 "sectionDescription": "Advanced options, use only if you know what you're doing.",
26 "title": "Don't store in dataset",
27 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
28 "type": "boolean",
29 "default": false,
30 "editor": "checkbox"
31 },
32 "PG_CONNECTION_STRING_NORMALIZED": {
33 "title": "Postgres connection string for normalized data",
34 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
35 "type": "string",
36 "editor": "textfield"
37 },
38 "PG_DATA_TABLE": {
39 "title": "Postgres table name for product data",
40 "description": "Table name for storing product name, url, image, ...",
41 "type": "string",
42 "editor": "textfield"
43 },
44 "PG_DATA_PRICE_TABLE": {
45 "title": "Postgres table name for price data",
46 "description": "Table name for storing price, original price, stock status, ...",
47 "type": "string",
48 "editor": "textfield"
49 }
50 },
51 "required": [
52 "mode"
53 ]
54}
apify.json
1{
2 "name": "fitanu-fitanu-com-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 1024
10 }
11}
main.js
1import { Actor } from "apify3";
2import { CheerioCrawler, createCheerioRouter } from "crawlee";
3import { init, save, toNumberOrNull } from "./_utils/common.js";
4
5const LABELS = {
6 INDEX: `INDEX`,
7 PRODUCTS: `PRODUCTS`,
8};
9
10var MODE;
11
12(function (MODE) {
13 MODE["TEST"] = "TEST";
14 MODE["FULL"] = "FULL";
15})(MODE || (MODE = {}));
16
17const BASE_URL = `https://fitanu.com`;
18
19async function enqueueInitial(mode, crawler) {
20 if (mode === MODE.FULL) {
21 await crawler.addRequests([
22 {
23 userData: { label: LABELS.INDEX },
24 url: `https://fitanu.com/cz/nase-znacky`,
25 },
26 ]);
27 } else if (mode === MODE.TEST) {
28 await crawler.addRequests([
29 {
30 userData: { label: LABELS.PRODUCTS },
31 url: `https://fitanu.com/cz/produkty/zna%C4%8Dka/nike`,
32 },
33 ]);
34 }
35}
36
37const router = createCheerioRouter();
38
39router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
40 await enqueueLinks({
41 selector: `[href^="${BASE_URL}/cz/produkty/značka/"]`,
42 userData: { label: LABELS.PRODUCTS },
43 });
44});
45
46router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
47 log.info(`[PRODUCTS] ${request.url}`);
48 if (!request.url.match(/\/page\/\d+$/)) {
49 // on first page, handle navigation
50 const totalPages = Number($(`.pages-items .page.last`).first().text()); // e.g. 6
51 for (let i = 2; i <= totalPages; i++) {
52 // skip first page, that is already handled
53 void crawler.addRequests([
54 {
55 userData: { label: LABELS.PRODUCTS },
56 url: `${request.url}/page/${i}`,
57 },
58 ]);
59 }
60 }
61
62 const products = [];
63 // Some .product-item are .category-description, not real products,
64 // that's why we need to filter just ones with data-product-sku
65 $(`.products-grid .product-item[data-product-sku]`).each((i, el) => {
66 const pid = $(el).attr(`data-product-sku`); // e.g. C92800472927
67 const url = $(el).find(`a.product-item-photo`).attr(`href`); // absolute url
68 const img = $(el).find(`img.product-image-photo`).attr(`src`); // absolute url
69 const name = $(el)
70 .find(`.product-item-name`)
71 .text()
72 .replace(/\n/g, ` `) // replace new lines with space
73 .replace(/\s+/g, ` `); // replace multiple spaces with single space
74 const price = $(el)
75 .find(`[data-price-type="finalPrice"]`)
76 .attr(`data-price-amount`); // e.g. 1099
77 const priceOrig = $(el)
78 .find(`[data-price-type="oldPrice"]`)
79 .attr(`data-price-amount`); // e.g. 1299
80 const inStock = undefined; // not available
81 const product = {
82 pid,
83 name,
84 url,
85 img,
86 inStock,
87 currentPrice: toNumberOrNull(price),
88 originalPrice: toNumberOrNull(priceOrig),
89 currency: `CZK`,
90 };
91 products.push(product);
92 });
93 await save(products);
94});
95
96void Actor.main(async () => {
97 const input = await Actor.getInput();
98 const { mode = MODE.FULL, ...rest } = input ?? {};
99 await init({ actorNameOverride: `fitanu` }, rest);
100 const crawler = new CheerioCrawler({ requestHandler: router });
101 await enqueueInitial(mode, crawler);
102 await crawler.run();
103});
package.json
1{
2 "name": "fitanu-fitanu-com-scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify3": "npm:apify@^3.0.2",
11 "crawlee": "*",
12 "pg": "*",
13 "pg-connection-string": "*",
14 "dotenv": "*",
15 "find-config": "*",
16 "@elastic/elasticsearch": "*",
17 "filenamify": "*"
18 },
19 "apify": {
20 "title": "Fitanu (fitanu.com) scraper",
21 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
22 "isPublic": true,
23 "isDeprecated": false,
24 "isAnonymouslyRunnable": true,
25 "notice": "",
26 "pictureUrl": "",
27 "seoTitle": "",
28 "seoDescription": "",
29 "categories": [
30 "ECOMMERCE"
31 ]
32 }
33}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "fitanu-fitanu-com-scraper",
4 "title": "Fitanu (fitanu.com) scraper",
5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Fitanu (fitanu.com) scraper",
11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "pid",
19 "name",
20 "url",
21 "img",
22 "inStock",
23 "currentPrice",
24 "originalPrice",
25 "currency"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "columns": [
31 {
32 "label": "Pid",
33 "field": "pid",
34 "format": "text"
35 },
36 {
37 "label": "Name",
38 "field": "name",
39 "format": "text"
40 },
41 {
42 "label": "Url",
43 "field": "url",
44 "format": "link"
45 },
46 {
47 "label": "Img",
48 "field": "img",
49 "format": "image"
50 },
51 {
52 "label": "In Stock",
53 "field": "inStock",
54 "format": "boolean"
55 },
56 {
57 "label": "Current Price",
58 "field": "currentPrice",
59 "format": "number"
60 },
61 {
62 "label": "Original Price",
63 "field": "originalPrice",
64 "format": "number"
65 },
66 {
67 "label": "Currency",
68 "field": "currency",
69 "format": "text"
70 }
71 ]
72 }
73 }
74 }
75 }
76 }
77}
_utils/common.js
1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Dataset } from 'crawlee'
13
14config({ path: findConfig(`.env`) })
15
16const elasticIndexName = `actors-monorepo-shops`
17
18const globalLogsProps = {
19 __NODE_STARTED: new Date().toISOString(),
20}
21
22let actorName
23let pgClient
24let pgClientNormalized
25let elasticClient
26export async function init ({ actorNameOverride }, restInput) {
27 parseEnvFromInput(restInput)
28
29 if (os.platform() === `darwin`) {
30 const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
31 const basename = path.basename(filePath) // foo.ts
32 actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
33 const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
34 .split(` `)[1]
35 .trim()
36 .replace(`refs/heads/`, ``)
37 const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
38 const gitCommitShort = gitCommit.substring(0, 7)
39 globalLogsProps.__GIT_COMMIT = gitCommitShort
40 }
41
42 if (process.env.APIFY_IS_AT_HOME) {
43 actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
44 }
45
46 /* ELASTIC */
47 /* ======= */
48 if (process.env.ELASTIC_CLOUD_ID) {
49 elasticClient = new ElasticClient({
50 cloud: { id: process.env.ELASTIC_CLOUD_ID },
51 auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
52 })
53
54 // const mapping = await elasticClient.indices.getMapping({ index: actorName })
55
56 // eslint-disable-next-line no-inner-declarations
57 async function enforceIndexMapping () {
58 const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
59 if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
60 await elasticClient.indices.putMapping({
61 index: elasticIndexName,
62 body: {
63 properties: {
64 _discount: { type: `float` },
65 originalPrice: { type: `float` },
66 currentPrice: { type: `float` },
67 },
68 },
69 })
70 }
71
72 try {
73 await enforceIndexMapping()
74 } catch (err) {
75 if (err.message.includes(`cannot be changed from type`)) {
76 console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
77 await elasticClient.indices.delete({ index: elasticIndexName })
78 await enforceIndexMapping()
79 }
80 }
81 }
82
83 /* POSTGRESQL */
84 /* ========== */
85 if (process.env.PG_CONNECTION_STRING) {
86 const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
87 // const pgPool = new pg.Pool(pgConfig)
88
89 pgClient = new pg.Client(pgConfig)
90 await pgClient.connect()
91
92 // Check if table exists and have proper columns
93 const { rows: tables } = await pgClient.query(`
94 SELECT table_name
95 FROM information_schema.tables
96 WHERE table_schema = 'public'
97 `)
98
99 // eslint-disable-next-line camelcase
100 const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
101 if (!tableExists) {
102 throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
103 }
104
105 // TODO: Handle pgClient closing
106 }
107
108 if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
109 const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)
110
111 pgClientNormalized = new pg.Client(pgConfig)
112 await pgClientNormalized.connect()
113
114 // Check if table exists and have proper columns
115 const { rows: tables } = await pgClientNormalized.query(`
116 SELECT table_name
117 FROM information_schema.tables
118 WHERE table_schema = 'public'
119 `)
120
121 // eslint-disable-next-line camelcase
122 const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
123 // eslint-disable-next-line camelcase
124 const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
125 if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
126 if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)
127
128 // TODO: Handle pgClient closing
129 }
130}
131
132// inspired by @drobnikj
133// TODO: Similar, but less obfuscated for easier debugging
134export const createUniqueKeyFromUrl = (url) => {
135 const hash = createHash(`sha256`)
136 const cleanUrl = url.split(`://`)[1] // Remove protocol
137 hash.update(cleanUrl)
138 return hash.digest(`hex`)
139}
140
141/**
142 *
143 * @param {Date} datetime
144 * @return {Promise<void>}
145 */
146export const sleepUntil = async (datetime) => {
147 const now = new Date()
148 const difference = datetime - now
149 if (difference > 0) {
150 return new Promise((resolve) => {
151 setTimeout(resolve, difference)
152 })
153 }
154 return Promise.resolve()
155}
156
157export function parsePrice (string) {
158 let amount, currency
159 const noText = string.replace(/[^\d,.]/g, ``)
160 const decimals = noText.match(/([,.])(\d{2})$/)
161 if (decimals) {
162 const decimalSeparator = decimals[1]
163 // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
164 const decimalAmount = decimals[2]
165 amount = parseInt(noText.split(decimalSeparator)[0])
166 } {
167 const justNumbers = noText.replace(/[,.]/g, ``)
168 amount = parseInt(justNumbers)
169 }
170 return { amount, currency }
171}
172
173export function toNumberOrNull (str) {
174 // TODO: Handle better, but only after adding test
175 if (str === undefined) return null
176 if (str === null) return null
177 if (str === ``) return null
178 const num = Number(str)
179 if (Number.isNaN(num)) return null
180 return num
181}
182
183export async function save (objs) {
184 if (!Array.isArray(objs)) objs = [objs]
185 if (objs.length === 0) return
186
187 const objsExtended = objs.map((obj) => {
188 const objExtended = {
189 ...obj,
190 actorName,
191 ...globalLogsProps,
192 // __NODE_VERSION: global.process.versions.node,
193 // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
194 }
195 // if run on Apify
196 if (process.env.APIFY_IS_AT_HOME) {
197 objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
198 objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
199 objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
200 objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
201 objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
202 if (!process.env.APIFY_DONT_STORE_IN_DATASET) void Dataset.pushData(obj)
203 }
204 return objExtended
205 })
206 // if runs on local machine (MacOS)
207 if (os.platform() === `darwin`) {
208 const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
209 const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
210 if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
211 const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
212 if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
213 for (const objExtended of objsExtended) {
214 const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
215 const fileName = `${filenamify(id)}.json`
216 const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
217 fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
218 }
219 }
220
221 if (pgClient) {
222 const objsPg = objs.map((obj) => ({
223 ...obj,
224 // TODO: This is becoming not nice, and not clear
225 shop: actorName,
226 scrapedAt: new Date().toISOString().split(`T`)[0],
227 }))
228
229 const columns = getColumns(objsPg)
230 const values = getValues(objsPg)
231 const queryString = `
232 INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
233 VALUES (${values})
234 `
235 try {
236 const { rowCount } = await pgClient.query(queryString)
237 console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
238 } catch (err) {
239 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
240 else throw err
241 }
242 }
243
244 // Only make sense for HlidacShopu
245 if (pgClientNormalized) {
246 const objsPgData = objs.map((obj) => ({
247 shop: actorName,
248 pid: obj.pid,
249 name: obj.name,
250 url: obj.url,
251 img: obj.img,
252 }))
253
254 const objsPgDataPrice = objs.map((obj) => ({
255 shop: actorName,
256 pid: obj.pid,
257 scrapedAt: new Date().toISOString().split(`T`)[0],
258 currentPrice: obj.currentPrice,
259 originalPrice: obj.originalPrice,
260 inStock: obj.inStock,
261 }))
262
263 const queryString = `
264 INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
265 VALUES (${getValues(objsPgData)})
266 ON CONFLICT DO NOTHING
267 `
268 try {
269 const { rowCount } = await pgClientNormalized.query(queryString)
270 console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
271 } catch (err) {
272 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
273 else throw err
274 }
275
276 const queryStringPrice = `
277 INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
278 VALUES (${getValues(objsPgDataPrice)})
279 ON CONFLICT DO NOTHING
280 `
281 try {
282 const { rowCount } = await pgClientNormalized.query(queryStringPrice)
283 console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
284 } catch (err) {
285 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
286 else throw err
287 }
288 }
289
290 if (elasticClient) {
291 // .index creates or updates the document
292 // .create creates a new document if it doesn't exist, 409 if it does
293 // try {
294 // const res = await elasticClient.index({
295 // index: `actors-monorepo-shops`, // TODO: Consider using actorName
296 // id, // foo-bar
297 // document: objExtended, // {...}
298 // })
299 // } catch (err) {
300 // // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
301 // if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
302 // else throw err
303 // }
304 }
305}
306
307function getColumns (objs) {
308 return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
309}
310
311function getValues (objs) {
312 return objs.map(objPg => Object.values(objPg).map((value) => {
313 // escape strings to prevent SQL injection
314 if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
315 // convert to DB specific null
316 if (typeof value === `undefined` || value === null) return `NULL`
317 return value
318 }).join(`, `)).join(`), (`)
319}
320
321export function parseEnvFromInput (input) {
322 const env = {}
323 for (const key in input) {
324 if (key === key.toUpperCase()) env[key] = input[key]
325 }
326 console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
327 Object.assign(process.env, env)
328}