FDF Bike Shop (fdfbikeshop.cz) scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative ActorsFDF Bike Shop (fdfbikeshop.cz) scraper
strajk/fdf-bike-shop-fdfbikeshop-cz-scraper
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "mode": {
8 "title": "Mode",
9 "description": "",
10 "type": "string",
11 "editor": "select",
12 "default": "TEST",
13 "prefill": "TEST",
14 "enum": [
15 "TEST",
16 "FULL"
17 ],
18 "enumTitles": [
19 "TEST",
20 "FULL"
21 ]
22 },
23 "APIFY_DONT_STORE_IN_DATASET": {
24 "sectionCaption": "Advanced",
25 "sectionDescription": "Advanced options, use only if you know what you're doing.",
26 "title": "Don't store in dataset",
27 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
28 "type": "boolean",
29 "default": false,
30 "editor": "checkbox"
31 },
32 "PG_CONNECTION_STRING_NORMALIZED": {
33 "title": "Postgres connection string for normalized data",
34 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
35 "type": "string",
36 "editor": "textfield"
37 },
38 "PG_DATA_TABLE": {
39 "title": "Postgres table name for product data",
40 "description": "Table name for storing product name, url, image, ...",
41 "type": "string",
42 "editor": "textfield"
43 },
44 "PG_DATA_PRICE_TABLE": {
45 "title": "Postgres table name for price data",
46 "description": "Table name for storing price, original price, stock status, ...",
47 "type": "string",
48 "editor": "textfield"
49 }
50 },
51 "required": [
52 "mode"
53 ]
54}
apify.json
1{
2 "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 1024
10 }
11}
main.js
1import { Actor } from "apify3";
2import { CheerioCrawler, createCheerioRouter } from "crawlee";
3import { init, save, toNumberOrNull } from "./_utils/common.js";
4
5const LABELS = {
6 INDEX: `INDEX`,
7 PRODUCTS: `PRODUCTS`,
8};
9
10var MODE;
11
12(function (MODE) {
13 MODE["TEST"] = "TEST";
14 MODE["FULL"] = "FULL";
15})(MODE || (MODE = {}));
16
17const baseUrl = `https://www.fdfbikeshop.cz`;
18
19async function enqueueInitial(mode, crawler) {
20 if (mode === MODE.FULL) {
21 await crawler.addRequests([
22 {
23 userData: { label: LABELS.INDEX },
24 url: `${baseUrl}/sitemap.xml`,
25 },
26 ]);
27 } else if (mode === MODE.TEST) {
28 await crawler.addRequests([
29 {
30 userData: { label: LABELS.PRODUCTS },
31 url: `${baseUrl}/znacka/crankbrothers/`,
32 },
33 ]);
34 await crawler.addRequests([
35 {
36 userData: { label: LABELS.PRODUCTS },
37 url: `${baseUrl}/znacka/dt-swiss/`,
38 },
39 ]);
40 await crawler.addRequests([
41 {
42 userData: { label: LABELS.PRODUCTS },
43 url: `${baseUrl}/znacka/galfer/`,
44 },
45 ]);
46 }
47}
48
49const router = createCheerioRouter();
50
51router.addHandler(LABELS.INDEX, async ({ $, crawler }) => {
52 const urls = $(`url loc`)
53 .map((i, x) => $(x).text())
54 .toArray();
55 // https://www.fdfbikeshop.cz/znacka/100/ » brands, only type of urls we are interested in
56 // https://www.fdfbikeshop.cz/nahradni-dily-loziska/
57 // https://www.fdfbikeshop.cz/kazety-12sp/
58 // @ts-ignore
59 const brandsUrls = urls.filter((url) => url?.includes(`/znacka/`));
60 const requests = [];
61 for (const url of brandsUrls) {
62 requests.push({
63 userData: { label: LABELS.PRODUCTS },
64 // @ts-ignore
65 url,
66 });
67 }
68 await crawler.addRequests(requests);
69});
70
71router.addHandler(LABELS.PRODUCTS, async ({ $, crawler, request }) => {
72 console.log(`Processing ${request.url}`);
73
74 // on first page » handle pagination
75 if (!request.url.match(/\/strana-\d+\//)) {
76 // /znacka/fox/strana-2/
77 let totalPages;
78 try {
79 totalPages = parseInt(
80 // $('.pagination-description-pages:eq(0) strong:eq(1)').text() » this does not work
81 $(`.pagination-description-pages`).eq(0).find(`strong`).eq(1).text()
82 );
83 } catch (e) {
84 totalPages = 1;
85 }
86 if (totalPages > 1) {
87 console.log(`> …found ${totalPages} pages, enqueuing them…`);
88 }
89 for (let i = 2; i <= totalPages; i++) {
90 // skip first page, that is already handled
91 void crawler.addRequests([
92 {
93 userData: { label: LABELS.PRODUCTS },
94 url: `${request.url}strana-${i}/`,
95 },
96 ]);
97 }
98 }
99
100 const products = [];
101 $(`#category-products-wrapper .product`).each((i, el) => {
102 const id = $(el).attr(`data-micro-product-id`); // e.g. 20080005-40200770
103
104 const relUrl = $(el).find(`a.p-name`).attr(`href`);
105 const url = `${baseUrl}${relUrl}`;
106 const title = $(el).find(`[data-micro='name']`).text();
107
108 const priceRaw = $(el).find(`.p-det-main-price`).text().trim(); // e.g. `39 990 Kč`
109 const price = priceRaw.replace(/[^\d,]/g, ``); // keep comma as it's decimal separator
110
111 const priceOrigRaw = $(el)
112 .find(`.p-standard-price span.line`)
113 .text()
114 .trim();
115 const priceOrig = priceOrigRaw.replace(/[^\d,]/g, ``);
116
117 const img = $(el).find(`img[data-micro='image']`).attr(`src`);
118
119 let inStock = false;
120 try {
121 inStock = $(el).find(`.p-cat-availability`).text().includes(`Skladem`);
122 } catch (err) {
123 // ignore
124 }
125
126 const product = {
127 pid: id,
128 name: title,
129 url: url,
130 img: img,
131 inStock,
132 currentPrice: toNumberOrNull(price),
133 originalPrice: toNumberOrNull(priceOrig),
134 currency: `CZK`,
135 };
136 products.push(product);
137 });
138 void save(products);
139});
140
141void Actor.main(async () => {
142 const input = await Actor.getInput();
143 const { mode = MODE.FULL, ...rest } = input ?? {};
144 await init({ actorNameOverride: `fdfbikeshop-cz` }, rest);
145 const crawler = new CheerioCrawler({ requestHandler: router });
146 await enqueueInitial(mode, crawler);
147 await crawler.run();
148});
package.json
1{
2 "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify3": "npm:apify@^3.0.2",
11 "crawlee": "*",
12 "pg": "*",
13 "pg-connection-string": "*",
14 "dotenv": "*",
15 "find-config": "*",
16 "@elastic/elasticsearch": "*",
17 "filenamify": "*"
18 },
19 "apify": {
20 "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
21 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
22 "isPublic": true,
23 "isDeprecated": false,
24 "isAnonymouslyRunnable": true,
25 "notice": "",
26 "pictureUrl": "",
27 "seoTitle": "",
28 "seoDescription": "",
29 "categories": [
30 "ECOMMERCE"
31 ]
32 }
33}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
4 "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "pid",
19 "name",
20 "url",
21 "img",
22 "inStock",
23 "currentPrice",
24 "originalPrice",
25 "currency"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "columns": [
31 {
32 "label": "Pid",
33 "field": "pid",
34 "format": "text"
35 },
36 {
37 "label": "Name",
38 "field": "name",
39 "format": "text"
40 },
41 {
42 "label": "Url",
43 "field": "url",
44 "format": "link"
45 },
46 {
47 "label": "Img",
48 "field": "img",
49 "format": "image"
50 },
51 {
52 "label": "In Stock",
53 "field": "inStock",
54 "format": "boolean"
55 },
56 {
57 "label": "Current Price",
58 "field": "currentPrice",
59 "format": "number"
60 },
61 {
62 "label": "Original Price",
63 "field": "originalPrice",
64 "format": "number"
65 },
66 {
67 "label": "Currency",
68 "field": "currency",
69 "format": "text"
70 }
71 ]
72 }
73 }
74 }
75 }
76 }
77}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Dataset } from 'crawlee'
13
14config({ path: findConfig(`.env`) })
15
16const elasticIndexName = `actors-monorepo-shops`
17
18const globalLogsProps = {
19 __NODE_STARTED: new Date().toISOString(),
20}
21
22let actorName
23let pgClient
24let pgClientNormalized
25let elasticClient
26export async function init ({ actorNameOverride }, restInput) {
27 parseEnvFromInput(restInput)
28
29 if (os.platform() === `darwin`) {
30 const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
31 const basename = path.basename(filePath) // foo.ts
32 actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
33 const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
34 .split(` `)[1]
35 .trim()
36 .replace(`refs/heads/`, ``)
37 const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
38 const gitCommitShort = gitCommit.substring(0, 7)
39 globalLogsProps.__GIT_COMMIT = gitCommitShort
40 }
41
42 if (process.env.APIFY_IS_AT_HOME) {
43 actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
44 }
45
46 /* ELASTIC */
47 /* ======= */
48 if (process.env.ELASTIC_CLOUD_ID) {
49 elasticClient = new ElasticClient({
50 cloud: { id: process.env.ELASTIC_CLOUD_ID },
51 auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
52 })
53
54 // const mapping = await elasticClient.indices.getMapping({ index: actorName })
55
56 // eslint-disable-next-line no-inner-declarations
57 async function enforceIndexMapping () {
58 const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
59 if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
60 await elasticClient.indices.putMapping({
61 index: elasticIndexName,
62 body: {
63 properties: {
64 _discount: { type: `float` },
65 originalPrice: { type: `float` },
66 currentPrice: { type: `float` },
67 },
68 },
69 })
70 }
71
72 try {
73 await enforceIndexMapping()
74 } catch (err) {
75 if (err.message.includes(`cannot be changed from type`)) {
76 console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
77 await elasticClient.indices.delete({ index: elasticIndexName })
78 await enforceIndexMapping()
79 }
80 }
81 }
82
83 /* POSTGRESQL */
84 /* ========== */
85 if (process.env.PG_CONNECTION_STRING) {
86 const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
87 // const pgPool = new pg.Pool(pgConfig)
88
89 pgClient = new pg.Client(pgConfig)
90 await pgClient.connect()
91
92 // Check if table exists and have proper columns
93 const { rows: tables } = await pgClient.query(`
94 SELECT table_name
95 FROM information_schema.tables
96 WHERE table_schema = 'public'
97 `)
98
99 // eslint-disable-next-line camelcase
100 const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
101 if (!tableExists) {
102 throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
103 }
104
105 // TODO: Handle pgClient closing
106 }
107
108 if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
109 const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)
110
111 pgClientNormalized = new pg.Client(pgConfig)
112 await pgClientNormalized.connect()
113
114 // Check if table exists and have proper columns
115 const { rows: tables } = await pgClientNormalized.query(`
116 SELECT table_name
117 FROM information_schema.tables
118 WHERE table_schema = 'public'
119 `)
120
121 // eslint-disable-next-line camelcase
122 const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
123 // eslint-disable-next-line camelcase
124 const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
125 if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
126 if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)
127
128 // TODO: Handle pgClient closing
129 }
130}
131
132// inspired by @drobnikj
133// TODO: Similar, but less obfuscated for easier debugging
134export const createUniqueKeyFromUrl = (url) => {
135 const hash = createHash(`sha256`)
136 const cleanUrl = url.split(`://`)[1] // Remove protocol
137 hash.update(cleanUrl)
138 return hash.digest(`hex`)
139}
140
141/**
142 *
143 * @param {Date} datetime
144 * @return {Promise<void>}
145 */
146export const sleepUntil = async (datetime) => {
147 const now = new Date()
148 const difference = datetime - now
149 if (difference > 0) {
150 return new Promise((resolve) => {
151 setTimeout(resolve, difference)
152 })
153 }
154 return Promise.resolve()
155}
156
157export function parsePrice (string) {
158 let amount, currency
159 const noText = string.replace(/[^\d,.]/g, ``)
160 const decimals = noText.match(/([,.])(\d{2})$/)
161 if (decimals) {
162 const decimalSeparator = decimals[1]
163 // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
164 const decimalAmount = decimals[2]
165 amount = parseInt(noText.split(decimalSeparator)[0])
166 } {
167 const justNumbers = noText.replace(/[,.]/g, ``)
168 amount = parseInt(justNumbers)
169 }
170 return { amount, currency }
171}
172
173export function toNumberOrNull (str) {
174 // TODO: Handle better, but only after adding test
175 if (str === undefined) return null
176 if (str === null) return null
177 if (str === ``) return null
178 const num = Number(str)
179 if (Number.isNaN(num)) return null
180 return num
181}
182
183export async function save (objs) {
184 if (!Array.isArray(objs)) objs = [objs]
185 if (objs.length === 0) return
186
187 const objsExtended = objs.map((obj) => {
188 const objExtended = {
189 ...obj,
190 actorName,
191 ...globalLogsProps,
192 // __NODE_VERSION: global.process.versions.node,
193 // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
194 }
195 // if run on Apify
196 if (process.env.APIFY_IS_AT_HOME) {
197 objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
198 objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
199 objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
200 objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
201 objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
202 if (!process.env.APIFY_DONT_STORE_IN_DATASET) void Dataset.pushData(obj)
203 }
204 return objExtended
205 })
206 // if runs on local machine (MacOS)
207 if (os.platform() === `darwin`) {
208 const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
209 const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
210 if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
211 const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
212 if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
213 for (const objExtended of objsExtended) {
214 const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
215 const fileName = `${filenamify(id)}.json`
216 const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
217 fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
218 }
219 }
220
221 if (pgClient) {
222 const objsPg = objs.map((obj) => ({
223 ...obj,
224 // TODO: This is becoming not nice, and not clear
225 shop: actorName,
226 scrapedAt: new Date().toISOString().split(`T`)[0],
227 }))
228
229 const columns = getColumns(objsPg)
230 const values = getValues(objsPg)
231 const queryString = `
232 INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
233 VALUES (${values})
234 `
235 try {
236 const { rowCount } = await pgClient.query(queryString)
237 console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
238 } catch (err) {
239 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
240 else throw err
241 }
242 }
243
244 // Only make sense for HlidacShopu
245 if (pgClientNormalized) {
246 const objsPgData = objs.map((obj) => ({
247 shop: actorName,
248 pid: obj.pid,
249 name: obj.name,
250 url: obj.url,
251 img: obj.img,
252 }))
253
254 const objsPgDataPrice = objs.map((obj) => ({
255 shop: actorName,
256 pid: obj.pid,
257 scrapedAt: new Date().toISOString().split(`T`)[0],
258 currentPrice: obj.currentPrice,
259 originalPrice: obj.originalPrice,
260 inStock: obj.inStock,
261 }))
262
263 const queryString = `
264 INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
265 VALUES (${getValues(objsPgData)})
266 ON CONFLICT DO NOTHING
267 `
268 try {
269 const { rowCount } = await pgClientNormalized.query(queryString)
270 console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
271 } catch (err) {
272 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
273 else throw err
274 }
275
276 const queryStringPrice = `
277 INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
278 VALUES (${getValues(objsPgDataPrice)})
279 ON CONFLICT DO NOTHING
280 `
281 try {
282 const { rowCount } = await pgClientNormalized.query(queryStringPrice)
283 console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
284 } catch (err) {
285 if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
286 else throw err
287 }
288 }
289
290 if (elasticClient) {
291 // .index creates or updates the document
292 // .create creates a new document if it doesn't exist, 409 if it does
293 // try {
294 // const res = await elasticClient.index({
295 // index: `actors-monorepo-shops`, // TODO: Consider using actorName
296 // id, // foo-bar
297 // document: objExtended, // {...}
298 // })
299 // } catch (err) {
300 // // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
301 // if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
302 // else throw err
303 // }
304 }
305}
306
307function getColumns (objs) {
308 return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
309}
310
311function getValues (objs) {
312 return objs.map(objPg => Object.values(objPg).map((value) => {
313 // escape strings to prevent SQL injection
314 if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
315 // convert to DB specific null
316 if (typeof value === `undefined` || value === null) return `NULL`
317 return value
318 }).join(`, `)).join(`), (`)
319}
320
321export function parseEnvFromInput (input) {
322 const env = {}
323 for (const key in input) {
324 if (key === key.toUpperCase()) env[key] = input[key]
325 }
326 console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
327 Object.assign(process.env, env)
328}
Developer
Maintained by Community
Categories