FDF Bike Shop (fdfbikeshop.cz) scraper avatar

FDF Bike Shop (fdfbikeshop.cz) scraper

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
FDF Bike Shop (fdfbikeshop.cz) scraper

FDF Bike Shop (fdfbikeshop.cz) scraper

strajk/fdf-bike-shop-fdfbikeshop-cz-scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

1FROM apify/actor-node:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6  && npm install --only=prod --no-optional
7
8COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "mode": {
8      "title": "Mode",
9      "description": "",
10      "type": "string",
11      "editor": "select",
12      "default": "TEST",
13      "prefill": "TEST",
14      "enum": [
15        "TEST",
16        "FULL"
17      ],
18      "enumTitles": [
19        "TEST",
20        "FULL"
21      ]
22    },
23    "APIFY_DONT_STORE_IN_DATASET": {
24      "sectionCaption": "Advanced",
25      "sectionDescription": "Advanced options, use only if you know what you're doing.",
26      "title": "Don't store in dataset",
27      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
28      "type": "boolean",
29      "default": false,
30      "editor": "checkbox"
31    },
32    "PG_CONNECTION_STRING_NORMALIZED": {
33      "title": "Postgres connection string for normalized data",
34      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
35      "type": "string",
36      "editor": "textfield"
37    },
38    "PG_DATA_TABLE": {
39      "title": "Postgres table name for product data",
40      "description": "Table name for storing product name, url, image, ...",
41      "type": "string",
42      "editor": "textfield"
43    },
44    "PG_DATA_PRICE_TABLE": {
45      "title": "Postgres table name for price data",
46      "description": "Table name for storing price, original price, stock status, ...",
47      "type": "string",
48      "editor": "textfield"
49    }
50  },
51  "required": [
52    "mode"
53  ]
54}

apify.json

1{
2  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1import { Actor } from "apify3";
2import { CheerioCrawler, createCheerioRouter } from "crawlee";
3import { init, save, toNumberOrNull } from "./_utils/common.js";
4
5const LABELS = {
6  INDEX: `INDEX`,
7  PRODUCTS: `PRODUCTS`,
8};
9
10var MODE;
11
12(function (MODE) {
13  MODE["TEST"] = "TEST";
14  MODE["FULL"] = "FULL";
15})(MODE || (MODE = {}));
16
17const baseUrl = `https://www.fdfbikeshop.cz`;
18
19async function enqueueInitial(mode, crawler) {
20  if (mode === MODE.FULL) {
21    await crawler.addRequests([
22      {
23        userData: { label: LABELS.INDEX },
24        url: `${baseUrl}/sitemap.xml`,
25      },
26    ]);
27  } else if (mode === MODE.TEST) {
28    await crawler.addRequests([
29      {
30        userData: { label: LABELS.PRODUCTS },
31        url: `${baseUrl}/znacka/crankbrothers/`,
32      },
33    ]);
34    await crawler.addRequests([
35      {
36        userData: { label: LABELS.PRODUCTS },
37        url: `${baseUrl}/znacka/dt-swiss/`,
38      },
39    ]);
40    await crawler.addRequests([
41      {
42        userData: { label: LABELS.PRODUCTS },
43        url: `${baseUrl}/znacka/galfer/`,
44      },
45    ]);
46  }
47}
48
49const router = createCheerioRouter();
50
51router.addHandler(LABELS.INDEX, async ({ $, crawler }) => {
52  const urls = $(`url loc`)
53    .map((i, x) => $(x).text())
54    .toArray();
55  // https://www.fdfbikeshop.cz/znacka/100/ » brands, only type of urls we are interested in
56  // https://www.fdfbikeshop.cz/nahradni-dily-loziska/
57  // https://www.fdfbikeshop.cz/kazety-12sp/
58  // @ts-ignore
59  const brandsUrls = urls.filter((url) => url?.includes(`/znacka/`));
60  const requests = [];
61  for (const url of brandsUrls) {
62    requests.push({
63      userData: { label: LABELS.PRODUCTS },
64      // @ts-ignore
65      url,
66    });
67  }
68  await crawler.addRequests(requests);
69});
70
71router.addHandler(LABELS.PRODUCTS, async ({ $, crawler, request }) => {
72  console.log(`Processing ${request.url}`);
73
74  // on first page » handle pagination
75  if (!request.url.match(/\/strana-\d+\//)) {
76    // /znacka/fox/strana-2/
77    let totalPages;
78    try {
79      totalPages = parseInt(
80        // $('.pagination-description-pages:eq(0) strong:eq(1)').text() » this does not work
81        $(`.pagination-description-pages`).eq(0).find(`strong`).eq(1).text()
82      );
83    } catch (e) {
84      totalPages = 1;
85    }
86    if (totalPages > 1) {
87      console.log(`> …found ${totalPages} pages, enqueuing them…`);
88    }
89    for (let i = 2; i <= totalPages; i++) {
90      // skip first page, that is already handled
91      void crawler.addRequests([
92        {
93          userData: { label: LABELS.PRODUCTS },
94          url: `${request.url}strana-${i}/`,
95        },
96      ]);
97    }
98  }
99
100  const products = [];
101  $(`#category-products-wrapper .product`).each((i, el) => {
102    const id = $(el).attr(`data-micro-product-id`); // e.g. 20080005-40200770
103
104    const relUrl = $(el).find(`a.p-name`).attr(`href`);
105    const url = `${baseUrl}${relUrl}`;
106    const title = $(el).find(`[data-micro='name']`).text();
107
108    const priceRaw = $(el).find(`.p-det-main-price`).text().trim(); // e.g. `39 990 Kč`
109    const price = priceRaw.replace(/[^\d,]/g, ``); // keep comma as it's decimal separator
110
111    const priceOrigRaw = $(el)
112      .find(`.p-standard-price span.line`)
113      .text()
114      .trim();
115    const priceOrig = priceOrigRaw.replace(/[^\d,]/g, ``);
116
117    const img = $(el).find(`img[data-micro='image']`).attr(`src`);
118
119    let inStock = false;
120    try {
121      inStock = $(el).find(`.p-cat-availability`).text().includes(`Skladem`);
122    } catch (err) {
123      // ignore
124    }
125
126    const product = {
127      pid: id,
128      name: title,
129      url: url,
130      img: img,
131      inStock,
132      currentPrice: toNumberOrNull(price),
133      originalPrice: toNumberOrNull(priceOrig),
134      currency: `CZK`,
135    };
136    products.push(product);
137  });
138  void save(products);
139});
140
141void Actor.main(async () => {
142  const input = await Actor.getInput();
143  const { mode = MODE.FULL, ...rest } = input ?? {};
144  await init({ actorNameOverride: `fdfbikeshop-cz` }, rest);
145  const crawler = new CheerioCrawler({ requestHandler: router });
146  await enqueueInitial(mode, crawler);
147  await crawler.run();
148});

package.json

1{
2  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify3": "npm:apify@^3.0.2",
11    "crawlee": "*",
12    "pg": "*",
13    "pg-connection-string": "*",
14    "dotenv": "*",
15    "find-config": "*",
16    "@elastic/elasticsearch": "*",
17    "filenamify": "*"
18  },
19  "apify": {
20    "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
21    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
22    "isPublic": true,
23    "isDeprecated": false,
24    "isAnonymouslyRunnable": true,
25    "notice": "",
26    "pictureUrl": "",
27    "seoTitle": "",
28    "seoDescription": "",
29    "categories": [
30      "ECOMMERCE"
31    ]
32  }
33}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "fdf-bike-shop-fdfbikeshop-cz-scraper",
4  "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
5  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "FDF Bike Shop (fdfbikeshop.cz) scraper",
11      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "pid",
19              "name",
20              "url",
21              "img",
22              "inStock",
23              "currentPrice",
24              "originalPrice",
25              "currency"
26            ]
27          },
28          "display": {
29            "component": "table",
30            "columns": [
31              {
32                "label": "Pid",
33                "field": "pid",
34                "format": "text"
35              },
36              {
37                "label": "Name",
38                "field": "name",
39                "format": "text"
40              },
41              {
42                "label": "Url",
43                "field": "url",
44                "format": "link"
45              },
46              {
47                "label": "Img",
48                "field": "img",
49                "format": "image"
50              },
51              {
52                "label": "In Stock",
53                "field": "inStock",
54                "format": "boolean"
55              },
56              {
57                "label": "Current Price",
58                "field": "currentPrice",
59                "format": "number"
60              },
61              {
62                "label": "Original Price",
63                "field": "originalPrice",
64                "format": "number"
65              },
66              {
67                "label": "Currency",
68                "field": "currency",
69                "format": "text"
70              }
71            ]
72          }
73        }
74      }
75    }
76  }
77}

.actor/logo.png

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Dataset } from 'crawlee'
13
14config({ path: findConfig(`.env`) })
15
16const elasticIndexName = `actors-monorepo-shops`
17
18const globalLogsProps = {
19  __NODE_STARTED: new Date().toISOString(),
20}
21
22let actorName
23let pgClient
24let pgClientNormalized
25let elasticClient
26export async function init ({ actorNameOverride }, restInput) {
27  parseEnvFromInput(restInput)
28
29  if (os.platform() === `darwin`) {
30    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
31    const basename = path.basename(filePath) // foo.ts
32    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
33    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
34      .split(` `)[1]
35      .trim()
36      .replace(`refs/heads/`, ``)
37    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
38    const gitCommitShort = gitCommit.substring(0, 7)
39    globalLogsProps.__GIT_COMMIT = gitCommitShort
40  }
41
42  if (process.env.APIFY_IS_AT_HOME) {
43    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
44  }
45
46  /* ELASTIC */
47  /* ======= */
48  if (process.env.ELASTIC_CLOUD_ID) {
49    elasticClient = new ElasticClient({
50      cloud: { id: process.env.ELASTIC_CLOUD_ID },
51      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
52    })
53
54    // const mapping = await elasticClient.indices.getMapping({ index: actorName })
55
56    // eslint-disable-next-line no-inner-declarations
57    async function enforceIndexMapping () {
58      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
59      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
60      await elasticClient.indices.putMapping({
61        index: elasticIndexName,
62        body: {
63          properties: {
64            _discount: { type: `float` },
65            originalPrice: { type: `float` },
66            currentPrice: { type: `float` },
67          },
68        },
69      })
70    }
71
72    try {
73      await enforceIndexMapping()
74    } catch (err) {
75      if (err.message.includes(`cannot be changed from type`)) {
76        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
77        await elasticClient.indices.delete({ index: elasticIndexName })
78        await enforceIndexMapping()
79      }
80    }
81  }
82
83  /* POSTGRESQL */
84  /* ========== */
85  if (process.env.PG_CONNECTION_STRING) {
86    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
87    // const pgPool = new pg.Pool(pgConfig)
88
89    pgClient = new pg.Client(pgConfig)
90    await pgClient.connect()
91
92    // Check if table exists and have proper columns
93    const { rows: tables } = await pgClient.query(`
94    SELECT table_name
95    FROM information_schema.tables
96    WHERE table_schema = 'public'
97  `)
98
99    // eslint-disable-next-line camelcase
100    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
101    if (!tableExists) {
102      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
103    }
104
105  // TODO: Handle pgClient closing
106  }
107
108  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
109    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)
110
111    pgClientNormalized = new pg.Client(pgConfig)
112    await pgClientNormalized.connect()
113
114    // Check if table exists and have proper columns
115    const { rows: tables } = await pgClientNormalized.query(`
116    SELECT table_name
117    FROM information_schema.tables
118    WHERE table_schema = 'public'
119  `)
120
121    // eslint-disable-next-line camelcase
122    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
123    // eslint-disable-next-line camelcase
124    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
125    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
126    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)
127
128  // TODO: Handle pgClient closing
129  }
130}
131
132// inspired by @drobnikj
133// TODO: Similar, but less obfuscated for easier debugging
134export const createUniqueKeyFromUrl = (url) => {
135  const hash = createHash(`sha256`)
136  const cleanUrl = url.split(`://`)[1] // Remove protocol
137  hash.update(cleanUrl)
138  return hash.digest(`hex`)
139}
140
141/**
142 *
143 * @param {Date} datetime
144 * @return {Promise<void>}
145 */
146export const sleepUntil = async (datetime) => {
147  const now = new Date()
148  const difference = datetime - now
149  if (difference > 0) {
150    return new Promise((resolve) => {
151      setTimeout(resolve, difference)
152    })
153  }
154  return Promise.resolve()
155}
156
157export function parsePrice (string) {
158  let amount, currency
159  const noText = string.replace(/[^\d,.]/g, ``)
160  const decimals = noText.match(/([,.])(\d{2})$/)
161  if (decimals) {
162    const decimalSeparator = decimals[1]
163    // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
164    const decimalAmount = decimals[2]
165    amount = parseInt(noText.split(decimalSeparator)[0])
166  } {
167    const justNumbers = noText.replace(/[,.]/g, ``)
168    amount = parseInt(justNumbers)
169  }
170  return { amount, currency }
171}
172
173export function toNumberOrNull (str) {
174  // TODO: Handle better, but only after adding test
175  if (str === undefined) return null
176  if (str === null) return null
177  if (str === ``) return null
178  const num = Number(str)
179  if (Number.isNaN(num)) return null
180  return num
181}
182
183export async function save (objs) {
184  if (!Array.isArray(objs)) objs = [objs]
185  if (objs.length === 0) return
186
187  const objsExtended = objs.map((obj) => {
188    const objExtended = {
189      ...obj,
190      actorName,
191      ...globalLogsProps,
192      // __NODE_VERSION: global.process.versions.node,
193      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
194    }
195    // if run on Apify
196    if (process.env.APIFY_IS_AT_HOME) {
197      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
198      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
199      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
200      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
201      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
202      if (!process.env.APIFY_DONT_STORE_IN_DATASET) void Dataset.pushData(obj)
203    }
204    return objExtended
205  })
206  // if runs on local machine (MacOS)
207  if (os.platform() === `darwin`) {
208    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
209    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
210    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
211    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
212    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
213    for (const objExtended of objsExtended) {
214      const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
215      const fileName = `${filenamify(id)}.json`
216      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
217      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
218    }
219  }
220
221  if (pgClient) {
222    const objsPg = objs.map((obj) => ({
223      ...obj,
224      // TODO: This is becoming not nice, and not clear
225      shop: actorName,
226      scrapedAt: new Date().toISOString().split(`T`)[0],
227    }))
228
229    const columns = getColumns(objsPg)
230    const values = getValues(objsPg)
231    const queryString = `
232        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
233        VALUES (${values})
234    `
235    try {
236      const { rowCount } = await pgClient.query(queryString)
237      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
238    } catch (err) {
239      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
240      else throw err
241    }
242  }
243
244  // Only make sense for HlidacShopu
245  if (pgClientNormalized) {
246    const objsPgData = objs.map((obj) => ({
247      shop: actorName,
248      pid: obj.pid,
249      name: obj.name,
250      url: obj.url,
251      img: obj.img,
252    }))
253
254    const objsPgDataPrice = objs.map((obj) => ({
255      shop: actorName,
256      pid: obj.pid,
257      scrapedAt: new Date().toISOString().split(`T`)[0],
258      currentPrice: obj.currentPrice,
259      originalPrice: obj.originalPrice,
260      inStock: obj.inStock,
261    }))
262
263    const queryString = `
264        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
265        VALUES (${getValues(objsPgData)})
266        ON CONFLICT DO NOTHING
267    `
268    try {
269      const { rowCount } = await pgClientNormalized.query(queryString)
270      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
271    } catch (err) {
272      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
273      else throw err
274    }
275
276    const queryStringPrice = `
277        INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
278        VALUES (${getValues(objsPgDataPrice)})
279        ON CONFLICT DO NOTHING
280    `
281    try {
282      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
283      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
284    } catch (err) {
285      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
286      else throw err
287    }
288  }
289
290  if (elasticClient) {
291    // .index creates or updates the document
292    // .create creates a new document if it doesn't exist, 409 if it does
293    // try {
294    //   const res = await elasticClient.index({
295    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
296    //     id, // foo-bar
297    //     document: objExtended, // {...}
298    //   })
299    // } catch (err) {
300    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
301    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
302    //   else throw err
303    // }
304  }
305}
306
307function getColumns (objs) {
308  return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
309}
310
311function getValues (objs) {
312  return objs.map(objPg => Object.values(objPg).map((value) => {
313    // escape strings to prevent SQL injection
314    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
315    // convert to DB specific null
316    if (typeof value === `undefined` || value === null) return `NULL`
317    return value
318  }).join(`, `)).join(`), (`)
319}
320
321export function parseEnvFromInput (input) {
322  const env = {}
323  for (const key in input) {
324    if (key === key.toUpperCase()) env[key] = input[key]
325  }
326  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
327  Object.assign(process.env, env)
328}
Developer
Maintained by Community