Bike24 (bike24.de) scraper

This Actor is under maintenance and may be unreliable.

strajk/bike24-bike24-de-scraper

Scrapes product titles, prices, images, and availability. Does NOT scrape product details.

Maintained by Community
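
Example dataset item (illustrative only; the fields match the dataset view in .actor/actor.json below, the values and image URL are hypothetical):

{
  "pid": "2421335",
  "name": "Example Brand Example Product",
  "url": "https://www.bike24.com/p2421335.html",
  "img": "https://www.bike24.com/media/example-product.jpg",
  "inStock": true,
  "currentPrice": 120.88,
  "originalPrice": 157.18,
  "currency": "EUR"
}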

Dockerfile

FROM apify/actor-node-playwright-firefox:16

COPY package.json ./

RUN npm --quiet set progress=false \
  && npm install aws-crt \
  && npm install --only=prod --no-optional

COPY . ./

INPUT_SCHEMA.json

{
  "title": "Bike24 (bike24.de) scraper",
  "description": "Scrapes product titles, prices, images, and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL"
      ],
      "enumTitles": [
        "TEST mode (scrapes only a few categories)",
        "FULL"
      ]
    },
    "proxyConfiguration": {
      "title": "Proxy configuration",
      "description": "Select proxies to be used by your actor.",
      "type": "object",
      "editor": "proxy",
      "default": {
        "useApifyProxy": true,
        "apifyProxyGroups": [
          "RESIDENTIAL"
        ]
      },
      "prefill": {
        "useApifyProxy": true,
        "apifyProxyGroups": [
          "RESIDENTIAL"
        ]
      }
    },
    "debug": {
      "title": "Debug",
      "description": "Debug mode prints more logs and disables concurrency and other optimizations.",
      "type": "boolean",
      "editor": "checkbox",
      "default": false
    },
    "APIFY_USE_MEMORY_REQUEST_QUEUE": {
      "sectionCaption": "Advanced",
      "sectionDescription": "Advanced options, use only if you know what you're doing.",
      "title": "Use in-memory request queue instead of the native one",
      "description": "An in-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "APIFY_DONT_STORE_IN_DATASET": {
      "title": "Don't store in dataset",
      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, such as your own database.",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "PG_CONNECTION_STRING_NORMALIZED": {
      "title": "Postgres connection string for normalized data",
      "description": "If set, the actor will store normalized data in a Postgres database, in the PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables.",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_TABLE": {
      "title": "Postgres table name for product data",
      "description": "Table name for storing product name, url, image, ...",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_PRICE_TABLE": {
      "title": "Postgres table name for price data",
      "description": "Table name for storing price, original price, stock status, ...",
      "type": "string",
      "editor": "textfield"
    }
  },
  "required": [
    "mode",
    "proxyConfiguration"
  ]
}
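
For reference, a minimal input conforming to this schema (matching the prefills above):

{
  "mode": "TEST",
  "proxyConfiguration": {
    "useApifyProxy": true,
    "apifyProxyGroups": ["RESIDENTIAL"]
  },
  "debug": false
}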

apify.json

{
  "name": "bike24-bike24-de-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 4096
  }
}

main.js

import { URL } from "node:url";
import { Actor } from "apify3";
import {
  CheerioCrawler,
  createCheerioRouter,
  utils as crawleeUtils,
} from "crawlee";
import playwright from "playwright";
import { Session } from "@crawlee/core";
import { init, save } from "./_utils/common.js";

const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

const MODE = {
  TEST: `TEST`,
  FULL: `FULL`,
};

const BASE_URL = `https://www.bike24.com`;

async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `https://www.bike24.com/brands`,
      },
    ]);
  } else if (mode === MODE.TEST) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `https://www.bike24.com/brands/100percent`,
      },
    ]);
  }
}

const router = createCheerioRouter();

router.addHandler(LABELS.INDEX, async ({ crawler, $ }) => {
  $(`.list-brands-sitemap__section-item a`).each((i, el) => {
    const url = $(el).attr(`href`); // urls are relative
    const fullUrl = `${BASE_URL}${url}`;
    const name = $(el).text().trim(); // there's extra space at the beginning and end
    void crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS, category: name },
        url: fullUrl,
      },
    ]);
  });
});

router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request }) => {
  if (!request.url.includes(`page=`)) {
    // on first page
    const totalPages = Number($(`.page-pagination-item`).last().text()); // e.g. `12`
    // FIXME: pagination is currently capped at 3 pages
    for (let i = 2; i <= Math.min(totalPages, 3); i++) {
      // skip first page, that is already handled
      const url = new URL(request.url);
      url.searchParams.set(`page`, i.toString());
      void crawler.addRequests([
        {
          url: url.toString(),
          userData: {
            label: LABELS.PRODUCTS,
            category: request.userData.category, // pass category name
          },
        },
      ]);
    }
  }

  const TAX_RATE = 1.21; // listed prices are net; 21% VAT matches the countryTax cookie set in unblock()

  const products = [];
  const $products = $(`.product-tile`);
  $products.each((i, el) => {
    const pid = $(el)
      .find(`.product-tile__anchor`)
      .attr(`href`)
      .replace(/\D/g, ``); // e.g. `p2421335.html` -> `2421335`
    const relUrl = $(el).find(`.product-tile__anchor`).attr(`href`); // relative url
    const url = `${BASE_URL}${relUrl}`;
    const name = $(el).find(`.product-tile__title`)?.text()?.trim();
    const prices = JSON.parse($(`.productPrice`, el).attr(`data-props`));
    const img = $(el).find(`.product-tile__picture img`).attr(`src`);
    const inStock = !!$(`.delivery-message--success`, el).length; // scoped to the tile; a page-wide check would mark every product in stock
    const product = {
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice: prices.price * TAX_RATE,
      originalPrice: prices.oldPrice
        ? prices.oldPrice * TAX_RATE
        : prices.price * TAX_RATE,
      currency: `EUR`,
    };
    products.push(product);
  });
  await save(products);
});

void Actor.main(async () => {
  const input = await Actor.getInput();
  const {
    mode = MODE.FULL,
    proxyConfiguration: inputProxyConfiguration,
    ...rest
  } = input ?? {};

  // TODO: Better pattern to handle both proxy and no proxy
  const proxyConfiguration = inputProxyConfiguration
    ? await Actor.createProxyConfiguration(inputProxyConfiguration)
    : undefined;

  await init({ actorNameOverride: `bike-24` }, rest);
  const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxConcurrency: 1,
    maxRequestRetries: 0,
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
        // TODO: Investigate why so many Firefox sessions are created
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session, proxyConfiguration);
        return session;
      },
    },
    persistCookiesPerSession: true,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const userData = session.userData;
        gotOptions.headers = userData.headers; // real-like headers obtained from Firefox
        gotOptions.headers.Cookie = userData.cookies
          .map((c) => `${c.name}=${c.value}`)
          .join(`; `); // real cookies obtained from Firefox
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for debugging with MITM
      },
    ],
    requestHandler: router,
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

async function unblock(session, proxyConfiguration) {
  const browser = await playwright.firefox.launch({
    headless: true, // NOTE: set to false for debugging
    // TODO: Better pattern to handle both proxy and no proxy
    proxy: proxyConfiguration
      ? { server: await proxyConfiguration.newUrl(session.id) }
      : undefined,
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for debugging with MITM
  });
  const browserContext = await browser.newContext({ ignoreHTTPSErrors: true });

  const countryCode = `29`;
  await browserContext.addCookies([
    {
      name: `countryTax`,
      value: `{"shippingCountry":${countryCode},"taxRates":[{"value":21,"name":"Normaler Mehrwertsteuersatz","taxGroup":1},{"value":15,"name":"Lebensmittel mit red. MwSt.","taxGroup":2},{"value":15,"name":"Druckerzeugnisse","taxGroup":3}],"validUntil":"Wednesday, 16-Nov-2022 00:00:00 UTC"}`, // FIXME
      domain: `www.bike24.com`,
      path: `/`,
    },
    {
      name: `deliveryLocation`,
      value: `{"country":${countryCode},"zipCode":null}`,
      domain: `www.bike24.com`,
      path: `/`,
    },
  ]);

  const page = await browserContext.newPage();
  // page.on(`console`, msg => console.log(`⚪️ Playwright log (${msg.type()}) ${msg.text()}`))

  let headersToSet;

  await page.route(`**/*`, (route) => {
    const request = route.request();
    const url = request.url();
    const method = request.method(); // GET, POST, etc.
    const resourceType = request.resourceType(); // document, stylesheet, image, ...
    // console.log(`🔵 Playwright route: ${method} ${url} (${resourceType})`)

    // use the first main request to store the sent headers
    if (!headersToSet) headersToSet = pickHeaders(request.headers());

    route.continue();
  });

  await page.goto(`https://www.bike24.com/brands/shimano`);
  // Wait for some time to pass basic Cloudflare Javascript checks
  await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000 ms is enough for r2-bike.com, but not for g2.com
  // Get all cookies and store them for subsequent requests
  const cookies = await page.context().cookies();
  const cfCookie = cookies.find((c) => c.name === `__cf_bm`)?.value;
  console.log(
    `[SESSION] Cloudflare cookie "__cf_bm": ${cfCookie ?? `😱😱😱 not found`}`
  );
  session.userData = { headers: headersToSet, cookies };
  await browser.close();
}

function pickHeaders(headers) {
  // Pick just the headers that gotScraping can correctly handle (= order).
  // This seems to be needed mainly to avoid setting the Host header, which, when set, was at the end of the headers list, which Cloudflare did not like.
  // If we skip the Host header, gotScraping will set it automatically, and in the correct order.

  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const headersList = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,

    // Handling cookies explicitly
    // `cookie`,
    // `Cookie`,

    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];
  return headersList.reduce((acc, header) => {
    if (headers[header]) acc[header] = headers[header];
    return acc;
  }, {});
}

package.json

{
  "name": "bike24-bike24-de-scraper",
  "description": "Scrapes product titles, prices, images, and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify3": "npm:apify@^3.0.2",
    "crawlee": "*",
    "playwright": "*",
    "@crawlee/core": "*",
    "pg": "*",
    "pg-connection-string": "*",
    "dotenv": "*",
    "find-config": "*",
    "@elastic/elasticsearch": "*",
    "filenamify": "*",
    "@crawlee/memory-storage": "*"
  },
  "apify": {
    "title": "Bike24 (bike24.de) scraper",
    "description": "Scrapes product titles, prices, images, and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "bike24-bike24-de-scraper",
  "title": "Bike24 (bike24.de) scraper",
  "description": "Scrapes product titles, prices, images, and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Bike24 (bike24.de) scraper",
      "description": "Scrapes product titles, prices, images, and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "pid",
              "name",
              "url",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Pid",
                "field": "pid",
                "format": "text"
              },
              {
                "label": "Name",
                "field": "name",
                "format": "text"
              },
              {
                "label": "Url",
                "field": "url",
                "format": "link"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

.actor/logo.png

_utils/common.js

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Configuration, Dataset } from 'crawlee'
import { MemoryStorage } from '@crawlee/memory-storage'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

let actorName
let pgClient
let pgClientNormalized
let elasticClient
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // const mapping = await elasticClient.indices.getMapping({ index: actorName })

    // eslint-disable-next-line no-inner-declarations
    async function enforceIndexMapping () {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As an existing mapping cannot be changed, the index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
    // const pgPool = new pg.Pool(pgConfig)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    // Check that the table exists and has the proper columns
    const { rows: tables } = await pgClient.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    if (!tableExists) {
      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    }

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    // Check that the tables exist and have the proper columns
    const { rows: tables } = await pgClientNormalized.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    // eslint-disable-next-line camelcase
    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)

    // TODO: Handle pgClient closing
  }
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  const cleanUrl = url.split(`://`)[1] // Remove protocol
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}

/**
 * Sleeps until the given datetime.
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const now = new Date()
  const difference = datetime - now
  if (difference > 0) {
    return new Promise((resolve) => {
      setTimeout(resolve, difference)
    })
  }
  return Promise.resolve()
}

// TODO: Uff, nicer! But at least it's tested
export function parsePrice (string) {
  let amount, currency
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  if (decimals) {
    const decimalSeparator = decimals[1]
    const decimalAmount = decimals[2]
    const mainAmount = noText.split(decimalSeparator)[0].replace(/\D/g, ``)
    amount = parseFloat(mainAmount + `.` + decimalAmount)
  } else {
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers)
  }
  return { amount, currency }
}

export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding a test
  if (str === undefined) return null
  if (str === null) return null
  if (str === ``) return null
  const num = Number(str)
  if (Number.isNaN(num)) return null
  return num
}

export async function save (objs) {
  if (!Array.isArray(objs)) objs = [objs]
  if (objs.length === 0) return console.log(`No data to save.`)

  // Note: the map callback is async, so we need Promise.all to get resolved objects, not pending promises
  const objsExtended = await Promise.all(objs.map(async (obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if run on Apify
    if (process.env.APIFY_IS_AT_HOME) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
      if (process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
        await Dataset.pushData(obj)
      }
    }
    return objExtended
  }))
  // if run on a local machine (macOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = objs.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    const columns = getColumns(objsPg)
    const values = getValues(objsPg)
    const queryString = `
        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
        VALUES (${values})
    `
    try {
      const { rowCount } = await pgClient.query(queryString)
      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }
  }

  // Only makes sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
        VALUES (${getValues(objsPgData)})
        ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryString)
      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }

    const queryStringPrice = `
        INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
        VALUES (${getValues(objsPgDataPrice)})
        ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }
  }

  if (elasticClient) {
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}

function getColumns (objs) {
  return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
}

function getValues (objs) {
  return objs.map(objPg => Object.values(objPg).map((value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB-specific null
    if (typeof value === `undefined` || value === null) return `NULL`
    return value
  }).join(`, `)).join(`), (`)
}

export function parseEnvFromInput (input) {
  const env = {}
  for (const key in input) {
    if (key === key.toUpperCase()) env[key] = input[key]
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
  Object.assign(process.env, env)
}

export const isInspect =
  process.execArgv.join().includes(`--inspect`) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)
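
A quick worked example of the helpers above (a sketch; note that parsePrice never assigns currency, so it comes back undefined):

import { parsePrice, toNumberOrNull, createUniqueKeyFromUrl } from "./_utils/common.js";

parsePrice(`1.299,99 €`); // -> { amount: 1299.99, currency: undefined }
parsePrice(`€ 49`);       // -> { amount: 49, currency: undefined }

toNumberOrNull(`42.5`);   // -> 42.5
toNumberOrNull(``);       // -> null

createUniqueKeyFromUrl(`https://www.bike24.com/p2421335.html`); // -> sha256 hex digest of "www.bike24.com/p2421335.html"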