Bike24 (bike24.de) scraper avatar
Bike24 (bike24.de) scraper
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Bike24 (bike24.de) scraper

Bike24 (bike24.de) scraper

strajk/bike24-bike24-de-scraper

Scrapes products titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

# Base image: Apify actor image with Node.js 16 and Playwright Firefox (per the image name/tag)
FROM apify/actor-node-playwright-firefox:16

# Copy the manifest before the sources; presumably so the dependency layer stays cached when only sources change
COPY package.json ./

# Install production dependencies only, skipping optional ones.
# NOTE(review): aws-crt is installed explicitly although package.json does not list it - reason not visible here, confirm it is still needed
RUN npm --quiet set progress=false \
  && npm install aws-crt \
  && npm install --only=prod --no-optional

# Copy the rest of the actor sources into the image
COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "Bike24 (bike24.de) scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "mode": {
8      "title": "Mode",
9      "description": "",
10      "type": "string",
11      "editor": "select",
12      "default": "TEST",
13      "prefill": "TEST",
14      "enum": [
15        "TEST",
16        "FULL"
17      ],
18      "enumTitles": [
19        "TEST mode (scrapes only few categories)",
20        "FULL"
21      ]
22    },
23    "proxyConfiguration": {
24      "title": "Proxy configuration",
25      "description": "Select proxies to be used by your actor.",
26      "type": "object",
27      "editor": "proxy",
28      "default": {
29        "useApifyProxy": true,
30        "apifyProxyGroups": [
31          "RESIDENTIAL"
32        ]
33      },
34      "prefill": {
35        "useApifyProxy": true,
36        "apifyProxyGroups": [
37          "RESIDENTIAL"
38        ]
39      }
40    },
41    "debug": {
42      "title": "Debug",
43      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
44      "type": "boolean",
45      "editor": "checkbox",
46      "default": false
47    },
48    "APIFY_USE_MEMORY_REQUEST_QUEUE": {
49      "sectionCaption": "Advanced",
50      "sectionDescription": "Advanced options, use only if you know what you're doing.",
51      "title": "Use in-memory request queue instead of the native one",
52      "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
53      "type": "boolean",
54      "default": false,
55      "editor": "checkbox"
56    },
57    "APIFY_DONT_STORE_IN_DATASET": {
58      "title": "Don't store in dataset",
59      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
60      "type": "boolean",
61      "default": false,
62      "editor": "checkbox"
63    },
64    "PG_CONNECTION_STRING_NORMALIZED": {
65      "title": "Postgres connection string for normalized data",
66      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
67      "type": "string",
68      "editor": "textfield"
69    },
70    "PG_DATA_TABLE": {
71      "title": "Postgres table name for product data",
72      "description": "Table name for storing product name, url, image, ...",
73      "type": "string",
74      "editor": "textfield"
75    },
76    "PG_DATA_PRICE_TABLE": {
77      "title": "Postgres table name for price data",
78      "description": "Table name for storing price, original price, stock status, ...",
79      "type": "string",
80      "editor": "textfield"
81    }
82  },
83  "required": [
84    "mode",
85    "proxyConfiguration"
86  ]
87}

apify.json

1{
2  "name": "bike24-bike24-de-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 4096
10  }
11}

main.js

1import { URL } from "node:url";
2import { Actor } from "apify3";
3import {
4  CheerioCrawler,
5  createCheerioRouter,
6  utils as crawleeUtils,
7} from "crawlee";
8import playwright from "playwright";
9import { Session } from "@crawlee/core";
10import { init, save } from "./_utils/common.js";
11
/** Request labels; the router dispatches each request to the handler registered for its label. */
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

/**
 * Run modes accepted in the actor input (`mode`).
 * Plain constant object instead of the transpiled `var` + IIFE enum boilerplate.
 */
const MODE = {
  TEST: `TEST`,
  FULL: `FULL`,
};

/** Origin that relative links found on the site are resolved against. */
const BASE_URL = `https://www.bike24.com`;
25
/**
 * Seeds the crawler with the start request for the selected run mode.
 *
 * - `MODE.FULL` starts at the brands index (handled by the INDEX handler).
 * - `MODE.TEST` starts directly at a single brand listing (handled by the PRODUCTS handler).
 *
 * @param {string} mode - one of the `MODE` values
 * @param {{ addRequests: Function }} crawler - crawler instance to enqueue into
 * @returns {Promise<void>}
 * @throws {Error} when `mode` is not a known mode; previously this silently
 *   enqueued nothing and the run finished "successfully" with zero results
 */
async function enqueueInitial(mode, crawler) {
  switch (mode) {
    case MODE.FULL:
      await crawler.addRequests([
        {
          userData: { label: LABELS.INDEX },
          url: `https://www.bike24.com/brands`,
        },
      ]);
      return;
    case MODE.TEST:
      await crawler.addRequests([
        {
          userData: { label: LABELS.PRODUCTS },
          url: `https://www.bike24.com/brands/100percent`,
        },
      ]);
      return;
    default:
      throw new Error(
        `Unsupported mode "${mode}", expected one of: ${Object.values(MODE).join(`, `)}`
      );
  }
}
43
const router = createCheerioRouter();

/**
 * INDEX handler: reads the brands index page and enqueues one PRODUCTS
 * request per brand link, passing the brand name along as `category`.
 *
 * All requests are collected first and enqueued with a single awaited call,
 * so the handler does not return (and the crawler cannot consider the queue
 * drained) before the brand pages are actually enqueued.
 */
router.addHandler(LABELS.INDEX, async ({ crawler, $ }) => {
  const brandRequests = [];
  $(`.list-brands-sitemap__section-item a`).each((i, el) => {
    const href = $(el).attr(`href`); // urls are relative
    if (!href) return; // anchor without a target, nothing to crawl
    const name = $(el).text().trim(); // there's extra space at the beginning and end
    brandRequests.push({
      userData: { label: LABELS.PRODUCTS, category: name },
      url: new URL(href, BASE_URL).toString(),
    });
  });
  if (brandRequests.length > 0) await crawler.addRequests(brandRequests);
});
59
/**
 * PRODUCTS handler: scrapes every product tile of a brand listing page and
 * saves the results. On the first page of a listing (URL without `page=`)
 * it also enqueues the follow-up pages.
 *
 * Prices in the tile's `data-props` are multiplied by TAX_RATE before saving.
 */
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request }) => {
  if (!request.url.includes(`page=`)) {
    // on first page
    const totalPages = Number($(`.page-pagination-item`).last().text()); // e.g. `12`
    // FIXME: pagination is capped at page 3, listings with more pages are scraped only partially
    const lastPage = Math.min(totalPages, 3);
    const pageRequests = [];
    for (let page = 2; page <= lastPage; page++) {
      // skip first page, that is already handled
      const url = new URL(request.url);
      url.searchParams.set(`page`, page.toString());
      pageRequests.push({
        url: url.toString(),
        userData: {
          label: LABELS.PRODUCTS,
          category: request.userData.category, // pass category name
        },
      });
    }
    // awaited so that enqueue failures surface in this handler instead of as floating rejections
    if (pageRequests.length > 0) await crawler.addRequests(pageRequests);
  }

  const TAX_RATE = 1.21;

  const products = [];
  $(`.product-tile`).each((i, el) => {
    const $tile = $(el);
    const relUrl = $tile.find(`.product-tile__anchor`).attr(`href`); // relative url
    const rawPrices = $tile.find(`.productPrice`).attr(`data-props`);
    if (!relUrl || !rawPrices) {
      // a single malformed tile must not fail the whole page (requests are not retried)
      console.warn(`[PRODUCTS] Skipping tile without link or price data on ${request.url}`);
      return;
    }
    const pid = relUrl.replace(/\D/g, ``); // e.g. `p2421335.html` -> `2421335`
    const url = `${BASE_URL}${relUrl}`;
    const name = $tile.find(`.product-tile__title`).text().trim();
    const prices = JSON.parse(rawPrices);
    const img = $tile.find(`.product-tile__picture img`).attr(`src`);
    // scoped to the tile: a page-wide lookup marked every product as in stock
    // as soon as a single product on the page was available
    const inStock = $tile.find(`.delivery-message--success`).length > 0;
    const currentPrice = prices.price * TAX_RATE;
    products.push({
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice,
      originalPrice: prices.oldPrice ? prices.oldPrice * TAX_RATE : currentPrice,
      currency: `EUR`,
    });
  });
  await save(products);
});
112
/**
 * Actor entry point: reads the input, prepares the optional proxy, initialises
 * the shared utilities and runs a single-session CheerioCrawler whose session
 * is "unblocked" through a real Firefox visit (see `unblock`).
 */
void Actor.main(async () => {
  const input = await Actor.getInput();
  const {
    mode = MODE.FULL,
    proxyConfiguration: inputProxyConfiguration,
    ...rest
  } = input ?? {};

  // TODO: Better pattern to handle both proxy and no proxy
  const proxyConfiguration = inputProxyConfiguration
    ? await Actor.createProxyConfiguration(inputProxyConfiguration)
    : undefined;

  // remaining input keys are handed to init (upper-case keys end up in process.env)
  await init({ actorNameOverride: `bike-24` }, rest);
  const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxConcurrency: 1,
    maxRequestRetries: 0,
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
        // TODO: Investigate why so many Firefox sessions are created
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session, proxyConfiguration);
        return session;
      },
    },
    persistCookiesPerSession: true,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const { headers, cookies } = session.userData;
        // Build a fresh header object per request: assigning the stored object
        // and then adding `Cookie` to it mutated the session's shared headers.
        gotOptions.headers = {
          ...headers, // real-like headers obtained from Firefox
          Cookie: cookies.map((c) => `${c.name}=${c.value}`).join(`; `), // real cookies obtained from Firefox
        };
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for debugging with MITM
      },
    ],
    requestHandler: router,
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
163
/**
 * Opens bike24.com in a real (headless) Firefox, waits a few seconds and
 * stores the request headers Firefox sent plus all resulting cookies in
 * `session.userData`, so that later plain HTTP requests can replay them.
 *
 * @param {{ id: string, userData: object }} session - session to populate;
 *   `userData` is replaced with `{ headers, cookies }`
 * @param {{ newUrl: Function } | undefined} proxyConfiguration - optional proxy;
 *   when given, the browser uses the proxy URL generated for this session id
 * @returns {Promise<void>}
 */
async function unblock(session, proxyConfiguration) {
  const browser = await playwright.firefox.launch({
    headless: true, // NOTE: set to false for debugging
    // TODO: Better pattern to handle both proxy and no proxy
    proxy: proxyConfiguration
      ? { server: await proxyConfiguration.newUrl(session.id) }
      : undefined,
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for debugging with MITM
  });

  // try/finally so the browser process is closed even when navigation fails
  try {
    const browserContext = await browser.newContext({ ignoreHTTPSErrors: true });

    const countryCode = `29`;
    await browserContext.addCookies([
      {
        name: `countryTax`,
        value: `{"shippingCountry":${countryCode},"taxRates":[{"value":21,"name":"Normaler Mehrwertsteuersatz","taxGroup":1},{"value":15,"name":"Lebensmittel mit red. MwSt.","taxGroup":2},{"value":15,"name":"Druckerzeugnisse","taxGroup":3}],"validUntil":"Wednesday, 16-Nov-2022 00:00:00 UTC"}`, // FIXME
        domain: `www.bike24.com`,
        path: `/`,
      },
      {
        name: `deliveryLocation`,
        value: `{"country":${countryCode},"zipCode":null}`,
        domain: `www.bike24.com`,
        path: `/`,
      },
    ]);

    const page = await browserContext.newPage();
    // page.on(`console`, msg => console.log(`⚪️ Playwright log (${msg.type()}) ${msg.text()}`))

    let headersToSet;

    await page.route(`**/*`, (route) => {
      // use the first intercepted request to store the sent headers
      if (!headersToSet) headersToSet = pickHeaders(route.request().headers());

      route.continue();
    });

    await page.goto(`https://www.bike24.com/brands/shimano`);
    // Wait for some time to pass basic Cloudflare Javascript checks
    await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000 ms is enough for r2-bike.com, but not for g2.com
    // Get all cookies and store them for subsequent requests
    const cookies = await page.context().cookies();
    // optional chaining: a missing cookie is reported below instead of throwing a TypeError
    const cfCookie = cookies.find((c) => c.name === `__cf_bm`)?.value;
    console.log(
      `[SESSION] Cloudflare cookie "__cf_bm": ${cfCookie ?? `😱😱😱 not found`}`
    );
    session.userData = { headers: headersToSet, cookies };
  } finally {
    await browser.close();
  }
}
222
/**
 * Returns a copy of `headers` restricted to an allow-list of header names,
 * with keys in allow-list order. Headers that are absent or have a falsy
 * value are left out.
 *
 * @param {Record<string, string>} headers - raw request headers
 * @returns {Record<string, string>} the allowed subset
 */
function pickHeaders(headers) {
  // Keep just the headers that gotScraping can correctly handle (= order).
  // Mainly needed to avoid setting the Host header: when set explicitly it ended up
  // at the end of the headers list, which Cloudflare did not like. When skipped,
  // gotScraping sets it automatically and in the correct order.

  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const allowedInOrder = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,

    // Cookies are handled explicitly by the caller, so they are not listed:
    // `cookie`,
    // `Cookie`,

    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];

  const keptEntries = allowedInOrder
    .filter((name) => headers[name])
    .map((name) => [name, headers[name]]);
  return Object.fromEntries(keptEntries);
}

package.json

1{
2  "name": "bike24-bike24-de-scraper",
3  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify3": "npm:apify@^3.0.2",
11    "crawlee": "*",
12    "playwright": "*",
13    "@crawlee/core": "*",
14    "pg": "*",
15    "pg-connection-string": "*",
16    "dotenv": "*",
17    "find-config": "*",
18    "@elastic/elasticsearch": "*",
19    "filenamify": "*",
20    "@crawlee/memory-storage": "*"
21  },
22  "apify": {
23    "title": "Bike24 (bike24.de) scraper",
24    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
25    "isPublic": true,
26    "isDeprecated": false,
27    "isAnonymouslyRunnable": true,
28    "notice": "",
29    "pictureUrl": "",
30    "seoTitle": "",
31    "seoDescription": "",
32    "categories": [
33      "ECOMMERCE"
34    ]
35  }
36}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "bike24-bike24-de-scraper",
4  "title": "Bike24 (bike24.de) scraper",
5  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "Bike24 (bike24.de) scraper",
11      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "pid",
19              "name",
20              "url",
21              "img",
22              "inStock",
23              "currentPrice",
24              "originalPrice",
25              "currency"
26            ]
27          },
28          "display": {
29            "component": "table",
30            "columns": [
31              {
32                "label": "Pid",
33                "field": "pid",
34                "format": "text"
35              },
36              {
37                "label": "Name",
38                "field": "name",
39                "format": "text"
40              },
41              {
42                "label": "Url",
43                "field": "url",
44                "format": "link"
45              },
46              {
47                "label": "Img",
48                "field": "img",
49                "format": "image"
50              },
51              {
52                "label": "In Stock",
53                "field": "inStock",
54                "format": "boolean"
55              },
56              {
57                "label": "Current Price",
58                "field": "currentPrice",
59                "format": "number"
60              },
61              {
62                "label": "Original Price",
63                "field": "originalPrice",
64                "format": "number"
65              },
66              {
67                "label": "Currency",
68                "field": "currency",
69                "format": "text"
70              }
71            ]
72          }
73        }
74      }
75    }
76  }
77}

.actor/logo.png

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Configuration, Dataset } from 'crawlee'
13import { MemoryStorage } from '@crawlee/memory-storage'
14
// Load the `.env` file whose path is resolved by find-config (dotenv `config`)
config({ path: findConfig(`.env`) })

// Name of the Elasticsearch index whose mapping `init` enforces
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every extended object built by `save`; `init` may add `__GIT_COMMIT`
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(), // timestamp taken when this module is evaluated
}

// Module-level state: assigned by `init`, read by `save`
let actorName
let pgClient
let pgClientNormalized
let elasticClient
/**
 * Verifies that every given table exists in the `public` schema.
 *
 * @param {{ query: Function }} client - connected pg client
 * @param {string} database - database name, used only in the error message
 * @param {string[]} tableNames - table names that must exist
 * @throws {Error} for the first table that is missing
 */
async function assertPublicTablesExist (client, database, tableNames) {
  const { rows } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  const existing = new Set(rows.map((row) => row.table_name))
  for (const tableName of tableNames) {
    if (!existing.has(tableName)) {
      throw new Error(`Table ${tableName} does not exist in database ${database}`)
    }
  }
}

/**
 * Prepares the module-level state used by `save`.
 *
 * - copies upper-case input keys into `process.env` (via `parseEnvFromInput`)
 * - resolves `actorName`
 * - optionally switches crawlee to the in-memory storage client
 * - optionally connects to Elasticsearch and enforces the index mapping
 * - optionally connects to PostgreSQL and checks that the target tables exist
 *
 * @param {{ actorNameOverride?: string }} options - explicit actor name
 * @param {object} restInput - remaining actor input
 * @returns {Promise<void>}
 * @throws {Error} when a configured PostgreSQL table does not exist
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Honour the override on every platform; it used to be applied only on macOS
  // and on the Apify platform, leaving `actorName` undefined elsewhere.
  actorName = actorNameOverride ?? actorName

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    // The commit hash is debugging metadata only: a detached HEAD, packed refs
    // or a missing repository must not prevent the actor from starting.
    try {
      const gitDir = path.join(process.cwd(), `..`, `.git`)
      const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
      const refPrefix = `ref: `
      const gitCommit = head.startsWith(refPrefix)
        ? fs.readFileSync(path.join(gitDir, head.slice(refPrefix.length)), `utf8`)
        : head // detached HEAD: the file contains the commit hash itself
      globalLogsProps.__GIT_COMMIT = gitCommit.substring(0, 7)
    } catch (err) {
      console.warn(`[init] Could not determine git commit: ${err.message}`)
    }
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // const mapping = await elasticClient.indices.getMapping({ index: actorName })

    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // stays non-fatal as before, but is no longer swallowed silently
        console.warn(`[init] Elastic index ${elasticIndexName} mapping could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
    // const pgPool = new pg.Pool(pgConfig)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    await assertPublicTablesExist(pgClient, pgConfig.database, [process.env.PG_DATA_TABLE])

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    await assertPublicTablesExist(pgClientNormalized, pgConfig.database, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])

    // TODO: Handle pgClient closing
  }
}
136
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a unique key for a URL: the SHA-256 hex digest of the URL without
 * its leading scheme (`https://example.com/a` -> hash of `example.com/a`),
 * so the http and https variants of a URL share one key.
 *
 * Only the leading scheme is removed. Splitting on every `://` truncated URLs
 * that carry another URL in their path or query, making different URLs
 * collide, and threw for input without any scheme.
 *
 * @param {string} url
 * @returns {string} 64-character hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
145
/**
 * Waits until the given point in time. Resolves right away when that moment
 * is not in the future.
 *
 * @param {Date} datetime - moment to wait for
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // written as a negated `>` so that a NaN difference also resolves immediately
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
161
// TODO: Uff, nicer! But at least it's tested
/**
 * Extracts the numeric amount from a price string such as `1 234,56 Kč`,
 * `€1,234.56` or `1 299 Kč`.
 *
 * A trailing separator followed by exactly two digits is read as the decimal
 * part; all other separators are treated as grouping and dropped. Without
 * such a suffix the whole string is read as an integer.
 *
 * The integer part is taken from everything before the matched decimal
 * suffix. Splitting on the first occurrence of the separator broke inputs
 * where it also appears earlier, e.g. `Rs. 12.50` was parsed as 0.5.
 *
 * @param {string} string - raw price text
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text has no digits; `currency` is never detected and is always undefined
 */
export function parsePrice (string) {
  const currency = undefined // currency detection is not implemented
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.](\d{2})$/)
  let amount
  if (decimals) {
    const mainAmount = noText.slice(0, decimals.index).replace(/\D/g, ``)
    amount = parseFloat(`${mainAmount}.${decimals[1]}`)
  } else {
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers, 10)
  }
  return { amount, currency }
}
179
/**
 * Converts a value with `Number()`, mapping "no value" to null.
 *
 * @param {*} str - value to convert
 * @returns {number|null} null for undefined, null, the empty string and
 *   anything `Number()` turns into NaN; the converted number otherwise
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  const isMissing = str === undefined || str === null || str === ``
  if (isMissing) return null
  const parsed = Number(str)
  return Number.isNaN(parsed) ? null : parsed
}
189
/**
 * Persists scraped objects to every configured destination:
 * the Apify dataset (when running on the platform, unless
 * APIFY_DONT_STORE_IN_DATASET is `true`), local JSON files (on macOS) and
 * PostgreSQL (when `init` opened a connection).
 *
 * The extended objects are built synchronously and the dataset write is
 * awaited. Previously they were created in an `async` map callback, so the
 * local-file loop iterated over promises (writing `undefined.json` with `{}`)
 * and dataset writes were never awaited.
 *
 * @param {object|object[]} objs - one object or an array of objects
 * @returns {Promise<void>}
 * @throws database errors other than unique-constraint violations
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return console.log(`No data to save.`)

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)
  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if run on Apify
    if (isAtHome) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // The dataset receives the original objects, without the debugging metadata
  if (isAtHome && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const dataDir = path.join(cwd, `${actorName}.storage`, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    const columns = getColumns(objsPg)
    const values = getValues(objsPg)
    const queryString = `
        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
        VALUES (${values})
    `
    try {
      const { rowCount } = await pgClient.query(queryString)
      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
        INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
        VALUES (${getValues(objsPgData)})
        ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryString)
      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }

    const queryStringPrice = `
        INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
        VALUES (${getValues(objsPgDataPrice)})
        ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
      else throw err
    }
  }

  if (elasticClient) {
    // Indexing into Elasticsearch is currently disabled; kept for reference:
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}
315
/**
 * Builds the column list of an INSERT statement from the keys of the first
 * object: each key wrapped in double quotes, joined by `, `.
 *
 * @param {object[]} objs - rows to insert; only the first one is inspected
 * @returns {string} e.g. `"shop", "pid"`
 */
function getColumns (objs) {
  const [firstRow] = objs
  const quotedNames = []
  for (const key of Object.keys(firstRow)) {
    quotedNames.push(`"${key}"`)
  }
  return quotedNames.join(`, `)
}
319
/**
 * Serialises rows into the body of a multi-row `VALUES (...)` clause.
 * The caller supplies the outermost parentheses, so rows are joined by `), (`.
 *
 * - strings are single-quoted with embedded quotes doubled
 * - null / undefined become `NULL`
 * - NaN / ±Infinity become `NULL`; emitted bare (`NaN`) they are parsed by
 *   PostgreSQL as an identifier and the whole INSERT fails
 * - every other value is emitted via its default string conversion
 *
 * NOTE(review): values are still inlined into the SQL text; parameterized
 * queries would be safer but require changing the callers as well.
 *
 * @param {object[]} objs - rows to serialise
 * @returns {string} e.g. `'a', 1), ('b', 2`
 */
function getValues (objs) {
  const toSqlLiteral = (value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB specific null
    if (typeof value === `undefined` || value === null) return `NULL`
    if (typeof value === `number` && !Number.isFinite(value)) return `NULL`
    return value
  }
  return objs
    .map((objPg) => Object.values(objPg).map((value) => toSqlLiteral(value)).join(`, `))
    .join(`), (`)
}
329
/**
 * Copies every input property whose key is all upper-case into `process.env`,
 * so actor input can stand in for environment variables.
 *
 * - `null` / `undefined` values are skipped; assigned to `process.env` they
 *   would become the truthy strings "null" / "undefined" and wrongly enable
 *   features guarded by `if (process.env.X)`
 * - values of keys that look like credentials are masked in the log line,
 *   since they may carry passwords (e.g. connection strings)
 *
 * @param {object} [input] - actor input; a missing input is treated as empty
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const sensitiveKey = /CONNECTION_STRING|PASSWORD|SECRET|TOKEN|API_KEY/
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  const loggable = Object.fromEntries(
    Object.entries(env).map(([key, value]) => [key, sensitiveKey.test(key) ? `***` : value])
  )
  console.log(`[parseEnvFromInput] ${JSON.stringify(loggable)}`)
  Object.assign(process.env, env)
}
338
// Truthy when the process was started with an `--inspect*` flag, or when a
// preloaded module name contains `debug`.
const hasInspectFlag = process.execArgv.join().includes(`--inspect`)
// @ts-ignore
const hasDebugPreload = process?._preload_modules?.join(`|`)?.includes(`debug`)
export const isInspect = hasInspectFlag || hasDebugPreload
Developer
Maintained by Community
Categories