Bike24 (bike24.de) scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Bike24 (bike24.de) scraper
strajk/bike24-bike24-de-scraper
Scrapes products titles, prices, images and availability. Does NOT scrape product details.
Dockerfile
1FROM apify/actor-node-playwright-firefox:16
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install aws-crt \
7 && npm install --only=prod --no-optional
8
9COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Bike24 (bike24.de) scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "mode": {
8 "title": "Mode",
9 "description": "",
10 "type": "string",
11 "editor": "select",
12 "default": "TEST",
13 "prefill": "TEST",
14 "enum": [
15 "TEST",
16 "FULL"
17 ],
18 "enumTitles": [
19 "TEST mode (scrapes only few categories)",
20 "FULL"
21 ]
22 },
23 "proxyConfiguration": {
24 "title": "Proxy configuration",
25 "description": "Select proxies to be used by your actor.",
26 "type": "object",
27 "editor": "proxy",
28 "default": {
29 "useApifyProxy": true,
30 "apifyProxyGroups": [
31 "RESIDENTIAL"
32 ]
33 },
34 "prefill": {
35 "useApifyProxy": true,
36 "apifyProxyGroups": [
37 "RESIDENTIAL"
38 ]
39 }
40 },
41 "debug": {
42 "title": "Debug",
43 "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
44 "type": "boolean",
45 "editor": "checkbox",
46 "default": false
47 },
48 "APIFY_USE_MEMORY_REQUEST_QUEUE": {
49 "sectionCaption": "Advanced",
50 "sectionDescription": "Advanced options, use only if you know what you're doing.",
51 "title": "Use in-memory request queue instead of the native one",
52 "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
53 "type": "boolean",
54 "default": false,
55 "editor": "checkbox"
56 },
57 "APIFY_DONT_STORE_IN_DATASET": {
58 "title": "Don't store in dataset",
59 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
60 "type": "boolean",
61 "default": false,
62 "editor": "checkbox"
63 },
64 "PG_CONNECTION_STRING_NORMALIZED": {
65 "title": "Postgres connection string for normalized data",
66 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
67 "type": "string",
68 "editor": "textfield"
69 },
70 "PG_DATA_TABLE": {
71 "title": "Postgres table name for product data",
72 "description": "Table name for storing product name, url, image, ...",
73 "type": "string",
74 "editor": "textfield"
75 },
76 "PG_DATA_PRICE_TABLE": {
77 "title": "Postgres table name for price data",
78 "description": "Table name for storing price, original price, stock status, ...",
79 "type": "string",
80 "editor": "textfield"
81 }
82 },
83 "required": [
84 "mode",
85 "proxyConfiguration"
86 ]
87}
apify.json
1{
2 "name": "bike24-bike24-de-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 4096
10 }
11}
main.js
1import { URL } from "node:url";
2import { Actor } from "apify3";
3import {
4 CheerioCrawler,
5 createCheerioRouter,
6 utils as crawleeUtils,
7} from "crawlee";
8import playwright from "playwright";
9import { Session } from "@crawlee/core";
10import { init, save } from "./_utils/common.js";
11
12const LABELS = {
13 INDEX: `INDEX`,
14 PRODUCTS: `PRODUCTS`,
15};
16
17var MODE;
18
19(function (MODE) {
20 MODE["TEST"] = "TEST";
21 MODE["FULL"] = "FULL";
22})(MODE || (MODE = {}));
23
24const BASE_URL = `https://www.bike24.com`;
25
26async function enqueueInitial(mode, crawler) {
27 if (mode === MODE.FULL) {
28 await crawler.addRequests([
29 {
30 userData: { label: LABELS.INDEX },
31 url: `https://www.bike24.com/brands`,
32 },
33 ]);
34 } else if (mode === MODE.TEST) {
35 await crawler.addRequests([
36 {
37 userData: { label: LABELS.PRODUCTS },
38 url: `https://www.bike24.com/brands/100percent`,
39 },
40 ]);
41 }
42}
43
const router = createCheerioRouter();

/**
 * INDEX handler: reads the brand sitemap and enqueues one PRODUCTS request per
 * brand link, carrying the brand name along as `category`.
 *
 * All requests are collected first and enqueued with a single awaited call so
 * that enqueue failures surface as a handler error instead of being dropped.
 */
router.addHandler(LABELS.INDEX, async ({ crawler, $ }) => {
  const brandRequests = $(`.list-brands-sitemap__section-item a`)
    .toArray()
    .flatMap((el) => {
      const href = $(el).attr(`href`); // urls are relative
      if (!href) return []; // anchor without a target, nothing to crawl
      return [
        {
          userData: {
            label: LABELS.PRODUCTS,
            category: $(el).text().trim(), // there's extra space at the beginning and end
          },
          url: new URL(href, BASE_URL).toString(),
        },
      ];
    });

  if (brandRequests.length > 0) await crawler.addRequests(brandRequests);
});
59
/**
 * PRODUCTS handler: on the first page of a listing it enqueues the following
 * pages, then extracts every product tile on the current page and saves them.
 *
 * Tiles that lack a link or price payload are skipped with a warning rather
 * than failing the whole page (requests are not retried).
 */
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request }) => {
  // FIXME: pagination is currently capped; raise or remove the cap to crawl whole listings.
  const MAX_PAGES = 3;
  // Prices in the tile payload are multiplied by this rate before saving.
  const TAX_RATE = 1.21;

  if (!request.url.includes(`page=`)) {
    // on first page
    const totalPages = Number($(`.page-pagination-item`).last().text()); // e.g. `12`
    const lastPage = Math.min(totalPages, MAX_PAGES); // NaN when there is no pagination -> loop is skipped
    const pageRequests = [];
    // skip first page, that is already handled
    for (let page = 2; page <= lastPage; page++) {
      const pageUrl = new URL(request.url);
      pageUrl.searchParams.set(`page`, String(page));
      pageRequests.push({
        url: pageUrl.toString(),
        userData: {
          label: LABELS.PRODUCTS,
          category: request.userData.category, // pass category name
        },
      });
    }
    if (pageRequests.length > 0) await crawler.addRequests(pageRequests);
  }

  const products = [];
  $(`.product-tile`).each((i, el) => {
    const $tile = $(el);
    const relUrl = $tile.find(`.product-tile__anchor`).attr(`href`); // relative url
    const rawPrices = $tile.find(`.productPrice`).attr(`data-props`);
    if (!relUrl || !rawPrices) {
      console.warn(
        `[PRODUCTS] Skipping tile #${i} on ${request.url}: missing link or price data`
      );
      return; // continue with the next tile
    }

    const prices = JSON.parse(rawPrices);
    const currentPrice = prices.price * TAX_RATE;
    products.push({
      pid: relUrl.replace(/\D/g, ``), // e.g. `p2421335.html` -> `2421335`
      name: $tile.find(`.product-tile__title`).text().trim(),
      url: `${BASE_URL}${relUrl}`,
      img: $tile.find(`.product-tile__picture img`).attr(`src`),
      // Scoped to the tile: a page-wide lookup would give every product the same value.
      inStock: $tile.find(`.delivery-message--success`).length > 0,
      currentPrice,
      originalPrice: prices.oldPrice ? prices.oldPrice * TAX_RATE : currentPrice,
      currency: `EUR`,
    });
  });
  await save(products);
});
112
/**
 * Actor entry point: reads the input, prepares the optional proxy, initialises
 * the shared utilities and runs a single-session Cheerio crawl whose requests
 * reuse the headers and cookies captured by a Firefox "unblock" visit.
 */
void Actor.main(async () => {
  const input = await Actor.getInput();
  const {
    mode = MODE.FULL,
    proxyConfiguration: inputProxyConfiguration,
    ...rest
  } = input ?? {};

  // TODO: Better pattern to handle both proxy and no proxy
  const proxyConfiguration = inputProxyConfiguration
    ? await Actor.createProxyConfiguration(inputProxyConfiguration)
    : undefined;

  await init({ actorNameOverride: `bike-24` }, rest);
  const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxConcurrency: 1,
    maxRequestRetries: 0,
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
        // TODO: Investigate why so many Firefox sessions are created
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session, proxyConfiguration);
        return session;
      },
    },
    persistCookiesPerSession: true,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const { headers, cookies } = session.userData;
        // Build a fresh object per request: assigning the stored headers by
        // reference and then adding `Cookie` would mutate the session's state.
        gotOptions.headers = {
          ...headers, // real-like headers obtained from Firefox
          Cookie: cookies.map((c) => `${c.name}=${c.value}`).join(`; `), // real cookies obtained from Firefox
        };
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for debugging with MITM
      },
    ],
    requestHandler: router,
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
163
/**
 * Opens the shop in headless Firefox to pass the Cloudflare JavaScript check,
 * then stores the request headers Firefox sent and the cookies it collected on
 * `session.userData` (`{ headers, cookies }`) for reuse by plain HTTP requests.
 *
 * The browser is always closed, even when navigation fails.
 *
 * @param {object} session - Crawlee session; its `id` selects the proxy URL and its `userData` is overwritten.
 * @param {object} [proxyConfiguration] - Optional proxy configuration exposing `newUrl(sessionId)`.
 * @returns {Promise<void>}
 */
async function unblock(session, proxyConfiguration) {
  const browser = await playwright.firefox.launch({
    headless: true, // NOTE: set to false for debugging
    // TODO: Better pattern to handle both proxy and no proxy
    proxy: proxyConfiguration
      ? { server: await proxyConfiguration.newUrl(session.id) }
      : undefined,
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for debugging with MITM
  });

  try {
    const browserContext = await browser.newContext({
      ignoreHTTPSErrors: true,
    });

    const countryCode = `29`;
    await browserContext.addCookies([
      {
        name: `countryTax`,
        value: `{"shippingCountry":${countryCode},"taxRates":[{"value":21,"name":"Normaler Mehrwertsteuersatz","taxGroup":1},{"value":15,"name":"Lebensmittel mit red. MwSt.","taxGroup":2},{"value":15,"name":"Druckerzeugnisse","taxGroup":3}],"validUntil":"Wednesday, 16-Nov-2022 00:00:00 UTC"}`, // FIXME
        domain: `www.bike24.com`,
        path: `/`,
      },
      {
        name: `deliveryLocation`,
        value: `{"country":${countryCode},"zipCode":null}`,
        domain: `www.bike24.com`,
        path: `/`,
      },
    ]);

    const page = await browserContext.newPage();
    // page.on(`console`, msg => console.log(`⚪️ Playwright log (${msg.type()}) ${msg.text()}`))

    let headersToSet;

    await page.route(`**/*`, (route) => {
      // use the first main request to store the sent headers
      if (!headersToSet) headersToSet = pickHeaders(route.request().headers());
      return route.continue();
    });

    await page.goto(`https://www.bike24.com/brands/shimano`);
    // Wait for some time to pass basic Cloudflare Javascript checks
    await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000ms is enough for r2-bike.com, but not for g2.com
    // Get all cookies and store them for subsequent requests
    const cookies = await browserContext.cookies();
    // The cookie may be missing; report that instead of throwing.
    const cfCookie = cookies.find((c) => c.name === `__cf_bm`)?.value;
    console.log(
      `[SESSION] Cloudflare cookie "__cf_bm": ${cfCookie ?? `😱😱😱 not found`}`
    );
    session.userData = { headers: headersToSet, cookies };
  } finally {
    await browser.close();
  }
}
222
/**
 * Returns a copy of `headers` restricted to an allow-list, with keys inserted
 * in the allow-list's order. Headers that are absent or have a falsy value are
 * left out.
 *
 * @param {Record<string, string>} headers - Headers captured from the browser request.
 * @returns {Record<string, string>} The allowed headers, in allow-list order.
 */
function pickHeaders(headers) {
  // Only headers that gotScraping can emit in the right order are kept.
  // The main point is to leave out Host: when set explicitly it ended up last
  // in the header list, which Cloudflare did not like. Left unset, gotScraping
  // adds it itself in the correct position.

  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const allowedInOrder = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,

    // Cookies are handled explicitly by the caller, so `cookie` / `Cookie` are not listed.

    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];

  const picked = {};
  for (const name of allowedInOrder) {
    const value = headers[name];
    if (value) picked[name] = value;
  }
  return picked;
}
package.json
1{
2 "name": "bike24-bike24-de-scraper",
3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify3": "npm:apify@^3.0.2",
11 "crawlee": "*",
12 "playwright": "*",
13 "@crawlee/core": "*",
14 "pg": "*",
15 "pg-connection-string": "*",
16 "dotenv": "*",
17 "find-config": "*",
18 "@elastic/elasticsearch": "*",
19 "filenamify": "*",
20 "@crawlee/memory-storage": "*"
21 },
22 "apify": {
23 "title": "Bike24 (bike24.de) scraper",
24 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
25 "isPublic": true,
26 "isDeprecated": false,
27 "isAnonymouslyRunnable": true,
28 "notice": "",
29 "pictureUrl": "",
30 "seoTitle": "",
31 "seoDescription": "",
32 "categories": [
33 "ECOMMERCE"
34 ]
35 }
36}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "bike24-bike24-de-scraper",
4 "title": "Bike24 (bike24.de) scraper",
5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Bike24 (bike24.de) scraper",
11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "pid",
19 "name",
20 "url",
21 "img",
22 "inStock",
23 "currentPrice",
24 "originalPrice",
25 "currency"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "columns": [
31 {
32 "label": "Pid",
33 "field": "pid",
34 "format": "text"
35 },
36 {
37 "label": "Name",
38 "field": "name",
39 "format": "text"
40 },
41 {
42 "label": "Url",
43 "field": "url",
44 "format": "link"
45 },
46 {
47 "label": "Img",
48 "field": "img",
49 "format": "image"
50 },
51 {
52 "label": "In Stock",
53 "field": "inStock",
54 "format": "boolean"
55 },
56 {
57 "label": "Current Price",
58 "field": "currentPrice",
59 "format": "number"
60 },
61 {
62 "label": "Original Price",
63 "field": "originalPrice",
64 "format": "number"
65 },
66 {
67 "label": "Currency",
68 "field": "currency",
69 "format": "text"
70 }
71 ]
72 }
73 }
74 }
75 }
76 }
77}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Configuration, Dataset } from 'crawlee'
13import { MemoryStorage } from '@crawlee/memory-storage'
14
// Load variables from the nearest `.env` file into process.env.
config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every extended record built by `save`.
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

// Module state, populated by `init` and read by `save`.
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Reads the short hash of the commit the current branch points to, from the
 * `.git` directory one level above the working directory.
 * Throws when the repository files cannot be read or HEAD is not a branch ref.
 */
function readGitCommitShort () {
  const gitDir = path.join(process.cwd(), `..`, `.git`)
  const gitBranch = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`)
    .split(` `)[1]
    .trim()
    .replace(`refs/heads/`, ``)
  const gitCommit = fs.readFileSync(path.join(gitDir, `refs/heads/${gitBranch}`), `utf8`)
  return gitCommit.substring(0, 7)
}

/** Throws unless every name in `tableNames` is a table in the `public` schema. */
async function assertPublicTablesExist (client, database, tableNames) {
  const { rows } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  const existing = new Set(rows.map((row) => row.table_name))
  for (const tableName of tableNames) {
    if (!existing.has(tableName)) {
      throw new Error(`Table ${tableName} does not exist in database ${database}`)
    }
  }
}

/**
 * Prepares the shared state used by `save`: environment overrides, actor name,
 * optional in-memory storage, and optional Elastic / Postgres clients.
 *
 * @param {{ actorNameOverride?: string }} options - `actorNameOverride` takes precedence over any derived actor name.
 * @param {object} [restInput] - Remaining actor input; upper-case keys are copied into process.env.
 * @returns {Promise<void>}
 * @throws {Error} When a configured Postgres table does not exist.
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Honour the override on every platform, not only on macOS or on Apify.
  actorName = actorNameOverride

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    try {
      globalLogsProps.__GIT_COMMIT = readGitCommitShort()
    } catch (err) {
      // Git metadata is a nice-to-have; a detached HEAD or missing repo must not abort the run.
      console.warn(`[init] Could not read git commit: ${err.message}`)
    }
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // Creates the index when missing and (re)applies the numeric field mapping.
    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Still best-effort, but no longer silent.
        console.warn(`[init] Elastic index ${elasticIndexName} mapping could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()
    await assertPublicTablesExist(pgClient, pgConfig.database, [process.env.PG_DATA_TABLE])

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()
    await assertPublicTablesExist(pgClientNormalized, pgConfig.database, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])

    // TODO: Handle pgClient closing
  }
}
136
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a unique key for a URL: the SHA-256 hex digest of the URL with its
 * leading `scheme://` removed, so `http` and `https` variants share a key.
 *
 * Only the leading scheme is stripped; a `://` appearing later (e.g. inside a
 * query parameter) is kept, and a URL without a scheme is hashed as-is.
 *
 * @param {string} url
 * @returns {string} 64-character hex digest.
 */
export const createUniqueKeyFromUrl = (url) => {
  const withoutProtocol = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``)
  return createHash(`sha256`).update(withoutProtocol).digest(`hex`)
}
145
/**
 * Waits until the given point in time. Resolves right away when that moment
 * is not in the future.
 *
 * @param {Date} datetime - Moment to wait for.
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // Written as a negated "> 0" so a non-numeric difference also returns immediately.
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
161
// TODO: Uff, nicer! But at least it's tested
/**
 * Extracts a numeric amount from a price string.
 *
 * Everything except digits, commas and dots is dropped. When the remainder
 * ends in a separator followed by exactly two digits, those are the decimals
 * and the digits before the first occurrence of that separator are the whole
 * part. Otherwise all digits are read as one integer.
 *
 * @param {string} string - Raw price text.
 * @returns {{ amount: number, currency: undefined }} `currency` is never filled in.
 */
export function parsePrice (string) {
  const numeric = string.replace(/[^\d,.]/g, ``)
  const fraction = /([,.])(\d{2})$/.exec(numeric)

  let amount
  if (fraction === null) {
    amount = Number.parseInt(numeric.replace(/[,.]/g, ``), 10)
  } else {
    const [, separator, cents] = fraction
    const [wholePart] = numeric.split(separator)
    amount = Number.parseFloat(`${wholePart.replace(/\D/g, ``)}.${cents}`)
  }

  return { amount, currency: undefined }
}
179
/**
 * Converts a value with `Number()`, returning `null` for `undefined`, `null`,
 * the empty string, and anything that converts to `NaN`.
 *
 * @param {*} str - Value to convert.
 * @returns {number|null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str == null || str === ``) return null
  const parsed = Number(str)
  return Number.isNaN(parsed) ? null : parsed
}
189
/**
 * Inserts `rows` into `public."<tableName>"` with a single multi-row INSERT.
 * A unique-constraint violation is logged as a warning; other errors propagate.
 */
async function insertRows (client, tableName, rows, logSuffix, ignoreConflicts) {
  const queryString = `
    INSERT INTO public."${tableName}" (${getColumns(rows)})
    VALUES (${getValues(rows)})
    ${ignoreConflicts ? `ON CONFLICT DO NOTHING` : ``}
  `
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`[save] saved to database${logSuffix}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every configured target:
 *  - the default Apify dataset (on the Apify platform, unless APIFY_DONT_STORE_IN_DATASET is `true`),
 *  - JSON files under `<actorName>.storage/data` (on macOS),
 *  - the Postgres tables prepared by `init`.
 *
 * @param {object|object[]} objs - One record or an array of records.
 * @returns {Promise<void>}
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) {
    console.log(`No data to save.`)
    return
  }

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)

  // Built synchronously so the local-file branch below receives real objects, not pending promises.
  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
    }
    // if run on Apify
    if (isAtHome) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // Note: dotenv is not casting vars, so they are strings
  if (isAtHome && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) {
    // The dataset receives the plain records (without the extra metadata); awaited so failures surface.
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const dataDir = path.join(cwd, `${actorName}.storage`, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const dataFilePath = path.join(dataDir, `${filenamify(id)}.json`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg, ``, false)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, ` (data)`, true)
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, ` (price)`, true)
  }

  if (elasticClient) {
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}
315
/**
 * Builds the column list of an INSERT from the keys of the first row:
 * each key wrapped in double quotes, joined with `, `.
 *
 * @param {object[]} objs - Rows to insert; only the first one is inspected.
 * @returns {string}
 */
function getColumns (objs) {
  const quotedNames = []
  for (const columnName of Object.keys(objs[0])) {
    quotedNames.push(`"${columnName}"`)
  }
  return quotedNames.join(`, `)
}
319
/**
 * Builds the VALUES payload of a multi-row INSERT. The caller supplies the
 * outermost parentheses, so rows are joined with `), (`.
 *
 * Each value becomes a SQL literal:
 *  - strings are single-quoted with embedded quotes doubled,
 *  - `null` / `undefined` and non-finite numbers (`NaN`, `±Infinity`) become `NULL`,
 *  - `Date`s become quoted ISO strings (invalid dates become `NULL`),
 *  - other objects/arrays become quoted JSON,
 *  - everything else (finite numbers, booleans) is emitted as-is.
 *
 * @param {object[]} objs - Rows to insert.
 * @returns {string}
 */
function getValues (objs) {
  // escape strings to prevent SQL injection
  const quote = (text) => `'${text.replace(/'/g, `''`)}'`

  const toSqlLiteral = (value) => {
    // convert to DB specific null
    if (value === undefined || value === null) return `NULL`
    if (typeof value === `string`) return quote(value)
    // A bare NaN/Infinity token is not valid SQL and would fail the whole batch.
    if (typeof value === `number`) return Number.isFinite(value) ? value : `NULL`
    if (value instanceof Date) {
      return Number.isNaN(value.getTime()) ? `NULL` : quote(value.toISOString())
    }
    if (typeof value === `object`) return quote(JSON.stringify(value))
    return value
  }

  return objs
    .map((row) => Object.values(row).map(toSqlLiteral).join(`, `))
    .join(`), (`)
}
329
/**
 * Copies entries of the actor input whose key is fully upper-case into
 * `process.env`, as strings.
 *
 * Entries with a `null` / `undefined` value are skipped: assigning them would
 * store the truthy strings "null" / "undefined" and switch on features that
 * only check whether the variable is set.
 * Only the key names are logged, because values may hold secrets such as
 * connection strings.
 *
 * @param {object} [input] - Actor input (may be undefined).
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = String(value)
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
338
/**
 * Truthy when the process was started with an `--inspect*` flag, or when the
 * list of preloaded modules mentions `debug`.
 */
export const isInspect = (() => {
  const execArgs = process.execArgv.join()
  if (execArgs.includes(`--inspect`)) return true
  // @ts-ignore
  return process?._preload_modules?.join(`|`)?.includes(`debug`)
})()
Developer
Maintained by Community
Categories