r2-bike (r2-bike.com) scraper avatar
r2-bike (r2-bike.com) scraper

Deprecated

Pricing

Pay per usage

Go to Store
r2-bike (r2-bike.com) scraper

r2-bike (r2-bike.com) scraper

Deprecated

Developed by

Pavel Dolecek

Pavel Dolecek

Maintained by Community

Scrapes product titles, prices, images, and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).

0.0 (0)

Pricing

Pay per usage

1

Total users

2

Monthly users

1

Last modified

3 years ago

Dockerfile

# Base image published by Apify; going by its name and tag it bundles Node 16 with Playwright + Firefox.
FROM apify/actor-node-playwright-firefox:16
# Copy only the package manifest before installing dependencies.
COPY package.json ./
# Disable npm's progress bar, install aws-crt explicitly, then install
# production dependencies only, skipping optional packages.
RUN npm --quiet set progress=false \
&& npm install aws-crt \
&& npm install --only=prod --no-optional
# Copy the rest of the actor sources into the image.
COPY . ./

INPUT_SCHEMA.json

{
"title": "r2-bike (r2-bike.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"type": "object",
"schemaVersion": 1,
"properties": {
"mode": {
"title": "Mode",
"description": "",
"type": "string",
"editor": "select",
"default": "TEST",
"prefill": "TEST",
"enum": [
"TEST",
"FULL"
],
"enumTitles": [
"TEST",
"FULL"
]
},
"APIFY_USE_MEMORY_REQUEST_QUEUE": {
"sectionCaption": "Advanced",
"sectionDescription": "Advanced options, use only if you know what you're doing.",
"title": "Use in-memory request queue instead of the native one",
"description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
"type": "boolean",
"default": false,
"editor": "checkbox"
},
"APIFY_DONT_STORE_IN_DATASET": {
"title": "Don't store in dataset",
"description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
"type": "boolean",
"default": false,
"editor": "checkbox"
},
"PG_CONNECTION_STRING_NORMALIZED": {
"title": "Postgres connection string for normalized data",
"description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
"type": "string",
"editor": "textfield"
},
"PG_DATA_TABLE": {
"title": "Postgres table name for product data",
"description": "Table name for storing product name, url, image, ...",
"type": "string",
"editor": "textfield"
},
"PG_DATA_PRICE_TABLE": {
"title": "Postgres table name for price data",
"description": "Table name for storing price, original price, stock status, ...",
"type": "string",
"editor": "textfield"
}
},
"required": [
"mode"
]
}

apify.json

{
"name": "r2-bike-r2-bike-com-scraper",
"version": "0.1",
"buildTag": "latest",
"env": null,
"defaultRunOptions": {
"build": "latest",
"timeoutSecs": 3600,
"memoryMbytes": 4096
}
}

main.js

1import { Actor } from "apify3";
2import {
3 CheerioCrawler,
4 createCheerioRouter,
5 utils as crawleeUtils,
6} from "crawlee";
7import { Session } from "@crawlee/core";
8import playwright from "playwright";
9import { init, parsePrice, save, toNumberOrNull } from "./_utils/common.js";
10
/** Route labels used to dispatch each request to its router handler. */
const LABELS = {
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
};
15
/**
 * Supported run modes (values of the `mode` actor input).
 *
 * - `TEST`: crawl a single brand listing.
 * - `FULL`: crawl every brand found on the brand index page.
 */
const MODE = Object.freeze({
  TEST: "TEST",
  FULL: "FULL",
});
22
/**
 * Seeds the crawler with the start request for the selected run mode.
 *
 * FULL starts at the brand index page, TEST starts directly at one brand
 * listing. Any other mode enqueues nothing.
 *
 * @param {string} mode One of the `MODE` values.
 * @param {object} crawler Crawler exposing `addRequests()`.
 * @returns {Promise<void>}
 */
async function enqueueInitial(mode, crawler) {
  let startRequest;
  switch (mode) {
    case MODE.FULL:
      startRequest = {
        userData: { label: LABELS.INDEX },
        url: `https://r2-bike.com/en/brands`,
      };
      break;
    case MODE.TEST:
      startRequest = {
        userData: { label: LABELS.PRODUCTS },
        url: `https://r2-bike.com/en/shimano`,
      };
      break;
    default:
      return;
  }
  await crawler.addRequests([startRequest]);
}
40
const router = createCheerioRouter();

// Brand index page: enqueue every brand link as a product listing.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  const brandLinkOptions = {
    // Brand links are relative, e.g. `en/shimano`.
    selector: `.vendor-index-group-wrapper li a`,
    // Resolve against the site root; without this the links resolved to
    // `https://r2-bike.com/en/en/shimano` (reason unknown).
    baseUrl: `https://r2-bike.com/`,
    userData: { label: LABELS.PRODUCTS },
  };
  await enqueueLinks(brandLinkOptions);
});
50
// Product listing page: on the first page enqueue the remaining pages, then
// extract every product tile and persist the batch via `save()`.
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
  log.info(`[PRODUCTS] ${request.url}`);

  // Paginated URLs end with `_s<N>`; only the first page fans out to the rest.
  const isFirstPage = !request.url.match(/_s(\d+)$/);
  if (isFirstPage) {
    const paginationText = $(`.list-pageinfo .page-current`).text().trim(); // eg. `Page 1 of 11`
    const match = paginationText.match(/(\d+) of (\d+)/);
    if (!match) {
      log.error(
        `[PRODUCTS] Failed to parse pagination text: ${paginationText}`
      );
    } else {
      const totalPages = Number(match[2]);
      if (totalPages > 1) {
        log.info(`[PRODUCTS] Found ${totalPages} pages, enqueuing`);
        const pageRequests = [];
        // Start at 2: the first page is the one being handled right now.
        for (let page = 2; page <= totalPages; page++) {
          pageRequests.push({
            url: `${request.url}_s${page}`, // eg. https://r2-bike.com/en/shimano_s2
            userData: { label: LABELS.PRODUCTS },
          });
        }
        // Awaited (and batched) so a failure to enqueue fails this request
        // instead of surfacing as an unhandled promise rejection.
        await crawler.addRequests(pageRequests);
      }
    }
  }

  const products = [];
  // `itemprop` filter avoids the last fake tile, which is the "Next page" link.
  const $products = $(
    `#product-list .product-wrapper[itemprop="itemListElement"]`
  );
  log.info(`[PRODUCTS] ${request.url} - found ${$products.length} products`);
  $products.each((i, el) => {
    const pid = $(`.product-cell`, el)
      .attr(`id`) // result-wrapper_buy_form_106016
      ?.replace(`result-wrapper_buy_form_`, ``); // 106016
    if (!pid) {
      log.error(
        `[PRODUCTS] Failed to parse pid from ${i + 1}th product on ${
          request.url
        }`
      );
      return; // skip this tile, keep iterating
    }

    const url = $(`meta[itemprop="url"]`, el).attr(`content`);
    const img = $(`meta[itemprop="image"]`, el).attr(`content`);
    const name = $(`h4[itemprop="name"]`, el).text().trim();

    const priceRaw = $(`.price_wrapper .price`, el).text().trim(); // e.g. 1,98 €*
    const price = parsePrice(priceRaw).amount;

    const priceOrigRaw = $(`.price-uvp`, el).text().trim(); // e.g. MSRP: 5,95 €
    const priceOrig = parsePrice(priceOrigRaw).amount;

    const inStock = $(`.delivery-status`, el).text().includes(`available`);

    products.push({
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice: toNumberOrNull(price),
      originalPrice: toNumberOrNull(priceOrig),
      currency: `EUR`,
    });
  });
  await save(products);
});
118
// Actor entry point: read input, initialise shared storage/clients, then crawl
// with a single Firefox-unblocked session.
void Actor.main(async () => {
  const input = await Actor.getInput();
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `r2-bike-com` }, rest);
  const crawler = new CheerioCrawler({
    requestHandler: router,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        // Headers and cookies were captured from a real Firefox run in `unblock()`.
        const { headers, cookies = [] } = session.userData;
        const cookieHeader = cookies
          .map((c) => `${c.name}=${c.value}`)
          .join(`; `);
        // Build a fresh object so the headers stored on the session are not
        // mutated on every request.
        gotOptions.headers = { ...headers, Cookie: cookieHeader };
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for local debugging
      },
    ],
    maxConcurrency: 1, // not brave enough for concurrency
    maxRequestRetries: 0, // no retries
    sessionPoolOptions: {
      maxPoolSize: 1, // single session, see maxConcurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session);
        return session;
      },
    },
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
156
/**
 * Opens the shop in a real Firefox (Playwright) and stores the request headers
 * and resulting cookies on the session, so later plain HTTP requests can
 * reuse them.
 *
 * The browser is always closed afterwards, even when navigation fails.
 *
 * @param {Session} session Session whose `userData` receives `{ headers, cookies }`.
 * @returns {Promise<void>}
 */
async function unblock(session) {
  const browser = await playwright.firefox.launch({
    // headless: false, // NOTE: uncomment for debugging
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for local debugging
  });
  try {
    const browserContext = await browser.newContext({
      ignoreHTTPSErrors: true,
    });

    // Cookies preset before the first navigation.
    const presetCookies = [
      [
        `eu_cookie_store`,
        `{"b209404849c0357500f7a82a6899961a":true,"3940b498c8a17157f69d757a80ff3421":true,"1d3c65b2b03ef35e14df6b163ea3a1f6":false,"0a3fbfc21a86a28c8961999929c374f3":true,"9b88c95a15e018c3f8038a7d0160145c":true,"dd31d974a78cdd704acaa6bf15da506c":true,"d86cf69a8b82547a94ca3f6a307cf9a6":false,"d323dff6f7de41c0b9af4c35e21dc032":false,"b83d1ac867f35569c614e298f645fffe":true,"21affb15e1316adac24b26db8e421a9d":false,"2d1fc55f933c039b2e04ff9034134b4d":true,"4d60ab2c6d11d753267484006c23e54c":false,"970cfba66b8380fb97b742e4571356c6":false}`,
      ],
      [`r2_user_delivery_country`, `CZ`], // TODO: make it configurable
      [`r2_user_delivery_country_ip_backup`, `CZ`], // TODO: make it configurable
      [`r2_user_delivery_country_tax_1`, `21`], // TODO: make it configurable
      [`r2_user_delivery_country_tax_2`, `10`], // TODO: make it configurable
      [`ledgerCurrency`, `EUR`],
    ].map(([name, value]) => ({
      name,
      value,
      domain: `r2-bike.com`,
      path: `/`,
    }));
    await browserContext.addCookies(presetCookies);

    const page = await browserContext.newPage();

    let headersToSet;

    await page.route(`**/*`, (route) => {
      // Use the first intercepted request to capture the headers Firefox sends.
      if (!headersToSet) headersToSet = pickHeaders(route.request().headers());

      // Requests still in flight when the browser is closed reject here;
      // ignore that so it cannot surface as an unhandled rejection.
      route.continue().catch(() => {});
    });

    // Go to product listing page which sets 95 products per page to current session
    await page.goto(`https://r2-bike.com/navi.php?h=58&Sortierung=1&af=95`); // h=58 is "Shimano", Sortierung=1 is "Sort by name" – both are not important, but some values need to be set
    // Wait for some time to pass basic Cloudflare Javascript checks
    await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000 ms is enough for r2-bike.com, but not for g2.com
    // Get all cookies and store them for subsequent requests
    const cookies = await page.context().cookies();
    session.userData = { headers: headersToSet, cookies };
  } finally {
    // Without this every new session would leave a Firefox process running.
    await browser.close();
  }
}
228
/**
 * Returns a copy of `headers` restricted to an allow-list, with keys inserted
 * in the allow-list's order.
 *
 * The allow-list deliberately omits `Host`: when set explicitly it ended up
 * last in the header list, which Cloudflare did not like. Leaving it out lets
 * gotScraping add it itself, in the correct position.
 *
 * @param {Record<string, string>} headers Headers as sent by the browser.
 * @returns {Record<string, string>} Allow-listed headers with truthy values only.
 */
function pickHeaders(headers) {
  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const orderedNames = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,
    `cookie`,
    `Cookie`,
    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];

  const picked = {};
  for (const name of orderedNames) {
    const value = headers[name];
    if (value) picked[name] = value;
  }
  return picked;
}

package.json

{
"name": "r2-bike-r2-bike-com-scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify3": "npm:apify@^3.0.2",
"crawlee": "*",
"@crawlee/core": "*",
"playwright": "*",
"pg": "*",
"pg-connection-string": "*",
"dotenv": "*",
"find-config": "*",
"@elastic/elasticsearch": "*",
"filenamify": "*",
"@crawlee/memory-storage": "*"
},
"apify": {
"title": "r2-bike (r2-bike.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"isPublic": true,
"isDeprecated": false,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "r2-bike-r2-bike-com-scraper",
"title": "r2-bike (r2-bike.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "r2-bike (r2-bike.com) scraper",
"description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"pid",
"name",
"url",
"img",
"inStock",
"currentPrice",
"originalPrice",
"currency"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Pid",
"field": "pid",
"format": "text"
},
{
"label": "Name",
"field": "name",
"format": "text"
},
{
"label": "Url",
"field": "url",
"format": "link"
},
{
"label": "Img",
"field": "img",
"format": "image"
},
{
"label": "In Stock",
"field": "inStock",
"format": "boolean"
},
{
"label": "Current Price",
"field": "currentPrice",
"format": "number"
},
{
"label": "Original Price",
"field": "originalPrice",
"format": "number"
},
{
"label": "Currency",
"field": "currency",
"format": "text"
}
]
}
}
}
}
}
}

.actor/logo.png

_utils/common.js

1import { createHash } from 'crypto'
2import os from "os"
3import path from "path"
4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
5import fs from "fs"
6import pg from "pg"
7import pgConnectionString from 'pg-connection-string'
8import { config } from 'dotenv'
9import findConfig from "find-config"
10import { Client as ElasticClient } from "@elastic/elasticsearch"
11import filenamify from 'filenamify'
12import { Configuration, Dataset } from 'crawlee'
13import { MemoryStorage } from '@crawlee/memory-storage'
14
// Load environment variables from the `.env` file located by find-config.
const dotenvPath = findConfig(`.env`)
config({ path: dotenvPath })

// Elasticsearch index used by init().
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every record extended by save().
const nodeStartedAt = new Date().toISOString()
const globalLogsProps = {
  __NODE_STARTED: nodeStartedAt,
}

// Module-level state populated by init() and read by save().
let actorName
let pgClient
let pgClientNormalized
let elasticClient
/**
 * Throws unless every listed table exists in the `public` schema.
 *
 * @param {object} client Connected pg client.
 * @param {string} databaseName Database name, used only in the error message.
 * @param {Array<string|undefined>} tableNames Table names to check, in order.
 * @returns {Promise<void>}
 * @throws {Error} For the first table that is missing.
 */
async function assertPgTablesExist (client, databaseName, tableNames) {
  const { rows: tables } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  // eslint-disable-next-line camelcase
  const existingTables = new Set(tables.map(({ table_name }) => table_name))
  for (const tableName of tableNames) {
    if (!existingTables.has(tableName)) {
      throw new Error(`Table ${tableName} does not exist in database ${databaseName}`)
    }
  }
}

/**
 * Prepares module-level state used by save(): the actor name, optional
 * in-memory storage, and optional Elasticsearch / PostgreSQL clients.
 *
 * Which integrations are enabled is driven purely by environment variables;
 * `restInput` is first copied into `process.env` by parseEnvFromInput().
 *
 * @param {{ actorNameOverride?: string }} options Optional explicit actor name.
 * @param {object} [restInput] Remaining actor input forwarded to parseEnvFromInput().
 * @returns {Promise<void>}
 * @throws {Error} When a configured PostgreSQL table does not exist.
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Local development on macOS: derive the actor name from the script file and
  // record the short git commit of the parent directory's repository.
  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    globalLogsProps.__GIT_COMMIT = gitCommit.substring(0, 7)
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // Creates the index when missing and forces the price fields to `float`.
    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Still best-effort (init continues), but no longer silent.
        console.warn(`Elastic index ${elasticIndexName} mapping could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    await assertPgTablesExist(pgClient, pgConfig.database, [process.env.PG_DATA_TABLE])

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    await assertPgTablesExist(pgClientNormalized, pgConfig.database, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])

    // TODO: Handle pgClient closing
  }
}
136
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a protocol-independent unique key for a URL: the SHA-256 hex digest
 * of the URL with its leading `scheme://` removed.
 *
 * Only the leading scheme is stripped, so URLs that contain another `://`
 * later on (e.g. in a query parameter) are hashed in full. A URL without a
 * scheme is hashed as-is.
 *
 * @param {string} url Absolute URL (the scheme is optional).
 * @returns {string} 64-character hex digest.
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
145
/**
 * Waits until the given point in time; resolves immediately when that time is
 * not in the future.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // Negated `>` (rather than `<=`) so a NaN difference also resolves at once.
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
161
// TODO: Uff, nicer! But at least it's tested
/**
 * Extracts a numeric amount from a price string such as `1,98 €*` or
 * `MSRP: 5,95 €`.
 *
 * A trailing `,dd` or `.dd` is treated as the decimal part; every other
 * separator is dropped. Without such a two-digit suffix the digits are read as
 * one integer.
 *
 * @param {string} string Raw price text.
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text holds no digits; `currency` is never populated.
 */
export function parsePrice (string) {
  const numericText = string.replace(/[^\d,.]/g, ``)
  const trailingDecimals = /([,.])(\d{2})$/.exec(numericText)

  let amount
  if (trailingDecimals === null) {
    amount = Number.parseInt(numericText.replace(/[,.]/g, ``), 10)
  } else {
    const [, separator, fraction] = trailingDecimals
    const [wholePart] = numericText.split(separator)
    amount = parseFloat(`${wholePart.replace(/\D/g, ``)}.${fraction}`)
  }

  return { amount, currency: undefined }
}
179
/**
 * Converts a value with `Number()`, mapping "no value" to `null`.
 *
 * @param {*} str Value to convert.
 * @returns {number|null} `null` for `undefined`, `null`, the empty string, and
 *   anything `Number()` turns into NaN; otherwise the converted number.
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str == null || str === ``) return null
  const converted = Number(str)
  return Number.isNaN(converted) ? null : converted
}
189
/**
 * Runs one INSERT statement and logs the affected row count.
 * Unique-constraint violations are downgraded to a warning; anything else is rethrown.
 *
 * @param {object} client Connected pg client.
 * @param {string} queryString Complete SQL statement.
 * @param {string} logSuffix Text appended to "saved to database" in the log line.
 * @returns {Promise<void>}
 */
async function runPgInsert (client, queryString, logSuffix) {
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`[save] saved to database${logSuffix}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every configured destination: the Apify dataset
 * (on the platform), JSON files (local macOS runs), and PostgreSQL (raw and/or
 * normalized tables).
 *
 * @param {object|object[]} objs One record or an array of records.
 * @returns {Promise<void>} Resolves once every configured write has finished.
 * @throws Rethrows PostgreSQL errors other than unique-constraint violations.
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) {
    console.log(`No data to save.`)
    return
  }

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)

  // Built synchronously so the local-file branch below sees real objects
  // (previously this was an array of un-awaited promises).
  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
    }
    // if run on Apify
    if (isAtHome) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // The dataset receives the plain records, pushed as one awaited batch so the
  // write completes (or fails) before save() returns.
  if (isAtHome && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const dataDir = path.join(cwd, `${actorName}.storage`, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid)
      const dataFilePath = path.join(dataDir, `${filenamify(id)}.json`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    // NOTE(review): SQL is assembled as a string; values are escaped by getValues().
    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPg)})
      VALUES (${getValues(objsPg)})
    `
    await runPgInsert(pgClient, queryString, ``)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
      VALUES (${getValues(objsPgData)})
      ON CONFLICT DO NOTHING
    `
    await runPgInsert(pgClientNormalized, queryString, ` (data)`)

    const queryStringPrice = `
      INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
      VALUES (${getValues(objsPgDataPrice)})
      ON CONFLICT DO NOTHING
    `
    await runPgInsert(pgClientNormalized, queryStringPrice, ` (price)`)
  }

  // TODO: Elasticsearch indexing is not implemented; `elasticClient` is
  // created by init() but nothing is written to it here.
}
315
/**
 * Builds the column list of an INSERT statement from the keys of the first
 * record, each wrapped in double quotes.
 *
 * @param {object[]} objs Non-empty array of records.
 * @returns {string} e.g. `"pid", "name"`.
 */
function getColumns (objs) {
  const [firstRow] = objs
  const quotedNames = []
  for (const key of Object.keys(firstRow)) {
    quotedNames.push(`"${key}"`)
  }
  return quotedNames.join(`, `)
}
319
/**
 * Renders records as the inside of a multi-row `VALUES ( … )` clause: values
 * within a row are joined by `, ` and rows by `), (`.
 *
 * @param {object[]} objs Records whose values are rendered in key order.
 * @returns {string} e.g. `'a', 1), ('b', 2`.
 */
function getValues (objs) {
  const toSqlLiteral = (value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB specific null
    if (value === undefined || value === null) return `NULL`
    return value
  }

  const renderedRows = []
  for (const objPg of objs) {
    renderedRows.push(Object.values(objPg).map((value) => toSqlLiteral(value)).join(`, `))
  }
  return renderedRows.join(`), (`)
}
329
/**
 * Copies every ALL-UPPERCASE key of the actor input into `process.env`.
 *
 * Values are stored as strings. Keys whose value is `null` or `undefined` are
 * skipped, so an unset option does not become the truthy string "null" /
 * "undefined". Only key names are logged, because values may contain
 * credentials such as connection strings.
 *
 * @param {object} [input] Actor input; may be undefined.
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = String(value)
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
338
// Whether any Node exec argument contains `--inspect`.
const hasInspectExecArg = process.execArgv.some((arg) => arg.includes(`--inspect`))

/**
 * Truthy when the process was started with an `--inspect*` flag or when a
 * preloaded module name contains `debug`.
 */
export const isInspect =
  hasInspectExecArg ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)