
r2-bike (r2-bike.com) scraper
Deprecated
Pricing
Pay per usage
Go to Store


r2-bike (r2-bike.com) scraper
Deprecated
Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).
0.0 (0)
Pricing
Pay per usage
1
Total users
2
Monthly users
1
Last modified
3 years ago
Dockerfile
# Base image (per its name: Apify actor image with Node 16, Playwright and Firefox).
FROM apify/actor-node-playwright-firefox:16

# Copy the manifest on its own so the dependency layer can be cached
# independently of source changes.
COPY package.json ./

# One RUN layer; each `\` must be the last character on its line to act as a
# line continuation.
RUN npm --quiet set progress=false \
 && npm install aws-crt \
 && npm install --only=prod --no-optional

# Copy the rest of the actor sources.
COPY . ./
INPUT_SCHEMA.json
{ "title": "r2-bike (r2-bike.com) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "object", "schemaVersion": 1, "properties": { "mode": { "title": "Mode", "description": "", "type": "string", "editor": "select", "default": "TEST", "prefill": "TEST", "enum": [ "TEST", "FULL" ], "enumTitles": [ "TEST", "FULL" ] }, "APIFY_USE_MEMORY_REQUEST_QUEUE": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Use in-memory request queue instead of the native one", "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.", "type": "boolean", "default": false, "editor": "checkbox" }, "APIFY_DONT_STORE_IN_DATASET": { "title": "Don't store in dataset", "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", "type": "boolean", "default": false, "editor": "checkbox" }, "PG_CONNECTION_STRING_NORMALIZED": { "title": "Postgres connection string for normalized data", "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", "type": "string", "editor": "textfield" }, "PG_DATA_TABLE": { "title": "Postgres table name for product data", "description": "Table name for storing product name, url, image, ...", "type": "string", "editor": "textfield" }, "PG_DATA_PRICE_TABLE": { "title": "Postgres table name for price data", "description": "Table name for storing price, original price, stock status, ...", "type": "string", "editor": "textfield" } }, "required": [ "mode" ]}
apify.json
{ "name": "r2-bike-r2-bike-com-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 4096 }}
main.js
1import { Actor } from "apify3";2import {3 CheerioCrawler,4 createCheerioRouter,5 utils as crawleeUtils,6} from "crawlee";7import { Session } from "@crawlee/core";8import playwright from "playwright";9import { init, parsePrice, save, toNumberOrNull } from "./_utils/common.js";10
/** Request labels; each label selects the router handler for a URL. */
const LABELS = Object.freeze({
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
});

/**
 * Run modes accepted in the actor input (`mode`).
 * Plain frozen object instead of the `var` + IIFE enum pattern; the keys and
 * values are unchanged (`MODE.TEST === "TEST"`, `MODE.FULL === "FULL"`).
 */
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});
/**
 * Seeds the crawler with its single start request.
 * FULL starts at the brand index, TEST at one brand listing; any other mode
 * enqueues nothing.
 *
 * @param {string} mode - one of the `MODE` values
 * @param {object} crawler - crawler exposing `addRequests`
 * @returns {Promise<void>}
 */
async function enqueueInitial(mode, crawler) {
  let startRequest;
  switch (mode) {
    case MODE.FULL:
      startRequest = {
        userData: { label: LABELS.INDEX },
        url: `https://r2-bike.com/en/brands`,
      };
      break;
    case MODE.TEST:
      startRequest = {
        userData: { label: LABELS.PRODUCTS },
        url: `https://r2-bike.com/en/shimano`,
      };
      break;
    default:
      return;
  }
  await crawler.addRequests([startRequest]);
}
const router = createCheerioRouter();

// INDEX handler: enqueue every brand link found on the brand index page as a
// PRODUCTS request.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  const brandLinks = {
    // hrefs look like `en/shimano`
    selector: `.vendor-index-group-wrapper li a`,
    // Explicit base is needed to get correct absolute URLs; without it the
    // result was `https://r2-bike.com/en/en/shimano` (reason unknown).
    baseUrl: `https://r2-bike.com/`,
    userData: { label: LABELS.PRODUCTS },
  };
  await enqueueLinks(brandLinks);
});
// PRODUCTS handler: on the first page of a listing, enqueue the remaining
// pages; then extract every product tile of the current page and save the batch.
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
  log.info(`[PRODUCTS] ${request.url}`);

  // Follow-up pages carry a `_s<N>` suffix; only the first page enqueues them.
  const isFirstPage = !/_s(\d+)$/.test(request.url);
  if (isFirstPage) {
    const paginationText = $(`.list-pageinfo .page-current`).text().trim(); // eg. `Page 1 of 11`
    const match = paginationText.match(/(\d+) of (\d+)/);
    if (!match) {
      log.error(`[PRODUCTS] Failed to parse pagination text: ${paginationText}`);
    }
    // NaN when the text could not be parsed, which skips the enqueueing below.
    const totalPages = Number(match?.[2]);
    if (totalPages > 1) {
      log.info(`[PRODUCTS] Found ${totalPages} pages, enqueuing`);
      const pageRequests = [];
      // Start at 2: the first page is the one being handled right now.
      for (let pageNumber = 2; pageNumber <= totalPages; pageNumber++) {
        pageRequests.push({
          url: `${request.url}_s${pageNumber}`, // eg. https://r2-bike.com/en/shimano_s2
          userData: { label: LABELS.PRODUCTS },
        });
      }
      // Single awaited batch instead of one un-awaited call per page, so a
      // failure surfaces here rather than as an unhandled rejection.
      await crawler.addRequests(pageRequests);
    }
  }

  const products = [];
  // `itemprop` filter avoids the last fake tile, which is actually the "Next page" link.
  const $products = $(
    `#product-list .product-wrapper[itemprop="itemListElement"]`
  );
  log.info(`[PRODUCTS] ${request.url} - found ${$products.length} products`);

  // Synchronous callback: nothing inside is awaited.
  $products.each((i, el) => {
    const pid = $(`.product-cell`, el)
      .attr(`id`) // result-wrapper_buy_form_106016
      ?.replace(`result-wrapper_buy_form_`, ``); // 106016
    if (!pid) {
      log.error(
        `[PRODUCTS] Failed to parse pid from ${i + 1}th product on ${request.url}`
      );
      return; // skip this tile, continue with the next one
    }

    const url = $(`meta[itemprop="url"]`, el).attr(`content`);
    const img = $(`meta[itemprop="image"]`, el).attr(`content`);
    const name = $(`h4[itemprop="name"]`, el).text().trim();

    const priceRaw = $(`.price_wrapper .price`, el).text().trim(); // e.g. 1,98 €*
    const price = parsePrice(priceRaw).amount;

    const priceOrigRaw = $(`.price-uvp`, el).text().trim(); // e.g. MSRP: 5,95 €
    const priceOrig = parsePrice(priceOrigRaw).amount;

    // NOTE(review): substring check; would also match texts such as
    // "not available" if the site ever renders them — confirm against the site.
    const inStock = $(`.delivery-status`, el).text().includes(`available`);

    products.push({
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice: toNumberOrNull(price),
      originalPrice: toNumberOrNull(priceOrig),
      currency: `EUR`,
    });
  });

  await save(products);
});
// Entry point: read input, initialise shared storage clients, then run a
// single-session Cheerio crawl that reuses headers/cookies captured by Firefox.
void Actor.main(async () => {
  const input = await Actor.getInput();
  // NOTE(review): the code default is FULL while INPUT_SCHEMA.json declares
  // TEST as default — confirm which one is intended.
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `r2-bike-com` }, rest);
  const crawler = new CheerioCrawler({
    requestHandler: router,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const { headers, cookies } = session.userData;
        // real cookies obtained from Firefox
        const cookieHeader = cookies
          .map((c) => `${c.name}=${c.value}`)
          .join(`; `);
        // Build a fresh object (real-like headers obtained from Firefox + Cookie)
        // instead of assigning into the object stored on the session.
        gotOptions.headers = { ...headers, Cookie: cookieHeader };
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for local debugging
      },
    ],
    maxConcurrency: 1, // not brave enough for concurrency
    maxRequestRetries: 0, // failed requests are not retried
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session);
        return session;
      },
    },
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});
/**
 * Opens a listing page in Firefox (via Playwright) with preset cookies, then
 * stores the headers the browser sent and the cookies it ended up with on the
 * session, so later plain HTTP requests can reuse them.
 *
 * The browser is always closed before returning; previously it was left
 * running after every call.
 *
 * @param {Session} session - its `userData` is replaced with `{ headers, cookies }`
 * @returns {Promise<void>}
 */
async function unblock(session) {
  const browser = await playwright.firefox.launch({
    // headless: false, // NOTE: uncomment for debugging
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for local debugging
  });

  try {
    const browserContext = await browser.newContext({
      ignoreHTTPSErrors: true,
    });
    await browserContext.addCookies([
      {
        name: `eu_cookie_store`,
        value: `{"b209404849c0357500f7a82a6899961a":true,"3940b498c8a17157f69d757a80ff3421":true,"1d3c65b2b03ef35e14df6b163ea3a1f6":false,"0a3fbfc21a86a28c8961999929c374f3":true,"9b88c95a15e018c3f8038a7d0160145c":true,"dd31d974a78cdd704acaa6bf15da506c":true,"d86cf69a8b82547a94ca3f6a307cf9a6":false,"d323dff6f7de41c0b9af4c35e21dc032":false,"b83d1ac867f35569c614e298f645fffe":true,"21affb15e1316adac24b26db8e421a9d":false,"2d1fc55f933c039b2e04ff9034134b4d":true,"4d60ab2c6d11d753267484006c23e54c":false,"970cfba66b8380fb97b742e4571356c6":false}`,
        domain: `r2-bike.com`,
        path: `/`,
      },
      {
        name: `r2_user_delivery_country`,
        value: `CZ`, // TODO: make it configurable
        domain: `r2-bike.com`,
        path: `/`,
      },
      {
        name: `r2_user_delivery_country_ip_backup`,
        value: `CZ`, // TODO: make it configurable
        domain: `r2-bike.com`,
        path: `/`,
      },
      {
        name: `r2_user_delivery_country_tax_1`,
        value: `21`, // TODO: make it configurable
        domain: `r2-bike.com`,
        path: `/`,
      },
      {
        name: `r2_user_delivery_country_tax_2`,
        value: `10`, // TODO: make it configurable
        domain: `r2-bike.com`,
        path: `/`,
      },
      {
        name: `ledgerCurrency`,
        value: `EUR`,
        domain: `r2-bike.com`,
        path: `/`,
      },
    ]);

    const page = await browserContext.newPage();

    let headersToSet;

    await page.route(`**/*`, (route) => {
      // use the first main request to store the sent headers
      if (!headersToSet) headersToSet = pickHeaders(route.request().headers());
      return route.continue();
    });

    // Go to product listing page which sets 95 products per page to current session
    // h=58 is "Shimano", Sortierung=1 is "Sort by name" – both are not important, but some values need to be set
    await page.goto(`https://r2-bike.com/navi.php?h=58&Sortierung=1&af=95`);
    // Wait for some time to pass basic Cloudflare Javascript checks
    await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000 ms is enough for r2-bike.com, but not for g2.com
    // Get all cookies and store them for subsequent requests
    const cookies = await page.context().cookies();
    session.userData = { headers: headersToSet, cookies };
  } finally {
    // Release the Firefox process even when navigation or cookie capture fails.
    await browser.close();
  }
}
/**
 * Returns a copy of `headers` restricted to an allow-list, with keys inserted
 * in allow-list order. Headers with falsy values are left out.
 *
 * Only headers that gotScraping can handle correctly (= order) are picked.
 * This seems to be needed mainly to avoid setting the Host header: when set,
 * it ended up last in the header list, which Cloudflare did not like. When it
 * is skipped, gotScraping sets it automatically and in the correct order.
 *
 * @param {Object<string, string>} headers
 * @returns {Object<string, string>}
 */
function pickHeaders(headers) {
  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const allowedInOrder = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,
    `cookie`,
    `Cookie`,
    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];

  const picked = {};
  for (const name of allowedInOrder) {
    const value = headers[name];
    if (value) picked[name] = value;
  }
  return picked;
}
package.json
{ "name": "r2-bike-r2-bike-com-scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify3": "npm:apify@^3.0.2", "crawlee": "*", "@crawlee/core": "*", "playwright": "*", "pg": "*", "pg-connection-string": "*", "dotenv": "*", "find-config": "*", "@elastic/elasticsearch": "*", "filenamify": "*", "@crawlee/memory-storage": "*" }, "apify": { "title": "r2-bike (r2-bike.com) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "r2-bike-r2-bike-com-scraper", "title": "r2-bike (r2-bike.com) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "r2-bike (r2-bike.com) scraper", "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "pid", "name", "url", "img", "inStock", "currentPrice", "originalPrice", "currency" ] }, "display": { "component": "table", "columns": [ { "label": "Pid", "field": "pid", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Img", "field": "img", "format": "image" }, { "label": "In Stock", "field": "inStock", "format": "boolean" }, { "label": "Current Price", "field": "currentPrice", "format": "number" }, { "label": "Original Price", "field": "originalPrice", "format": "number" }, { "label": "Currency", "field": "currency", "format": "text" } ] } } } } }}
.actor/logo.png
_utils/common.js
1import { createHash } from 'crypto'2import os from "os"3import path from "path"4// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals5import fs from "fs"6import pg from "pg"7import pgConnectionString from 'pg-connection-string'8import { config } from 'dotenv'9import findConfig from "find-config"10import { Client as ElasticClient } from "@elastic/elasticsearch"11import filenamify from 'filenamify'12import { Configuration, Dataset } from 'crawlee'13import { MemoryStorage } from '@crawlee/memory-storage'14
// Load environment variables from the `.env` file located by find-config.
const dotenvFilePath = findConfig(`.env`)
config({ path: dotenvFilePath })

// Elasticsearch index whose field mapping is enforced in `init`.
const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every record extended by `save`.
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}
// Module-level state populated by `init` and read by `save`.
let actorName
let pgClient
let pgClientNormalized
let elasticClient

// Returns the names of all tables in the `public` schema of the connected database.
async function listPublicTableNames (client) {
  const { rows } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  return rows.map((row) => row.table_name)
}

/**
 * Prepares the shared state used by `save`:
 * copies upper-case input keys into `process.env`, resolves the actor name,
 * optionally switches to the in-memory storage client, and connects to
 * Elasticsearch / PostgreSQL when the corresponding env variables are set.
 *
 * @param {{ actorNameOverride?: string }} options
 * @param {object} restInput - remaining actor input, passed to `parseEnvFromInput`
 * @returns {Promise<void>}
 * @throws {Error} when a configured PostgreSQL table does not exist
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  // Local (macOS) run: derive the actor name from the script and record the git commit
  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // Creates the index when missing and (re)applies the float mapping of price fields.
    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Any other failure used to be swallowed without a trace. The run still
        // continues (best effort), but the problem is now visible in the log.
        console.warn(`Elastic index ${elasticIndexName}: could not enforce mapping: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    // Check that the target table exists
    const tableNames = await listPublicTableNames(pgClient)
    if (!tableNames.includes(process.env.PG_DATA_TABLE)) {
      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    }

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    // Check that both target tables exist
    const tableNames = await listPublicTableNames(pgClientNormalized)
    if (!tableNames.includes(process.env.PG_DATA_TABLE)) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    if (!tableNames.includes(process.env.PG_DATA_PRICE_TABLE)) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)

    // TODO: Handle pgClient closing
  }
}
// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a SHA-256 hex key from a URL with its leading `scheme://` removed, so
 * the `http` and `https` variants of one address share a key.
 *
 * Only the leading scheme is stripped. Splitting on every `://` cut the URL
 * off at a second occurrence (e.g. inside a query parameter), making distinct
 * URLs collide, and threw for URLs without a scheme.
 *
 * @param {string} url
 * @returns {string} 64-character hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``) // Remove protocol
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
/**
 * Resolves once the given moment has been reached; resolves immediately when
 * it is not in the future.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const remainingMs = datetime - new Date()
  // `!(x > 0)` rather than `x <= 0` so a NaN difference also returns right away
  if (!(remainingMs > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, remainingMs)
  })
}
/**
 * Extracts a numeric amount from a price text such as `1,98 €*` or `MSRP: 5,95 €`.
 *
 * Everything except digits, `,` and `.` is dropped. A trailing separator
 * followed by exactly two digits is read as the decimal part; every other
 * separator is treated as grouping and ignored.
 *
 * @param {string} string - raw price text
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text contains no digits; `currency` is never detected and stays undefined
 */
export function parsePrice (string) {
  let amount, currency
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  if (decimals) {
    // Take everything before the matched decimal separator. Splitting on the
    // separator character took the first segment instead, which was wrong
    // whenever that character also appeared earlier (e.g. `ca. 1.99` -> `.1.99`).
    const mainAmount = noText.slice(0, decimals.index).replace(/\D/g, ``)
    amount = parseFloat(`${mainAmount}.${decimals[2]}`)
  } else {
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers, 10)
  }
  return { amount, currency }
}
/**
 * Converts a value with `Number()`; yields `null` for `undefined`, `null`,
 * the empty string and anything that converts to NaN.
 *
 * @param {*} str
 * @returns {number|null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  const isMissing = str === undefined || str === null || str === ``
  if (isMissing) return null
  const converted = Number(str)
  return Number.isNaN(converted) ? null : converted
}
// Runs one multi-row INSERT of `rows` into `public."<tableName>"`.
// A unique-constraint violation is downgraded to a warning; any other error is rethrown.
async function insertRows (client, tableName, rows, { conflictClause = ``, logLabel }) {
  const queryString = `
    INSERT INTO public."${tableName}" (${getColumns(rows)})
    VALUES (${getValues(rows)})
    ${conflictClause}
  `
  try {
    const { rowCount } = await client.query(queryString)
    console.log(`[save] ${logLabel}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every storage configured by `init`:
 * the Apify dataset (when running on Apify and not disabled), JSON files on a
 * local macOS machine, and the PostgreSQL table(s).
 *
 * @param {object|object[]} objs - one record or an array of records
 * @returns {Promise<void>}
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) {
    console.log(`No data to save.`)
    return
  }

  const runsOnApify = Boolean(process.env.APIFY_IS_AT_HOME)

  // Built synchronously: an `async` mapper here produced an array of pending
  // promises, so the local branch below wrote `{}` into `undefined.json`.
  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
    }
    if (runsOnApify) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // The dataset receives the plain records (not the extended ones), pushed as
  // one awaited batch so a failure is reported to the caller.
  if (runsOnApify && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg, {
      logLabel: `saved to database`,
    })
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, {
      conflictClause: `ON CONFLICT DO NOTHING`,
      logLabel: `saved to database (data)`,
    })
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, {
      conflictClause: `ON CONFLICT DO NOTHING`,
      logLabel: `saved to database (price)`,
    })
  }

  if (elasticClient) {
    // TODO: Indexing into Elasticsearch is not implemented; nothing is written here.
    // Notes kept from the earlier draft:
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
  }
}
// Builds the column list of an INSERT from the keys of the first row:
// every key double-quoted, joined by `, `.
function getColumns (objs) {
  const [firstRow] = objs
  const quotedColumns = []
  for (const key of Object.keys(firstRow)) {
    quotedColumns.push(`"${key}"`)
  }
  return quotedColumns.join(`, `)
}
// Builds the inside of a multi-row `VALUES ( ... )` clause: the values of each
// row joined by `, `, and rows joined by `), (`. The caller supplies the
// outermost parentheses.
function getValues (objs) {
  const toSqlLiteral = (value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB specific null
    if (value === undefined || value === null) return `NULL`
    return value
  }

  const tuples = []
  for (const objPg of objs) {
    const literals = Object.values(objPg).map((value) => toSqlLiteral(value))
    tuples.push(literals.join(`, `))
  }
  return tuples.join(`), (`)
}
/**
 * Copies every upper-case key of the actor input into `process.env`
 * (e.g. `PG_DATA_TABLE`), leaving mixed/lower-case keys such as `mode` alone.
 *
 * - `null` / `undefined` values are skipped; assigning them would store the
 *   truthy strings "null" / "undefined" in `process.env`.
 * - Only the key names are logged, because values may contain secrets such as
 *   connection strings.
 *
 * @param {object} [input]
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
// Truthy when Node was started with an `--inspect*` exec argument, or when one
// of the preloaded module paths contains `debug`.
export const isInspect =
  process.execArgv.some((arg) => arg.includes(`--inspect`)) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)