r2-bike (r2-bike.com) scraper

No credit card required

This Actor is under maintenance.

This Actor is under maintenance and may be unreliable.

r2-bike (r2-bike.com) scraper

r2-bike (r2-bike.com) scraper

strajk/r2-bike-r2-bike-com-scraper

No credit card required

Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).

Dockerfile

1FROM apify/actor-node-playwright-firefox:16 2 3COPY package.json ./ 4 5RUN npm --quiet set progress=false \ 6 && npm install aws-crt \ 7 && npm install --only=prod --no-optional 8 9COPY . ./

INPUT_SCHEMA.json

1{ 2 "title": "r2-bike (r2-bike.com) scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 4 "type": "object", 5 "schemaVersion": 1, 6 "properties": { 7 "mode": { 8 "title": "Mode", 9 "description": "", 10 "type": "string", 11 "editor": "select", 12 "default": "TEST", 13 "prefill": "TEST", 14 "enum": [ 15 "TEST", 16 "FULL" 17 ], 18 "enumTitles": [ 19 "TEST", 20 "FULL" 21 ] 22 }, 23 "APIFY_USE_MEMORY_REQUEST_QUEUE": { 24 "sectionCaption": "Advanced", 25 "sectionDescription": "Advanced options, use only if you know what you're doing.", 26 "title": "Use in-memory request queue instead of the native one", 27 "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.", 28 "type": "boolean", 29 "default": false, 30 "editor": "checkbox" 31 }, 32 "APIFY_DONT_STORE_IN_DATASET": { 33 "title": "Don't store in dataset", 34 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", 35 "type": "boolean", 36 "default": false, 37 "editor": "checkbox" 38 }, 39 "PG_CONNECTION_STRING_NORMALIZED": { 40 "title": "Postgres connection string for normalized data", 41 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", 42 "type": "string", 43 "editor": "textfield" 44 }, 45 "PG_DATA_TABLE": { 46 "title": "Postgres table name for product data", 47 "description": "Table name for storing product name, url, image, ...", 48 "type": "string", 49 "editor": "textfield" 50 }, 51 "PG_DATA_PRICE_TABLE": { 52 "title": "Postgres table name for price data", 53 "description": "Table name for storing price, original price, stock status, ...", 54 "type": "string", 55 "editor": "textfield" 56 } 57 }, 58 "required": [ 59 "mode" 60 ] 61}

apify.json

1{ 2 "name": "r2-bike-r2-bike-com-scraper", 3 "version": "0.1", 4 "buildTag": "latest", 5 "env": null, 6 "defaultRunOptions": { 7 "build": "latest", 8 "timeoutSecs": 3600, 9 "memoryMbytes": 4096 10 } 11}

main.js

import { Actor } from "apify3";
import {
  CheerioCrawler,
  createCheerioRouter,
  utils as crawleeUtils,
} from "crawlee";
import { Session } from "@crawlee/core";
import playwright from "playwright";
import { init, parsePrice, save, toNumberOrNull } from "./_utils/common.js";

/** Request labels used to dispatch requests to the matching router handler. */
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

/** Supported run modes: TEST crawls a single brand, FULL starts from the brand index. */
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});

/** Domain and path shared by every cookie pre-set in the unblocking browser. */
const R2_COOKIE_SCOPE = { domain: `r2-bike.com`, path: `/` };

/** Cookies pre-set in the unblocking browser (cookie consent, delivery country, currency). */
const R2_COOKIES = {
  eu_cookie_store: `{"b209404849c0357500f7a82a6899961a":true,"3940b498c8a17157f69d757a80ff3421":true,"1d3c65b2b03ef35e14df6b163ea3a1f6":false,"0a3fbfc21a86a28c8961999929c374f3":true,"9b88c95a15e018c3f8038a7d0160145c":true,"dd31d974a78cdd704acaa6bf15da506c":true,"d86cf69a8b82547a94ca3f6a307cf9a6":false,"d323dff6f7de41c0b9af4c35e21dc032":false,"b83d1ac867f35569c614e298f645fffe":true,"21affb15e1316adac24b26db8e421a9d":false,"2d1fc55f933c039b2e04ff9034134b4d":true,"4d60ab2c6d11d753267484006c23e54c":false,"970cfba66b8380fb97b742e4571356c6":false}`,
  r2_user_delivery_country: `CZ`, // TODO: make it configurable
  r2_user_delivery_country_ip_backup: `CZ`, // TODO: make it configurable
  r2_user_delivery_country_tax_1: `21`, // TODO: make it configurable
  r2_user_delivery_country_tax_2: `10`, // TODO: make it configurable
  ledgerCurrency: `EUR`,
};

/**
 * Adds the start request(s) for the given mode to the crawler.
 *
 * @param {string} mode - One of the MODE values.
 * @param {CheerioCrawler} crawler - Crawler that receives the start requests.
 * @throws {Error} When the mode is not one of the MODE values; previously the
 *   crawler would silently run with an empty queue.
 */
async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `https://r2-bike.com/en/brands`,
      },
    ]);
    return;
  }
  if (mode === MODE.TEST) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `https://r2-bike.com/en/shimano`,
      },
    ]);
    return;
  }
  throw new Error(
    `Unknown mode "${mode}", expected one of: ${Object.values(MODE).join(`, `)}`
  );
}

const router = createCheerioRouter();

// Brand index: enqueue the product listing of every brand.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  await enqueueLinks({
    selector: `.vendor-index-group-wrapper li a`, // e.g. `en/shimano`
    baseUrl: `https://r2-bike.com/`, // needed for correctly absolute URLs, otherwise it would be `https://r2-bike.com/en/en/shimano`, not sure why ¯\_(ツ)_/¯
    userData: { label: LABELS.PRODUCTS },
  });
});

// Product listing: enqueue the remaining pages (from the first page only) and save all product tiles.
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
  log.info(`[PRODUCTS] ${request.url}`);

  const isFirstPage = !/_s\d+$/.test(request.url);
  if (isFirstPage) {
    const paginationText = $(`.list-pageinfo .page-current`).text().trim(); // eg. `Page 1 of 11`
    const match = paginationText.match(/(\d+) of (\d+)/);
    if (!match) {
      log.error(
        `[PRODUCTS] Failed to parse pagination text: ${paginationText}`
      );
    } else {
      const totalPages = Number(match[2]);
      if (totalPages > 1) {
        log.info(`[PRODUCTS] Found ${totalPages} pages, enqueuing`);
        // Pages 2..totalPages; the first page is the one being handled right now.
        const pageRequests = Array.from(
          { length: totalPages - 1 },
          (_, index) => ({
            url: `${request.url}_s${index + 2}`, // eg. https://r2-bike.com/en/shimano_s2
            userData: { label: LABELS.PRODUCTS },
          })
        );
        // Awaited as a single batch so a failure to enqueue fails this request
        // instead of becoming an unhandled rejection.
        await crawler.addRequests(pageRequests);
      }
    }
  }

  const $products = $(
    `#product-list .product-wrapper[itemprop="itemListElement"]`
  ); // itemprop to avoid selecting last fake tile, which is actually "Next page" link
  log.info(`[PRODUCTS] ${request.url} - found ${$products.length} products`);

  const products = [];
  for (const [index, el] of $products.toArray().entries()) {
    const pid = $(`.product-cell`, el)
      .attr(`id`) // result-wrapper_buy_form_106016
      ?.replace(`result-wrapper_buy_form_`, ``); // 106016
    if (!pid) {
      log.error(
        `[PRODUCTS] Failed to parse pid from ${index + 1}th product on ${
          request.url
        }`
      );
      continue;
    }

    const priceRaw = $(`.price_wrapper .price`, el).text().trim(); // e.g. 1,98 €*
    const priceOrigRaw = $(`.price-uvp`, el).text().trim(); // e.g. MSRP: 5,95 €

    products.push({
      pid,
      name: $(`h4[itemprop="name"]`, el).text().trim(),
      url: $(`meta[itemprop="url"]`, el).attr(`content`),
      img: $(`meta[itemprop="image"]`, el).attr(`content`),
      inStock: $(`.delivery-status`, el).text().includes(`available`),
      // parsePrice yields NaN for missing prices, which toNumberOrNull maps to null
      currentPrice: toNumberOrNull(parsePrice(priceRaw).amount),
      originalPrice: toNumberOrNull(parsePrice(priceOrigRaw).amount),
      currency: `EUR`,
    });
  }
  await save(products);
});

void Actor.main(async () => {
  const input = await Actor.getInput();
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `r2-bike-com` }, rest);
  const crawler = new CheerioCrawler({
    requestHandler: router,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const { headers, cookies } = session.userData;
        // Copy instead of mutating the headers object stored on the session.
        gotOptions.headers = {
          ...headers, // real-like headers obtained from Firefox
          Cookie: cookies.map(({ name, value }) => `${name}=${value}`).join(`; `), // real cookies obtained from Firefox
        };
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for local debugging
      },
    ],
    maxConcurrency: 1, // not brave enough for concurrency
    maxRequestRetries: 0, // not brave enough for retries
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50m
        maxUsageCount: 1000, // default is 50, let's use as much as possible, until we get blocked
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session);
        return session;
      },
    },
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

/**
 * Opens r2-bike.com in a real Firefox, waits for the Cloudflare JavaScript
 * checks to pass, and stores the browser's request headers and cookies in
 * `session.userData` so that plain HTTP requests can reuse them.
 *
 * The browser is always closed afterwards, even when navigation fails;
 * previously every created session leaked a running Firefox process.
 *
 * @param {Session} session - Session that receives `{ headers, cookies }` in `userData`.
 */
async function unblock(session) {
  const browser = await playwright.firefox.launch({
    // headless: false, // NOTE: uncomment for debugging
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for local debugging
  });
  try {
    const browserContext = await browser.newContext({
      ignoreHTTPSErrors: true,
    });
    await browserContext.addCookies(
      Object.entries(R2_COOKIES).map(([name, value]) => ({
        name,
        value,
        ...R2_COOKIE_SCOPE,
      }))
    );

    const page = await browserContext.newPage();
    // page.on(`console`, msg => console.log(`⚪️ Playwright log (${msg.type()}) ${msg.text()}`))

    let headersToSet;

    await page.route(`**/*`, async (route) => {
      // use the first main request to store the sent headers
      if (!headersToSet) headersToSet = pickHeaders(route.request().headers());
      await route.continue();
    });

    // Go to product listing page which sets 95 products per page to current session
    await page.goto(`https://r2-bike.com/navi.php?h=58&Sortierung=1&af=95`); // h=58 is "Shimano", Sortierung=1 is "Sort by name" – both are not important, but some values need to be set
    // Wait for some time to pass basic Cloudflare Javascript checks
    await crawleeUtils.sleep(5000); // TODO: Be smarter, 3000 ms is enough for r2-bike.com, but not for g2.com
    // Get all cookies and store them for subsequent requests
    const cookies = await page.context().cookies();
    session.userData = { headers: headersToSet, cookies };
  } finally {
    await browser.close();
  }
}

/**
 * Picks just the headers that gotScraping can correctly handle, in the order
 * browsers send them.
 *
 * This seems to be needed mainly to avoid setting the Host header, which when
 * set, was at the end of the headers list, which Cloudflare did not like. If we
 * skip the Host header, gotScraping sets it automatically and in the correct order.
 *
 * @param {Record<string, string>} headers - Headers of a real browser request.
 * @returns {Record<string, string>} Allow-listed headers with a non-empty value, in list order.
 */
function pickHeaders(headers) {
  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const headersList = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,
    `cookie`,
    `Cookie`,
    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];
  return Object.fromEntries(
    headersList
      .filter((header) => headers[header])
      .map((header) => [header, headers[header]])
  );
}

package.json

1{ 2 "name": "r2-bike-r2-bike-com-scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 4 "type": "module", 5 "scripts": { 6 "start": "node ./main.js", 7 "push-to-apify-platform": "npx apify push" 8 }, 9 "dependencies": { 10 "apify3": "npm:apify@^3.0.2", 11 "crawlee": "*", 12 "@crawlee/core": "*", 13 "playwright": "*", 14 "pg": "*", 15 "pg-connection-string": "*", 16 "dotenv": "*", 17 "find-config": "*", 18 "@elastic/elasticsearch": "*", 19 "filenamify": "*", 20 "@crawlee/memory-storage": "*" 21 }, 22 "apify": { 23 "title": "r2-bike (r2-bike.com) scraper", 24 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 25 "isPublic": true, 26 "isDeprecated": false, 27 "isAnonymouslyRunnable": true, 28 "notice": "", 29 "pictureUrl": "", 30 "seoTitle": "", 31 "seoDescription": "", 32 "categories": [ 33 "ECOMMERCE" 34 ] 35 } 36}

.actor/actor.json

1{ 2 "actorSpecification": 1, 3 "name": "r2-bike-r2-bike-com-scraper", 4 "title": "r2-bike (r2-bike.com) scraper", 5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 6 "version": "0.1.0", 7 "storages": { 8 "dataset": { 9 "actorSpecification": 1, 10 "title": "r2-bike (r2-bike.com) scraper", 11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).", 12 "views": { 13 "overview": { 14 "title": "Overview", 15 "description": "Overview of the most important fields", 16 "transformation": { 17 "fields": [ 18 "pid", 19 "name", 20 "url", 21 "img", 22 "inStock", 23 "currentPrice", 24 "originalPrice", 25 "currency" 26 ] 27 }, 28 "display": { 29 "component": "table", 30 "columns": [ 31 { 32 "label": "Pid", 33 "field": "pid", 34 "format": "text" 35 }, 36 { 37 "label": "Name", 38 "field": "name", 39 "format": "text" 40 }, 41 { 42 "label": "Url", 43 "field": "url", 44 "format": "link" 45 }, 46 { 47 "label": "Img", 48 "field": "img", 49 "format": "image" 50 }, 51 { 52 "label": "In Stock", 53 "field": "inStock", 54 "format": "boolean" 55 }, 56 { 57 "label": "Current Price", 58 "field": "currentPrice", 59 "format": "number" 60 }, 61 { 62 "label": "Original Price", 63 "field": "originalPrice", 64 "format": "number" 65 }, 66 { 67 "label": "Currency", 68 "field": "currency", 69 "format": "text" 70 } 71 ] 72 } 73 } 74 } 75 } 76 } 77}

.actor/logo.png

_utils/common.js

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Configuration, Dataset } from 'crawlee'
import { MemoryStorage } from '@crawlee/memory-storage'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

// Properties attached to every locally stored item
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Initializes shared state used by `save`: actor name, optional in-memory
 * request queue, and optional Elasticsearch / PostgreSQL clients. Everything
 * is driven by environment variables, which can also be supplied through the
 * upper-case keys of `restInput`.
 *
 * @param {{ actorNameOverride?: string }} options - Explicit actor name to use instead of the derived one.
 * @param {object} [restInput] - Actor input without `mode`; upper-case keys are copied to `process.env`.
 * @returns {Promise<void>}
 * @throws {Error} When a configured PostgreSQL table does not exist.
 */
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  } else {
    // Local run on any platform: derive the name from the entry script,
    // e.g. ~/Projects/apify-actors-monorepo/actors/foo.ts -> foo
    actorName = actorNameOverride ?? path.basename(process.argv[1] ?? ``).split(`.`)[0]
  }

  if (os.platform() === `darwin`) {
    const gitCommitShort = readGitCommitShort(path.join(process.cwd(), `..`, `.git`))
    if (gitCommitShort) globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message?.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Still best-effort (the run continues), but no longer silent
        console.warn(`Elastic index mapping for ${elasticIndexName} could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPg(process.env.PG_CONNECTION_STRING, [process.env.PG_DATA_TABLE])
    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPg(
      process.env.PG_CONNECTION_STRING_NORMALIZED,
      [process.env.PG_DATA_TABLE, process.env.PG_DATA_PRICE_TABLE],
    )
    // TODO: Handle pgClient closing
  }
}

/**
 * Best-effort read of the short (7 chars) commit hash of the checked-out revision.
 * Handles both a symbolic HEAD (`ref: refs/heads/main`) and a detached HEAD
 * (HEAD contains the commit hash itself).
 *
 * @param {string} gitDir - Path to the `.git` directory.
 * @returns {string|undefined} Short commit hash, or undefined when it cannot be read.
 */
function readGitCommitShort (gitDir) {
  try {
    const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
    const refMatch = head.match(/^ref:\s*(.+)$/)
    const commit = refMatch
      ? fs.readFileSync(path.join(gitDir, refMatch[1]), `utf8`).trim()
      : head
    return commit.substring(0, 7)
  } catch (err) {
    console.warn(`Could not read git commit from ${gitDir}: ${err.message}`)
    return undefined
  }
}

/**
 * Connects to PostgreSQL and verifies that all required tables exist in the `public` schema.
 *
 * @param {string} connectionString - PostgreSQL connection string.
 * @param {string[]} requiredTables - Table names that must exist.
 * @returns {Promise<pg.Client>} Connected client.
 * @throws {Error} When a required table is missing (the connection is closed first).
 */
async function connectPg (connectionString, requiredTables) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()

  const { rows: tables } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  const existingTables = new Set(tables.map((table) => table.table_name))
  const missingTable = requiredTables.find((tableName) => !existingTables.has(tableName))
  if (missingTable !== undefined || requiredTables.includes(undefined)) {
    await client.end()
    throw new Error(`Table ${missingTable} does not exist in database ${pgConfig.database}`)
  }
  return client
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Creates a SHA-256 hex digest of the URL without its protocol.
 *
 * @param {string} url - URL, normally including the protocol.
 * @returns {string} Hex-encoded SHA-256 hash.
 */
export const createUniqueKeyFromUrl = (url) => {
  // Remove protocol; a URL without `://` is hashed as-is instead of throwing
  const cleanUrl = url.includes(`://`) ? url.split(`://`)[1] : url
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}

/**
 * Resolves once the given point in time is reached, immediately if it is already in the past.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const difference = datetime - new Date()
  if (!(difference > 0)) return // also covers an invalid date (NaN)
  await new Promise((resolve) => {
    setTimeout(resolve, difference)
  })
}

// TODO: Uff, nicer! But at least it's tested
/**
 * Parses the numeric amount out of a price text such as `1,98 €*`, `MSRP: 5,95 €` or `1.234,56`.
 * A trailing separator followed by exactly two digits is treated as the decimal
 * part; every other separator is treated as a thousands separator.
 *
 * @param {string} string - Raw price text.
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the text has
 *   no digits; `currency` is not detected yet and is always undefined.
 */
export function parsePrice (string) {
  const currency = undefined // currency detection is not implemented
  const noText = String(string ?? ``).replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  let amount
  if (decimals) {
    const decimalAmount = decimals[2]
    // Everything before the final separator is the integer part
    const mainAmount = noText.slice(0, decimals.index).replace(/\D/g, ``)
    amount = Number.parseFloat(`${mainAmount}.${decimalAmount}`)
  } else {
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = Number.parseInt(justNumbers, 10)
  }
  return { amount, currency }
}

/**
 * Converts a value to a number, mapping "no value" inputs to null.
 *
 * @param {*} str - Value to convert.
 * @returns {number|null} null for undefined, null, blank strings and anything that converts to NaN.
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined || str === null) return null
  if (typeof str === `string` && str.trim() === ``) return null // Number(` `) would be 0
  const num = Number(str)
  return Number.isNaN(num) ? null : num
}

/**
 * Stores scraped items in every configured destination: Apify dataset (when
 * running on Apify), local JSON files (on macOS), and PostgreSQL tables.
 *
 * @param {object|object[]} objs - One item or a list of items.
 * @returns {Promise<void>}
 * @throws Rethrows database errors other than unique-constraint violations.
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) {
    console.log(`No data to save.`)
    return
  }

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)
  const apifyProps = isAtHome
    ? {
        __APIFY_ACTOR_ID: process.env.APIFY_ACTOR_ID,
        __APIFY_ACTOR_RUN_ID: process.env.APIFY_ACTOR_RUN_ID,
        __APIFY_ACTOR_BUILD_ID: process.env.APIFY_ACTOR_BUILD_ID,
        __APIFY_ACTOR_BUILD_NUMBER: process.env.APIFY_ACTOR_BUILD_NUMBER,
        __APIFY_ACTOR_TASK_ID: process.env.APIFY_ACTOR_TASK_ID,
      }
    : {}

  // Built synchronously; the previous async map produced Promises, so the
  // local-storage loop below wrote a single `undefined.json`.
  const objsExtended = items.map((obj) => ({
    ...obj,
    actorName,
    ...globalLogsProps,
    ...apifyProps,
  }))

  // if run on Apify
  if (isAtHome && process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
    // The dataset receives the plain items (without the extended props), as before,
    // but the write is now awaited so failures surface and nothing is lost on exit.
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const dataDir = path.join(cwd, `${actorName}.storage`, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const dataFilePath = path.join(dataDir, `${filenamify(id)}.json`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  const scrapedAt = new Date().toISOString().split(`T`)[0]

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, {
      logLabel: ` (data)`,
      onConflictDoNothing: true,
    })
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, {
      logLabel: ` (price)`,
      onConflictDoNothing: true,
    })
  }

  // TODO: Elasticsearch indexing is not implemented; `elasticClient` is only
  // used by `init` to enforce the index mapping.
}

/**
 * Inserts rows into `public.<tableName>` with a single parameterized statement.
 * Unique-constraint violations are logged as a warning; other errors are rethrown.
 *
 * @param {pg.Client} client - Connected PostgreSQL client.
 * @param {string} tableName - Target table in the `public` schema.
 * @param {object[]} rows - Non-empty list of rows; columns are taken from the first row.
 * @param {{ logLabel?: string, onConflictDoNothing?: boolean }} [options]
 * @returns {Promise<void>}
 */
async function insertRows (client, tableName, rows, { logLabel = ``, onConflictDoNothing = false } = {}) {
  const { text, values } = buildInsertQuery(tableName, rows, onConflictDoNothing ? `ON CONFLICT DO NOTHING` : ``)
  try {
    const { rowCount } = await client.query(text, values)
    console.log(`[save] saved to database${logLabel}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Builds a multi-row INSERT with `$n` placeholders, so scraped values are never
 * interpolated into the SQL text.
 *
 * @param {string} tableName - Target table in the `public` schema.
 * @param {object[]} rows - Non-empty list of rows; columns are taken from the first row.
 * @param {string} [suffix] - Optional trailing clause, e.g. `ON CONFLICT DO NOTHING`.
 * @returns {{ text: string, values: Array }} Query text and its parameter values.
 */
function buildInsertQuery (tableName, rows, suffix = ``) {
  const columns = Object.keys(rows[0])
  const values = []
  const tuples = rows.map((row) => {
    const placeholders = columns.map((column) => {
      values.push(toSqlParam(row[column]))
      return `$${values.length}`
    })
    return `(${placeholders.join(`, `)})`
  })
  const text = `
    INSERT INTO public.${quoteIdentifier(tableName)} (${columns.map(quoteIdentifier).join(`, `)})
    VALUES ${tuples.join(`, `)}
    ${suffix}
  `
  return { text, values }
}

/** Quotes an SQL identifier, doubling embedded double quotes. */
function quoteIdentifier (identifier) {
  return `"${String(identifier).replace(/"/g, `""`)}"`
}

/** Maps undefined, null and non-finite numbers (e.g. NaN prices) to SQL NULL. */
function toSqlParam (value) {
  if (value === undefined || value === null) return null
  if (typeof value === `number` && !Number.isFinite(value)) return null
  return value
}

/**
 * Copies upper-case keys of the actor input into `process.env`, so input
 * options and `.env` variables are read the same way.
 * Note that `process.env` stores every value as a string (e.g. `false` -> `"false"`).
 *
 * @param {object} [input] - Actor input (or a part of it).
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue // would become the string "null"/"undefined"
    env[key] = value
  }
  // Log only the names: values may contain secrets such as connection strings
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}

/** True when the process seems to be started with a debugger attached. */
export const isInspect =
  process.execArgv.join().includes(`--inspect`) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)
Developer
Maintained by Community
Actor stats
  • 2 users
  • 495 runs
  • Modified about 1 year ago
Categories

You might also like these Actors