Bike24 (bike24.de) scraper
Deprecated

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 1
Last modified: 2 years ago

Dockerfile
FROM apify/actor-node-playwright-firefox:16

COPY package.json ./

RUN npm --quiet set progress=false \
    && npm install aws-crt \
    && npm install --only=prod --no-optional

COPY . ./
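
The base image, apify/actor-node-playwright-firefox, ships with Playwright and Firefox preinstalled; main.js relies on this for its session-warming step (see unblock() in main.js below).
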
INPUT_SCHEMA.json
{
  "title": "Bike24 (bike24.de) scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL"
      ],
      "enumTitles": [
        "TEST mode (scrapes only a few categories)",
        "FULL"
      ]
    },
    "proxyConfiguration": {
      "title": "Proxy configuration",
      "description": "Select proxies to be used by your actor.",
      "type": "object",
      "editor": "proxy",
      "default": {
        "useApifyProxy": true,
        "apifyProxyGroups": [
          "RESIDENTIAL"
        ]
      },
      "prefill": {
        "useApifyProxy": true,
        "apifyProxyGroups": [
          "RESIDENTIAL"
        ]
      }
    },
    "debug": {
      "title": "Debug",
      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
      "type": "boolean",
      "editor": "checkbox",
      "default": false
    },
    "APIFY_USE_MEMORY_REQUEST_QUEUE": {
      "sectionCaption": "Advanced",
      "sectionDescription": "Advanced options, use only if you know what you're doing.",
      "title": "Use in-memory request queue instead of the native one",
      "description": "An in-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "APIFY_DONT_STORE_IN_DATASET": {
      "title": "Don't store in dataset",
      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like your own database.",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "PG_CONNECTION_STRING_NORMALIZED": {
      "title": "Postgres connection string for normalized data",
      "description": "If set, the actor will store normalized data in a Postgres database, in the PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables.",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_TABLE": {
      "title": "Postgres table name for product data",
      "description": "Table name for storing product name, url, image, ...",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_PRICE_TABLE": {
      "title": "Postgres table name for price data",
      "description": "Table name for storing price, original price, stock status, ...",
      "type": "string",
      "editor": "textfield"
    }
  },
  "required": [
    "mode",
    "proxyConfiguration"
  ]
}
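
Note that the UPPERCASE properties (APIFY_USE_MEMORY_REQUEST_QUEUE, PG_CONNECTION_STRING_NORMALIZED, ...) are copied into process.env at startup by parseEnvFromInput() in _utils/common.js. A minimal input satisfying this schema might look like this (values are illustrative):

{
  "mode": "TEST",
  "proxyConfiguration": {
    "useApifyProxy": true,
    "apifyProxyGroups": ["RESIDENTIAL"]
  },
  "debug": false
}
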
apify.json
{
  "name": "bike24-bike24-de-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 4096
  }
}
main.js
import { URL } from "node:url";
import { Actor } from "apify3";
import {
  CheerioCrawler,
  createCheerioRouter,
  utils as crawleeUtils,
} from "crawlee";
import playwright from "playwright";
import { Session } from "@crawlee/core";
import { init, save } from "./_utils/common.js";

const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

const MODE = {
  TEST: `TEST`,
  FULL: `FULL`,
};

const BASE_URL = `https://www.bike24.com`;

async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `https://www.bike24.com/brands`,
      },
    ]);
  } else if (mode === MODE.TEST) {
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `https://www.bike24.com/brands/100percent`,
      },
    ]);
  }
}

const router = createCheerioRouter();

router.addHandler(LABELS.INDEX, async ({ crawler, $ }) => {
  $(`.list-brands-sitemap__section-item a`).each((i, el) => {
    const url = $(el).attr(`href`); // urls are relative
    const fullUrl = `${BASE_URL}${url}`;
    const name = $(el).text().trim(); // there's extra space at the beginning and end
    void crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS, category: name },
        url: fullUrl,
      },
    ]);
  });
});

router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request }) => {
  if (!request.url.includes(`page=`)) {
    // on the first page
    const totalPages = Number($(`.page-pagination-item`).last().text()); // e.g. `12`
    // FIXME:
    for (let i = 2; i <= Math.min(totalPages, 3); i++) {
      // skip the first page, which is already being handled
      const url = new URL(request.url);
      url.searchParams.set(`page`, i.toString());
      void crawler.addRequests([
        {
          url: url.toString(),
          userData: {
            label: LABELS.PRODUCTS,
            category: request.userData.category, // pass the category name along
          },
        },
      ]);
    }
  }

  const TAX_RATE = 1.21;

  const products = [];
  const $products = $(`.product-tile`);
  $products.each((i, el) => {
    const pid = $(el)
      .find(`.product-tile__anchor`)
      .attr(`href`)
      .replace(/\D/g, ``); // e.g. `p2421335.html` -> `2421335`
    const relUrl = $(el).find(`.product-tile__anchor`).attr(`href`); // relative url
    const url = `${BASE_URL}${relUrl}`;
    const name = $(el).find(`.product-tile__title`)?.text()?.trim();
    const prices = JSON.parse($(`.productPrice`, el).attr(`data-props`));
    const img = $(el).find(`.product-tile__picture img`).attr(`src`);
    const inStock = !!$(`.delivery-message--success`, el).length; // scoped to the tile; an unscoped lookup would report the same availability for every product
    const product = {
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice: prices.price * TAX_RATE,
      originalPrice: prices.oldPrice
        ? prices.oldPrice * TAX_RATE
        : prices.price * TAX_RATE,
      currency: `EUR`,
    };
    products.push(product);
  });
  await save(products);
});

void Actor.main(async () => {
  const input = await Actor.getInput();
  const {
    mode = MODE.FULL,
    proxyConfiguration: inputProxyConfiguration,
    ...rest
  } = input ?? {};

  // TODO: Better pattern to handle both proxy and no proxy
  const proxyConfiguration = inputProxyConfiguration
    ? await Actor.createProxyConfiguration(inputProxyConfiguration)
    : undefined;

  await init({ actorNameOverride: `bike-24` }, rest);
  const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxConcurrency: 1,
    maxRequestRetries: 0,
    sessionPoolOptions: {
      maxPoolSize: 1, // not brave enough for concurrency
      sessionOptions: {
        maxAgeSecs: 60 * 60 * 2, // 2 hours, default is 50 minutes
        maxUsageCount: 1000, // default is 50; reuse the session as much as possible, until we get blocked
        // TODO: Investigate why so many Firefox sessions are created
      },
      createSessionFunction: async (sessionPool) => {
        console.log(
          `[SESSION] Creating new session, will use Firefox to unblock (should take ~10s)`
        );
        const session = new Session({ sessionPool });
        await unblock(session, proxyConfiguration);
        return session;
      },
    },
    persistCookiesPerSession: true,
    preNavigationHooks: [
      async ({ session }, gotOptions) => {
        const userData = session.userData;
        gotOptions.headers = userData.headers; // realistic headers obtained from Firefox
        gotOptions.headers.Cookie = userData.cookies
          .map((c) => `${c.name}=${c.value}`)
          .join(`; `); // real cookies obtained from Firefox
        // gotOptions.proxyUrl = `http://127.0.0.1:9090` // NOTE: uncomment for debugging with MITM
      },
    ],
    requestHandler: router,
  });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

async function unblock(session, proxyConfiguration) {
  const browser = await playwright.firefox.launch({
    headless: true, // NOTE: set to false for debugging
    // TODO: Better pattern to handle both proxy and no proxy
    proxy: proxyConfiguration
      ? { server: await proxyConfiguration.newUrl(session.id) }
      : undefined,
    // proxy: { server: `http://127.0.0.1:9090` }, // NOTE: uncomment for debugging with MITM
  });
  const browserContext = await browser.newContext({ ignoreHTTPSErrors: true });

  const countryCode = `29`;
  await browserContext.addCookies([
    {
      name: `countryTax`,
      value: `{"shippingCountry":${countryCode},"taxRates":[{"value":21,"name":"Normaler Mehrwertsteuersatz","taxGroup":1},{"value":15,"name":"Lebensmittel mit red. MwSt.","taxGroup":2},{"value":15,"name":"Druckerzeugnisse","taxGroup":3}],"validUntil":"Wednesday, 16-Nov-2022 00:00:00 UTC"}`, // FIXME
      domain: `www.bike24.com`,
      path: `/`,
    },
    {
      name: `deliveryLocation`,
      value: `{"country":${countryCode},"zipCode":null}`,
      domain: `www.bike24.com`,
      path: `/`,
    },
  ]);

  const page = await browserContext.newPage();
  // page.on(`console`, msg => console.log(`⚪️ Playwright log (${msg.type()}) ${msg.text()}`))

  let headersToSet;

  await page.route(`**/*`, (route) => {
    const request = route.request();
    const url = request.url();
    const method = request.method(); // GET, POST, etc.
    const resourceType = request.resourceType(); // document, stylesheet, image, ...
    // console.log(`🔵 Playwright route: ${method} ${url} (${resourceType})`)

    // use the first (main) request to store the sent headers
    if (!headersToSet) headersToSet = pickHeaders(request.headers());

    route.continue();
  });

  await page.goto(`https://www.bike24.com/brands/shimano`);
  // Wait for some time to pass basic Cloudflare JavaScript checks
  await crawleeUtils.sleep(5000); // TODO: Be smarter; 3000 ms is enough for r2-bike.com, but not for g2.com
  // Get all cookies and store them for subsequent requests
  const cookies = await page.context().cookies();
  const cfCookie = cookies.find((c) => c.name === `__cf_bm`)?.value; // optional chaining, so a missing cookie is logged instead of crashing
  console.log(
    `[SESSION] Cloudflare cookie "__cf_bm": ${cfCookie ?? `😱😱😱 not found`}`
  );
  session.userData = { headers: headersToSet, cookies };
  await browser.close();
}

function pickHeaders(headers) {
  // Pick just the headers that gotScraping can correctly handle (= order)
  // This seems to be needed mainly to avoid setting the Host header: when it was set, it ended up at the end of the header list, which Cloudflare did not like
  // If we skip the Host header, gotScraping sets it automatically, in the correct order

  // taken from https://github.com/apify/header-generator/blob/1b0fd217b6fa0beaf42b9de321e47ac5f1d4cebf/src/data_files/headers-order.json#L62
  const headersList = [
    `sec-ch-ua`,
    `sec-ch-ua-mobile`,
    `user-agent`,
    `User-Agent`,
    `accept`,
    `Accept`,
    `accept-language`,
    `Accept-Language`,
    `accept-encoding`,
    `Accept-Encoding`,
    `dnt`,
    `DNT`,
    `referer`,
    `Referer`,

    // Cookies are handled explicitly
    // `cookie`,
    // `Cookie`,

    `Connection`,
    `upgrade-insecure-requests`,
    `Upgrade-Insecure-Requests`,
    `te`,
    `sec-fetch-site`,
    `sec-fetch-mode`,
    `sec-fetch-user`,
    `sec-fetch-dest`,
    `Sec-Fetch-Mode`,
    `Sec-Fetch-Dest`,
    `Sec-Fetch-Site`,
    `Sec-Fetch-User`,
  ];
  return headersList.reduce((acc, header) => {
    if (headers[header]) acc[header] = headers[header];
    return acc;
  }, {});
}
package.json
{
  "name": "bike24-bike24-de-scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify3": "npm:apify@^3.0.2",
    "crawlee": "*",
    "playwright": "*",
    "@crawlee/core": "*",
    "pg": "*",
    "pg-connection-string": "*",
    "dotenv": "*",
    "find-config": "*",
    "@elastic/elasticsearch": "*",
    "filenamify": "*",
    "@crawlee/memory-storage": "*"
  },
  "apify": {
    "title": "Bike24 (bike24.de) scraper",
    "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}
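
The start script runs the scraper locally via node ./main.js. When run outside the Apify platform, Actor.getInput() typically reads from storage/key_value_stores/default/INPUT.json in the working directory (a sketch, assuming the Apify SDK's default local storage layout):

{
  "mode": "TEST",
  "debug": true
}
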
.actor/actor.json
{
  "actorSpecification": 1,
  "name": "bike24-bike24-de-scraper",
  "title": "Bike24 (bike24.de) scraper",
  "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Bike24 (bike24.de) scraper",
      "description": "Scrapes product titles, prices, images and availability. Does NOT scrape product details.",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "pid",
              "name",
              "url",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Pid",
                "field": "pid",
                "format": "text"
              },
              {
                "label": "Name",
                "field": "name",
                "format": "text"
              },
              {
                "label": "Url",
                "field": "url",
                "format": "link"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}
.actor/logo.png
_utils/common.js
import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Configuration, Dataset } from 'crawlee'
import { MemoryStorage } from '@crawlee/memory-storage'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

let actorName
let pgClient
let pgClientNormalized
let elasticClient
export async function init ({ actorNameOverride }, restInput) {
  parseEnvFromInput(restInput)

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    const basename = path.basename(filePath) // foo.ts
    actorName = actorNameOverride ?? basename.split(`.`)[0] // foo
    const gitBranch = fs.readFileSync(path.join(process.cwd(), `..`, `.git/HEAD`), `utf8`)
      .split(` `)[1]
      .trim()
      .replace(`refs/heads/`, ``)
    const gitCommit = fs.readFileSync(path.join(process.cwd(), `..`, `.git/refs/heads/${gitBranch}`), `utf8`)
    const gitCommitShort = gitCommit.substring(0, 7)
    globalLogsProps.__GIT_COMMIT = gitCommitShort
  }

  if (process.env.APIFY_USE_MEMORY_REQUEST_QUEUE === `true`) { // dotenv -> bool-like vars are strings
    Configuration.getGlobalConfig().useStorageClient(new MemoryStorage())
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    // const mapping = await elasticClient.indices.getMapping({ index: actorName })

    // eslint-disable-next-line no-inner-declarations
    async function enforceIndexMapping () {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As the existing mapping cannot be changed, the index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING)
    // const pgPool = new pg.Pool(pgConfig)

    pgClient = new pg.Client(pgConfig)
    await pgClient.connect()

    // Check that the table exists and has the proper columns
    const { rows: tables } = await pgClient.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    if (!tableExists) {
      throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    }

    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    const pgConfig = pgConnectionString(process.env.PG_CONNECTION_STRING_NORMALIZED)

    pgClientNormalized = new pg.Client(pgConfig)
    await pgClientNormalized.connect()

    // Check that the tables exist and have the proper columns
    const { rows: tables } = await pgClientNormalized.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)

    // eslint-disable-next-line camelcase
    const tableMainExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_TABLE)
    // eslint-disable-next-line camelcase
    const tablePricesExists = tables.some(({ table_name }) => table_name === process.env.PG_DATA_PRICE_TABLE)
    if (!tableMainExists) throw new Error(`Table ${process.env.PG_DATA_TABLE} does not exist in database ${pgConfig.database}`)
    if (!tablePricesExists) throw new Error(`Table ${process.env.PG_DATA_PRICE_TABLE} does not exist in database ${pgConfig.database}`)

    // TODO: Handle pgClient closing
  }
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  const cleanUrl = url.split(`://`)[1] // Remove protocol
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}

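// Example (illustrative): createUniqueKeyFromUrl(`https://example.com/a?b=1`)
// returns the hex-encoded SHA-256 digest of `example.com/a?b=1`
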
/**
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const now = new Date()
  const difference = datetime - now
  if (difference > 0) {
    return new Promise((resolve) => {
      setTimeout(resolve, difference)
    })
  }
  return Promise.resolve()
}

// TODO: Uff, nicer! But at least it's tested
export function parsePrice (string) {
  let amount, currency
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/([,.])(\d{2})$/)
  if (decimals) {
    const decimalSeparator = decimals[1] // ?
    // eslint-disable-next-line @typescript-eslint/no-unused-vars, no-unused-vars
    const decimalAmount = decimals[2] // ?
    const mainAmount = noText.split(decimalSeparator)[0].replace(/\D/g, ``)
    amount = parseFloat(mainAmount + `.` + decimalAmount) // ?
  } else {
    const justNumbers = noText.replace(/[,.]/g, ``)
    amount = parseInt(justNumbers)
  }
  return { amount, currency }
}

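// Examples (illustrative):
//   parsePrice(`1.299,99 €`) // => { amount: 1299.99, currency: undefined }
//   parsePrice(`989 Kč`) // => { amount: 989, currency: undefined }
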
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined) return null
  if (str === null) return null
  if (str === ``) return null
  const num = Number(str)
  if (Number.isNaN(num)) return null
  return num
}

export async function save (objs) {
  if (!Array.isArray(objs)) objs = [objs]
  if (objs.length === 0) return console.log(`No data to save.`)

  const objsExtended = await Promise.all(objs.map(async (obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
      // __NODE_VERSION: global.process.versions.node,
      // __NODE_UPTIME: global.process.uptime().toFixed(2), // seconds, 2 decimals
    }
    // if running on Apify
    if (process.env.APIFY_IS_AT_HOME) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
      if (process.env.APIFY_DONT_STORE_IN_DATASET !== `true`) { // Note: dotenv is not casting vars, so they are strings
        await Dataset.pushData(obj)
      }
    }
    return objExtended
  })) // await the async mapper, so objsExtended holds plain objects, not promises
  // if running on a local machine (macOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    const storageDir = path.join(cwd, `${actorName}.storage`) // ~/Projects/apify-actors-monorepo/actors/foo.storage
    if (!fs.existsSync(storageDir)) fs.mkdirSync(storageDir)
    const dataDir = path.join(storageDir, `data`) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir)
    for (const objExtended of objsExtended) {
      const id = String(objExtended.id ?? objExtended.pid) // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const objsPg = objs.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt: new Date().toISOString().split(`T`)[0],
    }))

    const columns = getColumns(objsPg)
    const values = getValues(objsPg)
    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${columns})
      VALUES (${values})
    `
    try {
      const { rowCount } = await pgClient.query(queryString)
      console.log(`[save] saved to database: ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }
  }

  // Only makes sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = objs.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt: new Date().toISOString().split(`T`)[0],
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    const queryString = `
      INSERT INTO public."${process.env.PG_DATA_TABLE}" (${getColumns(objsPgData)})
      VALUES (${getValues(objsPgData)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryString)
      console.log(`[save] saved to database (data): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }

    const queryStringPrice = `
      INSERT INTO public."${process.env.PG_DATA_PRICE_TABLE}" (${getColumns(objsPgDataPrice)})
      VALUES (${getValues(objsPgDataPrice)})
      ON CONFLICT DO NOTHING
    `
    try {
      const { rowCount } = await pgClientNormalized.query(queryStringPrice)
      console.log(`[save] saved to database (price): ${JSON.stringify(rowCount)}`)
    } catch (err) {
      if (err.message.includes(`violates unique constraint`)) console.warn(`PostgreSQL: violates unique constraint`)
      else throw err
    }
  }

  if (elasticClient) {
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // try {
    //   const res = await elasticClient.index({
    //     index: `actors-monorepo-shops`, // TODO: Consider using actorName
    //     id, // foo-bar
    //     document: objExtended, // {...}
    //   })
    // } catch (err) {
    //   // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
    //   if (err.message.includes(`requested resource is currently unavailable`)) console.log(`Elasticsearch is unavailable, skipping, but not aborting`)
    //   else throw err
    // }
  }
}

function getColumns (objs) {
  return Object.keys(objs[0]).map((key) => `"${key}"`).join(`, `)
}

function getValues (objs) {
  return objs.map(objPg => Object.values(objPg).map((value) => {
    // escape strings to prevent SQL injection
    if (typeof value === `string`) return `'${value.replace(/'/g, `''`)}'`
    // convert to DB-specific null
    if (typeof value === `undefined` || value === null) return `NULL`
    return value
  }).join(`, `)).join(`), (`)
}

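// Note: an alternative to the manual escaping above would be pg's parameterized
// queries (client.query(text, values) with $1, $2, ... placeholders), which
// delegate quoting to the driver.
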
export function parseEnvFromInput (input) {
  const env = {}
  for (const key in input) {
    if (key === key.toUpperCase()) env[key] = input[key]
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(env)}`)
  Object.assign(process.env, env)
}

export const isInspect =
  process.execArgv.join().includes(`--inspect`) ||
  // @ts-ignore
  process?._preload_modules?.join(`|`)?.includes(`debug`)
Pricing

Pricing model: Pay per usage

This Actor is paid per platform usage. The Actor itself is free to use, and you only pay for the Apify platform usage it generates.