Fitanu (fitanu.com) scraper

No credit card required

This Actor is under maintenance.

This Actor is under maintenance and may be unreliable.

Fitanu (fitanu.com) scraper

Fitanu (fitanu.com) scraper

strajk/fitanu-fitanu-com-scraper

No credit card required

Scrapes product titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).

Dockerfile

# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16

# Copy only the manifest first, so the dependency install below is separated
# from the copy of the actor sources.
COPY package.json ./

# Install production dependencies only.
# `--omit=dev --omit=optional` replaces the deprecated `--only=prod --no-optional`
# flags on the npm version (7+) that ships with Node.js 16.
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional

# Copy the rest of the actor source code.
COPY . ./

INPUT_SCHEMA.json

{
  "title": "Fitanu (fitanu.com) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "mode": {
      "title": "Mode",
      "description": "",
      "type": "string",
      "editor": "select",
      "default": "TEST",
      "prefill": "TEST",
      "enum": [
        "TEST",
        "FULL"
      ],
      "enumTitles": [
        "TEST",
        "FULL"
      ]
    },
    "APIFY_DONT_STORE_IN_DATASET": {
      "sectionCaption": "Advanced",
      "sectionDescription": "Advanced options, use only if you know what you're doing.",
      "title": "Don't store in dataset",
      "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database",
      "type": "boolean",
      "default": false,
      "editor": "checkbox"
    },
    "PG_CONNECTION_STRING_NORMALIZED": {
      "title": "Postgres connection string for normalized data",
      "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_TABLE": {
      "title": "Postgres table name for product data",
      "description": "Table name for storing product name, url, image, ...",
      "type": "string",
      "editor": "textfield"
    },
    "PG_DATA_PRICE_TABLE": {
      "title": "Postgres table name for price data",
      "description": "Table name for storing price, original price, stock status, ...",
      "type": "string",
      "editor": "textfield"
    }
  },
  "required": [
    "mode"
  ]
}

apify.json

{
  "name": "fitanu-fitanu-com-scraper",
  "version": "0.1",
  "buildTag": "latest",
  "env": null,
  "defaultRunOptions": {
    "build": "latest",
    "timeoutSecs": 3600,
    "memoryMbytes": 1024
  }
}

main.js

import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, save, toNumberOrNull } from "./_utils/common.js";

/** Request labels used to route each page to its handler. */
const LABELS = {
  INDEX: `INDEX`,
  PRODUCTS: `PRODUCTS`,
};

/** Run modes accepted in the actor input (`mode`). */
const MODE = Object.freeze({
  TEST: `TEST`,
  FULL: `FULL`,
});

const BASE_URL = `https://fitanu.com`;

/**
 * Brand listing path segment ("značka"), both raw and percent-encoded,
 * so brand links are matched regardless of how the page writes the href.
 */
const BRAND_PATH_SEGMENTS = [`značka`, `zna%C4%8Dka`];

/**
 * Adds the start request(s) for the given run mode.
 *
 * @param {string} mode - One of the `MODE` values.
 * @param {CheerioCrawler} crawler - Crawler whose queue receives the requests.
 * @returns {Promise<void>}
 * @throws {Error} When `mode` is not a known run mode.
 */
async function enqueueInitial(mode, crawler) {
  if (mode === MODE.FULL) {
    // Full run: start from the brand index and fan out to every brand listing.
    await crawler.addRequests([
      {
        userData: { label: LABELS.INDEX },
        url: `https://fitanu.com/cz/nase-znacky`,
      },
    ]);
  } else if (mode === MODE.TEST) {
    // Test run: a single brand listing is enough to exercise the scraper.
    await crawler.addRequests([
      {
        userData: { label: LABELS.PRODUCTS },
        url: `https://fitanu.com/cz/produkty/zna%C4%8Dka/nike`,
      },
    ]);
  } else {
    // Previously an unknown mode silently produced an empty run.
    throw new Error(`Unknown mode: ${mode}`);
  }
}

const router = createCheerioRouter();

// Brand index page: enqueue every brand listing as a PRODUCTS page.
router.addHandler(LABELS.INDEX, async ({ enqueueLinks }) => {
  await enqueueLinks({
    selector: BRAND_PATH_SEGMENTS
      .map((segment) => `[href^="${BASE_URL}/cz/produkty/${segment}/"]`)
      .join(`, `),
    userData: { label: LABELS.PRODUCTS },
  });
});

// Product listing page: enqueue the remaining pages (from the first page only)
// and save every product tile found on the current page.
router.addHandler(LABELS.PRODUCTS, async ({ crawler, $, request, log }) => {
  log.info(`[PRODUCTS] ${request.url}`);
  if (!request.url.match(/\/page\/\d+$/)) {
    // on first page, handle navigation
    const totalPages = Number($(`.pages-items .page.last`).first().text()); // e.g. 6
    const pageRequests = [];
    // skip first page, that is already handled
    for (let page = 2; page <= totalPages; page++) {
      pageRequests.push({
        userData: { label: LABELS.PRODUCTS },
        url: `${request.url}/page/${page}`,
      });
    }
    // Awaited as one batch, so a failure to enqueue surfaces in this handler
    // instead of being lost in a floating promise.
    if (pageRequests.length > 0) await crawler.addRequests(pageRequests);
  }

  const products = [];
  // Some .product-item are .category-description, not real products,
  // that's why we need to filter just ones with data-product-sku
  $(`.products-grid .product-item[data-product-sku]`).each((i, el) => {
    const $el = $(el);
    const pid = $el.attr(`data-product-sku`); // e.g. C92800472927
    const url = $el.find(`a.product-item-photo`).attr(`href`); // absolute url
    const img = $el.find(`img.product-image-photo`).attr(`src`); // absolute url
    const name = $el
      .find(`.product-item-name`)
      .text()
      .replace(/\n/g, ` `) // replace new lines with space
      .replace(/\s+/g, ` `); // replace multiple spaces with single space
    const price = $el
      .find(`[data-price-type="finalPrice"]`)
      .attr(`data-price-amount`); // e.g. 1099
    const priceOrig = $el
      .find(`[data-price-type="oldPrice"]`)
      .attr(`data-price-amount`); // e.g. 1299
    const inStock = undefined; // not available
    products.push({
      pid,
      name,
      url,
      img,
      inStock,
      currentPrice: toNumberOrNull(price),
      originalPrice: toNumberOrNull(priceOrig),
      currency: `CZK`,
    });
  });
  await save(products);
});

// Entry point: read the input, initialise shared storage helpers and run the crawl.
void Actor.main(async () => {
  const input = await Actor.getInput();
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `fitanu` }, rest);
  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

package.json

{
  "name": "fitanu-fitanu-com-scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
  "type": "module",
  "scripts": {
    "start": "node ./main.js",
    "push-to-apify-platform": "npx apify push"
  },
  "dependencies": {
    "apify3": "npm:apify@^3.0.2",
    "crawlee": "*",
    "pg": "*",
    "pg-connection-string": "*",
    "dotenv": "*",
    "find-config": "*",
    "@elastic/elasticsearch": "*",
    "filenamify": "*"
  },
  "apify": {
    "title": "Fitanu (fitanu.com) scraper",
    "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
    "isPublic": true,
    "isDeprecated": false,
    "isAnonymouslyRunnable": true,
    "notice": "",
    "pictureUrl": "",
    "seoTitle": "",
    "seoDescription": "",
    "categories": [
      "ECOMMERCE"
    ]
  }
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "fitanu-fitanu-com-scraper",
  "title": "Fitanu (fitanu.com) scraper",
  "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
  "version": "0.1.0",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Fitanu (fitanu.com) scraper",
      "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details. Uses Crawlee (Apify v3).",
      "views": {
        "overview": {
          "title": "Overview",
          "description": "Overview of the most important fields",
          "transformation": {
            "fields": [
              "pid",
              "name",
              "url",
              "img",
              "inStock",
              "currentPrice",
              "originalPrice",
              "currency"
            ]
          },
          "display": {
            "component": "table",
            "columns": [
              {
                "label": "Pid",
                "field": "pid",
                "format": "text"
              },
              {
                "label": "Name",
                "field": "name",
                "format": "text"
              },
              {
                "label": "Url",
                "field": "url",
                "format": "link"
              },
              {
                "label": "Img",
                "field": "img",
                "format": "image"
              },
              {
                "label": "In Stock",
                "field": "inStock",
                "format": "boolean"
              },
              {
                "label": "Current Price",
                "field": "currentPrice",
                "format": "number"
              },
              {
                "label": "Original Price",
                "field": "originalPrice",
                "format": "number"
              },
              {
                "label": "Currency",
                "field": "currency",
                "format": "text"
              }
            ]
          }
        }
      }
    }
  }
}

_utils/common.js

import { createHash } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Dataset } from 'crawlee'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

// Properties merged into every saved object
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

// Module state, populated by init() and read by save()
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Interprets an environment-style flag.
 * Values written to `process.env` are always strings, so a boolean `false`
 * coming from the actor input arrives here as the (truthy) string "false".
 *
 * @param {unknown} value
 * @return {boolean} false for undefined, null, ``, `0`, `false`, `no`, `off` (case-insensitive); true otherwise
 */
function isEnvFlagEnabled (value) {
  if (value === undefined || value === null) return false
  const normalized = String(value).trim().toLowerCase()
  return ![``, `0`, `false`, `no`, `off`].includes(normalized)
}

/**
 * Reads the short (7 chars) hash of the commit checked out in `../.git`,
 * relative to the current working directory.
 *
 * @return {string}
 * @throws when the git files cannot be read (e.g. not a repository, packed refs)
 */
function readGitCommitShort () {
  const gitDir = path.join(process.cwd(), `..`, `.git`)
  const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
  // Detached HEAD: the file holds the commit hash itself instead of `ref: refs/heads/<branch>`
  if (!head.startsWith(`ref:`)) return head.substring(0, 7)
  const ref = head.slice(`ref:`.length).trim() // refs/heads/<branch>
  return fs.readFileSync(path.join(gitDir, ref), `utf8`).trim().substring(0, 7)
}

/**
 * Makes sure the Elastic index exists and maps the price fields as floats.
 *
 * @param {ElasticClient} client
 * @return {Promise<void>}
 */
async function enforceElasticIndexMapping (client) {
  const doesIndexExist = await client.indices.exists({ index: elasticIndexName })
  if (!doesIndexExist) await client.indices.create({ index: elasticIndexName })
  await client.indices.putMapping({
    index: elasticIndexName,
    body: {
      properties: {
        _discount: { type: `float` },
        originalPrice: { type: `float` },
        currentPrice: { type: `float` },
      },
    },
  })
}

/**
 * Connects to Postgres and verifies that all given tables exist in the `public` schema.
 * The connection is closed again when the verification fails.
 *
 * @param {string} connectionString
 * @param {string[]} tableNames
 * @return {Promise<pg.Client>} connected client
 * @throws {Error} when any of the tables does not exist
 */
async function connectPg (connectionString, tableNames) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()

  try {
    // Check if tables exist
    const { rows: tables } = await client.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
    `)
    const existing = new Set(tables.map((row) => row.table_name))
    for (const tableName of tableNames) {
      if (!existing.has(tableName)) throw new Error(`Table ${tableName} does not exist in database ${pgConfig.database}`)
    }
  } catch (err) {
    await client.end()
    throw err
  }

  // TODO: Handle pgClient closing
  return client
}

/**
 * Initializes the shared storage helpers. Must be awaited before save() is used.
 * Copies UPPER_CASE input keys into process.env, resolves the actor name and,
 * depending on the environment variables, connects to Elastic and/or Postgres.
 *
 * @param {{ actorNameOverride?: string }} options
 * @param {object} [restInput] actor input without `mode`
 * @return {Promise<void>}
 * @throws {Error} when a configured Postgres table does not exist
 */
export async function init ({ actorNameOverride } = {}, restInput) {
  parseEnvFromInput(restInput)

  // The override applies everywhere, not only on macOS or on the Apify platform
  actorName = actorNameOverride

  if (os.platform() === `darwin`) {
    const filePath = process.argv[1] // ~/Projects/apify-actors-monorepo/actors/foo.ts
    actorName = actorNameOverride ?? path.basename(filePath).split(`.`)[0] // foo
    try {
      globalLogsProps.__GIT_COMMIT = readGitCommitShort()
    } catch (err) {
      // The commit hash is only informative, missing git metadata must not abort the run
      console.warn(`[init] Could not read git commit: ${err.message}`)
    }
  }

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    try {
      await enforceElasticIndexMapping(elasticClient)
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceElasticIndexMapping(elasticClient)
      } else {
        // Still best-effort, but no longer silent
        console.warn(`[init] Could not enforce Elastic index mapping: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPg(process.env.PG_CONNECTION_STRING, [process.env.PG_DATA_TABLE])
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPg(process.env.PG_CONNECTION_STRING_NORMALIZED, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])
  }
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Creates a stable key from an URL: sha256 (hex) of the URL without its protocol.
 *
 * @param {string} url
 * @return {string}
 */
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  // Remove only the leading protocol; URLs without one are hashed as they are
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``)
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}

/**
 * Resolves once the given point in time has passed (immediately when it already has).
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const difference = datetime - new Date()
  if (difference <= 0) return
  await new Promise((resolve) => {
    setTimeout(resolve, difference)
  })
}

/**
 * Extracts the whole-number amount from a price string, e.g. `1 299,00 Kč` -> 1299.
 * A trailing separator followed by exactly two digits is treated as the decimal part
 * and dropped; all other `,` / `.` are treated as thousands separators.
 *
 * @param {string} string
 * @return {{ amount: number, currency: undefined }} amount is NaN when there are no digits;
 *   currency is not detected yet and is always undefined
 */
export function parsePrice (string) {
  const currency = undefined
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  const integerPart = decimals ? noText.slice(0, decimals.index) : noText
  const amount = Number.parseInt(integerPart.replace(/[,.]/g, ``), 10)
  return { amount, currency }
}

/**
 * Converts a value to a number, returning null when there is no usable number.
 *
 * @param {unknown} str
 * @return {number|null} null for undefined, null, empty/blank strings and non-numeric values
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined || str === null) return null
  // Number(`  `) is 0, which would look like a real price
  if (typeof str === `string` && str.trim() === ``) return null
  const num = Number(str)
  if (Number.isNaN(num)) return null
  return num
}

/**
 * Quotes an SQL identifier (table or column name).
 *
 * @param {string} identifier
 * @return {string}
 */
function quoteIdentifier (identifier) {
  return `"${String(identifier).replace(/"/g, `""`)}"`
}

/**
 * Builds a parameterized multi-row INSERT into `public.<tableName>`.
 * Columns are taken from the first row; undefined values are sent as NULL.
 *
 * @param {string} tableName
 * @param {object[]} rows non-empty
 * @param {{ onConflictDoNothing?: boolean }} [options]
 * @return {{ text: string, values: unknown[] }}
 */
function buildInsertQuery (tableName, rows, { onConflictDoNothing = false } = {}) {
  const columns = Object.keys(rows[0])
  const values = []
  const tuples = rows.map((row) => {
    const placeholders = columns.map((column) => {
      values.push(row[column] ?? null)
      return `$${values.length}`
    })
    return `(${placeholders.join(`, `)})`
  })
  const text = `
    INSERT INTO public.${quoteIdentifier(tableName)} (${columns.map(quoteIdentifier).join(`, `)})
    VALUES ${tuples.join(`, `)}
    ${onConflictDoNothing ? `ON CONFLICT DO NOTHING` : ``}
  `
  return { text, values }
}

/**
 * Inserts rows and logs the number of saved rows.
 * Unique constraint violations are logged as a warning, other errors are rethrown.
 *
 * @param {pg.Client} client
 * @param {string} tableName
 * @param {object[]} rows
 * @param {{ onConflictDoNothing?: boolean, label?: string }} [options] label is appended to the log message
 * @return {Promise<void>}
 */
async function insertRows (client, tableName, rows, { onConflictDoNothing = false, label = `` } = {}) {
  const { text, values } = buildInsertQuery(tableName, rows, { onConflictDoNothing })
  try {
    const { rowCount } = await client.query(text, values)
    console.log(`[save] saved to database${label}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Saves scraped objects to every storage that is configured:
 * the Apify dataset (when run on Apify), local JSON files (when run on macOS)
 * and the Postgres databases connected in init().
 *
 * @param {object|object[]} objs
 * @return {Promise<void>}
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return

  const isAtHome = Boolean(process.env.APIFY_IS_AT_HOME)

  const objsExtended = items.map((obj) => {
    const objExtended = {
      ...obj,
      actorName,
      ...globalLogsProps,
    }
    // if run on Apify
    if (isAtHome) {
      objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
      objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
      objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
      objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
      objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
    }
    return objExtended
  })

  // The dataset receives the original objects (without the extra log properties).
  // Pushed as one awaited batch, so a failed push is not lost in a floating promise.
  if (isAtHome && !isEnvFlagEnabled(process.env.APIFY_DONT_STORE_IN_DATASET)) {
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    const dataDir = path.join(process.cwd(), `${actorName}.storage`, `data`)
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of objsExtended) {
      const id = objExtended.id ?? objExtended.pid // ?? uuidv4()
      const fileName = `${filenamify(id)}.json`
      const dataFilePath = path.join(dataDir, fileName) // ~/Projects/apify-actors-monorepo/actors/foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  if (pgClient) {
    const scrapedAt = new Date().toISOString().split(`T`)[0]
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await insertRows(pgClient, process.env.PG_DATA_TABLE, objsPg)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const scrapedAt = new Date().toISOString().split(`T`)[0]

    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await insertRows(pgClientNormalized, process.env.PG_DATA_TABLE, objsPgData, {
      onConflictDoNothing: true,
      label: ` (data)`,
    })
    await insertRows(pgClientNormalized, process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, {
      onConflictDoNothing: true,
      label: ` (price)`,
    })
  }

  if (elasticClient) {
    // TODO: Indexing into Elastic is not implemented yet.
    // Plan: elasticClient.index({ index: elasticIndexName, id, document: objExtended }),
    // tolerating "requested resource is currently unavailable" errors without aborting the run.
  }
}

/**
 * Copies UPPER_CASE keys of the actor input into process.env,
 * so the input can be used in place of environment variables.
 * Keys with null/undefined values are skipped, because process.env would store
 * them as the strings "null"/"undefined".
 *
 * @param {object} [input]
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  // Only key names are logged, values may contain secrets (e.g. connection strings)
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
Developer
Maintained by Community
Actor stats
  • 3 users
  • 414 runs
  • Modified about 1 year ago
Categories

You might also like these Actors