Team Sport (teamsport.cz) scraper

No credit card required

This Actor is under maintenance.

This Actor is under maintenance and it may be unreliable.

Team Sport (teamsport.cz) scraper

Team Sport (teamsport.cz) scraper

strajk/team-sport-teamsport-cz-scraper

No credit card required

Scrapes product titles, prices, images and availability. Does NOT scrape product details.

Dockerfile

# Base image: Apify's Node.js 16 actor image.
FROM apify/actor-node:16

# Copy only the manifest first, so the dependency-install step below does not
# depend on the rest of the source tree.
COPY package.json ./

# Install production dependencies only, skipping optional packages, with the
# npm progress bar disabled.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional

# Copy the remaining actor source files into the image.
COPY . ./

INPUT_SCHEMA.json

1{ 2 "title": "Team Sport (teamsport.cz) scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", 4 "type": "object", 5 "schemaVersion": 1, 6 "properties": { 7 "mode": { 8 "title": "Mode", 9 "description": "", 10 "type": "string", 11 "editor": "select", 12 "default": "TEST", 13 "prefill": "TEST", 14 "enum": [ 15 "TEST", 16 "FULL" 17 ], 18 "enumTitles": [ 19 "TEST", 20 "FULL" 21 ] 22 }, 23 "APIFY_DONT_STORE_IN_DATASET": { 24 "sectionCaption": "Advanced", 25 "sectionDescription": "Advanced options, use only if you know what you're doing.", 26 "title": "Don't store in dataset", 27 "description": "If set to true, the actor will not store the results in the default dataset. Useful when using alternative storage, like own database", 28 "type": "boolean", 29 "default": false, 30 "editor": "checkbox" 31 }, 32 "PG_CONNECTION_STRING_NORMALIZED": { 33 "title": "Postgres connection string for normalized data", 34 "description": "If set, actor will store normalized data in Postgres database in PG_DATA_TABLE and PG_DATA_PRICE_TABLE tables", 35 "type": "string", 36 "editor": "textfield" 37 }, 38 "PG_DATA_TABLE": { 39 "title": "Postgres table name for product data", 40 "description": "Table name for storing product name, url, image, ...", 41 "type": "string", 42 "editor": "textfield" 43 }, 44 "PG_DATA_PRICE_TABLE": { 45 "title": "Postgres table name for price data", 46 "description": "Table name for storing price, original price, stock status, ...", 47 "type": "string", 48 "editor": "textfield" 49 } 50 }, 51 "required": [ 52 "mode" 53 ] 54}

apify.json

1{ 2 "name": "team-sport-teamsport-cz-scraper", 3 "version": "0.1", 4 "buildTag": "latest", 5 "env": null, 6 "defaultRunOptions": { 7 "build": "latest", 8 "timeoutSecs": 3600, 9 "memoryMbytes": 1024 10 } 11}

main.js

import { Actor } from "apify3";
import { CheerioCrawler, createCheerioRouter } from "crawlee";
import { init, save, toNumberOrNull } from "./_utils/common.js";

/** Request labels used to route pages to their handlers. */
const LABEL = Object.freeze({
  INDEX: "INDEX",
  PRODUCTS: "PRODUCTS",
});

/** Supported run modes (mirrors the `mode` enum in INPUT_SCHEMA.json). */
const MODE = Object.freeze({
  TEST: "TEST",
  FULL: "FULL",
});

// Query parameter asking the shop to list up to 999 products on one page,
// which is how the FULL crawl avoids pagination.
// TODO: Solve ?listcnt nicer
const LIST_COUNT_PARAM = `listcnt`;
const LIST_COUNT_VALUE = `999`;

/**
 * Seeds the crawler's request queue according to the selected mode.
 *
 * FULL starts from the brand index page; TEST enqueues three brand listings
 * directly.
 *
 * @param {string} type - One of the `MODE` values.
 * @param {CheerioCrawler} crawler - Crawler whose queue receives the requests.
 * @returns {Promise<void>}
 * @throws {Error} When `type` is not a known mode (previously this produced a
 *   silent, empty run).
 */
async function enqueueInitial(type, crawler) {
  switch (type) {
    case MODE.FULL:
      await crawler.addRequests([
        {
          userData: { label: LABEL.INDEX },
          url: `https://www.teamsport.cz/znacky/`,
        },
      ]);
      return;
    case MODE.TEST:
      await crawler.addRequests([
        {
          userData: { label: LABEL.PRODUCTS },
          url: `https://www.teamsport.cz/bbb/`,
        },
        {
          userData: { label: LABEL.PRODUCTS },
          url: `https://www.teamsport.cz/five-ten/`,
        },
        {
          userData: { label: LABEL.PRODUCTS },
          url: `https://www.teamsport.cz/fox-racing/`,
        },
      ]);
      return;
    default:
      throw new Error(
        `Unknown mode "${type}", expected one of: ${Object.values(MODE).join(`, `)}`
      );
  }
}

/**
 * Extracts one product record from a listing tile.
 *
 * @param {Function} $ - Cheerio instance bound to the listing page.
 * @param {object} el - The `article.product` element.
 * @returns {object} Product with pid, name, url, img, inStock, prices and currency.
 */
function parseProduct($, el) {
  const tile = $(el);
  const link = tile.find(`a.product__href`);
  // Price texts look like `640 Kč`; everything but digits is stripped.
  const price = tile
    .find(`.product__wrapper__inner__price__new`)
    .text()
    .replace(/\D/g, ``);
  const priceOrig = tile
    .find(`.product__wrapper__inner__price__old`)
    .text()
    .replace(/\D/g, ``);
  return {
    pid: tile.find(`[data-productid]`).attr(`data-productid`),
    name: link.attr(`title`),
    url: link.attr(`href`),
    img: tile.find(`.product__img__src`).attr(`src`),
    inStock: true, // FIXME: either style='color:#29b237;' or 'skladem'
    currentPrice: toNumberOrNull(price),
    originalPrice: toNumberOrNull(priceOrig),
    currency: `CZK`,
  };
}

const router = createCheerioRouter();

// Brand index page: enqueue every brand listing with the list-count parameter.
router.addHandler(LABEL.INDEX, async ({ crawler, request, $ }) => {
  const baseUrl = request.loadedUrl ?? request.url;
  const requests = [];
  $(`.commonListVyrobcu .listLinks a`).each((i, el) => {
    const href = $(el).attr(`href`);
    // Anchors without a target used to be enqueued as `undefined?listcnt=999`.
    if (!href) return;
    // Resolving against the page URL supports relative links, and using
    // searchParams keeps any query string the link already carries valid.
    const url = new URL(href, baseUrl);
    if (url.protocol !== `http:` && url.protocol !== `https:`) return;
    url.searchParams.set(LIST_COUNT_PARAM, LIST_COUNT_VALUE);
    requests.push({
      userData: { label: LABEL.PRODUCTS, category: $(el).text().trim() },
      url: url.href,
    });
  });
  await crawler.addRequests(requests);
});

// Brand listing page: scrape every product tile and persist the batch.
router.addHandler(LABEL.PRODUCTS, async ({ $ }) => {
  const products = $(`.categoryProducts article.product`)
    .toArray()
    .map((el) => parseProduct($, el));
  // Awaited on purpose: a failed save must fail (and retry) the request, and
  // the run must not finish while writes are still in flight.
  await save(products);
});

void Actor.main(async () => {
  const input = await Actor.getInput();
  const { mode = MODE.FULL, ...rest } = input ?? {};
  await init({ actorNameOverride: `teamsport-cz` }, rest);
  const crawler = new CheerioCrawler({ requestHandler: router });
  await enqueueInitial(mode, crawler);
  await crawler.run();
});

package.json

1{ 2 "name": "team-sport-teamsport-cz-scraper", 3 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", 4 "type": "module", 5 "scripts": { 6 "start": "node ./main.js", 7 "push-to-apify-platform": "npx apify push" 8 }, 9 "dependencies": { 10 "apify3": "npm:apify@^3.0.2", 11 "crawlee": "*", 12 "pg": "*", 13 "pg-connection-string": "*", 14 "dotenv": "*", 15 "find-config": "*", 16 "@elastic/elasticsearch": "*", 17 "filenamify": "*" 18 }, 19 "apify": { 20 "title": "Team Sport (teamsport.cz) scraper", 21 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", 22 "isPublic": true, 23 "isDeprecated": false, 24 "isAnonymouslyRunnable": true, 25 "notice": "", 26 "pictureUrl": "", 27 "seoTitle": "", 28 "seoDescription": "", 29 "categories": [ 30 "ECOMMERCE" 31 ] 32 } 33}

.actor/actor.json

1{ 2 "actorSpecification": 1, 3 "name": "team-sport-teamsport-cz-scraper", 4 "title": "Team Sport (teamsport.cz) scraper", 5 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", 6 "version": "0.1.0", 7 "storages": { 8 "dataset": { 9 "actorSpecification": 1, 10 "title": "Team Sport (teamsport.cz) scraper", 11 "description": "Scrapes products titles, prices, images and availability. Does NOT scrape product details.", 12 "views": { 13 "overview": { 14 "title": "Overview", 15 "description": "Overview of the most important fields", 16 "transformation": { 17 "fields": [ 18 "pid", 19 "name", 20 "url", 21 "img", 22 "inStock", 23 "currentPrice", 24 "originalPrice", 25 "currency" 26 ] 27 }, 28 "display": { 29 "component": "table", 30 "columns": [ 31 { 32 "label": "Pid", 33 "field": "pid", 34 "format": "text" 35 }, 36 { 37 "label": "Name", 38 "field": "name", 39 "format": "text" 40 }, 41 { 42 "label": "Url", 43 "field": "url", 44 "format": "link" 45 }, 46 { 47 "label": "Img", 48 "field": "img", 49 "format": "image" 50 }, 51 { 52 "label": "In Stock", 53 "field": "inStock", 54 "format": "boolean" 55 }, 56 { 57 "label": "Current Price", 58 "field": "currentPrice", 59 "format": "number" 60 }, 61 { 62 "label": "Original Price", 63 "field": "originalPrice", 64 "format": "number" 65 }, 66 { 67 "label": "Currency", 68 "field": "currency", 69 "format": "text" 70 } 71 ] 72 } 73 } 74 } 75 } 76 } 77}

.actor/logo.png

_utils/common.js

import { createHash, randomUUID } from 'crypto'
import os from "os"
import path from "path"
// eslint-disable-next-line @apify/apify-actor/no-forbidden-node-internals
import fs from "fs"
import pg from "pg"
import pgConnectionString from 'pg-connection-string'
import { config } from 'dotenv'
import findConfig from "find-config"
import { Client as ElasticClient } from "@elastic/elasticsearch"
import filenamify from 'filenamify'
import { Dataset } from 'crawlee'

config({ path: findConfig(`.env`) })

const elasticIndexName = `actors-monorepo-shops`

// Properties attached to every record written to local storage.
const globalLogsProps = {
  __NODE_STARTED: new Date().toISOString(),
}

// Module state populated by init() and read by save().
let actorName
let pgClient
let pgClientNormalized
let elasticClient

/**
 * Reads an environment variable as a boolean flag.
 *
 * process.env stores strings only, so a boolean `false` copied from the actor
 * input arrives as the truthy string "false"; a plain truthiness check would
 * therefore treat a disabled flag as enabled.
 *
 * @param {string} name - Environment variable name.
 * @returns {boolean} False when unset, empty, "0" or "false" (case-insensitive).
 */
function isEnvFlagEnabled (name) {
  const raw = process.env[name]
  if (raw === undefined) return false
  return ![``, `0`, `false`].includes(raw.trim().toLowerCase())
}

/**
 * Resolves the short hash of the currently checked-out git commit.
 * Expects the repository root to be the parent of the working directory.
 *
 * @returns {string} First 7 characters of the commit hash.
 * @throws {Error} When the git metadata files cannot be read.
 */
function readGitCommitShort () {
  const gitDir = path.join(process.cwd(), `..`, `.git`)
  const head = fs.readFileSync(path.join(gitDir, `HEAD`), `utf8`).trim()
  // Detached HEAD: the file holds the commit hash itself, not `ref: refs/heads/x`.
  if (!head.startsWith(`ref:`)) return head.substring(0, 7)
  const ref = head.slice(`ref:`.length).trim() // refs/heads/main
  return fs.readFileSync(path.join(gitDir, ref), `utf8`).substring(0, 7)
}

/**
 * Connects to Postgres and verifies that the required tables exist in the
 * `public` schema.
 *
 * @param {string} connectionString - Postgres connection string.
 * @param {Array<string|undefined>} requiredTables - Table names that must exist.
 * @returns {Promise<object>} The connected pg client.
 * @throws {Error} When any required table is missing.
 */
async function connectPostgres (connectionString, requiredTables) {
  const pgConfig = pgConnectionString(connectionString)
  const client = new pg.Client(pgConfig)
  await client.connect()

  const { rows: tables } = await client.query(`
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
  `)
  // eslint-disable-next-line camelcase
  const existing = new Set(tables.map(({ table_name }) => table_name))
  for (const tableName of requiredTables) {
    if (!existing.has(tableName)) {
      throw new Error(`Table ${tableName} does not exist in database ${pgConfig.database}`)
    }
  }
  return client
}

/**
 * Prepares the module for saving: copies upper-case input keys into
 * process.env, determines the actor name and opens the optional Elastic and
 * Postgres connections configured through the environment.
 *
 * @param {{ actorNameOverride?: string }} [options] - Explicit actor name.
 * @param {object} [restInput] - Remaining actor input; see parseEnvFromInput.
 * @returns {Promise<void>}
 * @throws {Error} When a configured Postgres table does not exist.
 */
export async function init ({ actorNameOverride } = {}, restInput) {
  parseEnvFromInput(restInput)

  if (process.env.APIFY_IS_AT_HOME) {
    actorName = actorNameOverride ?? process.env.APIFY_ACTOR_ID // Name would be better, but it's not in ENV
  } else {
    // ~/Projects/apify-actors-monorepo/actors/foo.ts -> foo
    // Resolved on every platform so records never carry an undefined shop name.
    const basename = path.basename(process.argv[1] ?? ``)
    actorName = actorNameOverride ?? basename.split(`.`)[0]
  }

  if (os.platform() === `darwin`) {
    // Git info is a nice-to-have for local runs; never abort a run because of it.
    try {
      globalLogsProps.__GIT_COMMIT = readGitCommitShort()
    } catch (err) {
      console.warn(`[init] could not read git commit: ${err.message}`)
    }
  }

  /* ELASTIC */
  /* ======= */
  if (process.env.ELASTIC_CLOUD_ID) {
    elasticClient = new ElasticClient({
      cloud: { id: process.env.ELASTIC_CLOUD_ID },
      auth: { apiKey: process.env.ELASTIC_CLOUD_API_KEY },
    })

    const enforceIndexMapping = async () => {
      const doesIndexExist = await elasticClient.indices.exists({ index: elasticIndexName })
      if (!doesIndexExist) await elasticClient.indices.create({ index: elasticIndexName })
      await elasticClient.indices.putMapping({
        index: elasticIndexName,
        body: {
          properties: {
            _discount: { type: `float` },
            originalPrice: { type: `float` },
            currentPrice: { type: `float` },
          },
        },
      })
    }

    try {
      await enforceIndexMapping()
    } catch (err) {
      if (err.message.includes(`cannot be changed from type`)) {
        console.log(`Elastic index ${elasticIndexName} already exists with incorrect mappings. As existing mapping cannot be changed, index will be deleted and recreated.`)
        await elasticClient.indices.delete({ index: elasticIndexName })
        await enforceIndexMapping()
      } else {
        // Previously swallowed without a trace; keep the run alive but say so.
        console.warn(`[init] Elastic index mapping could not be enforced: ${err.message}`)
      }
    }
  }

  /* POSTGRESQL */
  /* ========== */
  if (process.env.PG_CONNECTION_STRING) {
    pgClient = await connectPostgres(process.env.PG_CONNECTION_STRING, [
      process.env.PG_DATA_TABLE,
    ])
    // TODO: Handle pgClient closing
  }

  if (process.env.PG_CONNECTION_STRING_NORMALIZED) {
    pgClientNormalized = await connectPostgres(process.env.PG_CONNECTION_STRING_NORMALIZED, [
      process.env.PG_DATA_TABLE,
      process.env.PG_DATA_PRICE_TABLE,
    ])
    // TODO: Handle pgClient closing
  }
}

// inspired by @drobnikj
// TODO: Similar, but less obfuscated for easier debugging
/**
 * Builds a stable unique key for a URL: the SHA-256 hex digest of the URL
 * without its protocol.
 *
 * @param {string} url
 * @returns {string} Hex digest.
 */
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  // Remove protocol; fall back to the whole string when there is none
  // (hash.update(undefined) would throw).
  const cleanUrl = url.split(`://`)[1] ?? url
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}

/**
 * Resolves once the given moment has passed; immediately when it already has.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  const difference = datetime - new Date()
  if (!(difference > 0)) return
  await new Promise((resolve) => {
    setTimeout(resolve, difference)
  })
}

/**
 * Extracts the whole-unit amount from a price string such as `1 290,50 Kč`.
 *
 * A trailing separator followed by exactly two digits is treated as the
 * decimal part and dropped; every other `,` or `.` is treated as a thousands
 * separator.
 *
 * @param {string} string - Raw price text.
 * @returns {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text holds no digits. Currency detection is not implemented, so
 *   `currency` is always undefined.
 */
export function parsePrice (string) {
  const currency = undefined
  const noText = string.replace(/[^\d,.]/g, ``)
  const decimals = noText.match(/[,.]\d{2}$/)
  // The original lacked an `else`, so the decimal-aware result was always
  // overwritten and `12.99` was parsed as 1299.
  const integerPart = decimals ? noText.slice(0, decimals.index) : noText
  const amount = Number.parseInt(integerPart.replace(/[,.]/g, ``), 10)
  return { amount, currency }
}

/**
 * Converts a value to a number, mapping missing or unparsable input to null.
 *
 * @param {*} str
 * @returns {number|null}
 */
export function toNumberOrNull (str) {
  // TODO: Handle better, but only after adding test
  if (str === undefined || str === null || str === ``) return null
  const num = Number(str)
  return Number.isNaN(num) ? null : num
}

/**
 * Adds actor name, process metadata and (on Apify) run identifiers to a record.
 *
 * @param {object} obj - Scraped record.
 * @returns {object} New object; the input is not modified.
 */
function extendWithRunMetadata (obj) {
  const objExtended = {
    ...obj,
    actorName,
    ...globalLogsProps,
  }
  if (process.env.APIFY_IS_AT_HOME) {
    objExtended.__APIFY_ACTOR_ID = process.env.APIFY_ACTOR_ID
    objExtended.__APIFY_ACTOR_RUN_ID = process.env.APIFY_ACTOR_RUN_ID
    objExtended.__APIFY_ACTOR_BUILD_ID = process.env.APIFY_ACTOR_BUILD_ID
    objExtended.__APIFY_ACTOR_BUILD_NUMBER = process.env.APIFY_ACTOR_BUILD_NUMBER
    objExtended.__APIFY_ACTOR_TASK_ID = process.env.APIFY_ACTOR_TASK_ID
  }
  return objExtended
}

/**
 * Quotes an SQL identifier, doubling embedded double quotes.
 *
 * @param {string} identifier
 * @returns {string}
 */
function quoteIdentifier (identifier) {
  return `"${String(identifier).replace(/"/g, `""`)}"`
}

/**
 * Builds a parameterized multi-row INSERT so values are never interpolated
 * into the SQL text.
 *
 * Columns are taken from the first row and every row is read by column name,
 * which keeps values aligned even if a later row has its keys in a different
 * order. Missing values are sent as NULL.
 *
 * @param {string} tableName - Target table in the `public` schema.
 * @param {object[]} rows - Non-empty list of rows.
 * @param {string} [suffix] - Trailing clause, e.g. `ON CONFLICT DO NOTHING`.
 * @returns {{ text: string, values: Array }} Query config for pg's `query()`.
 */
function buildInsertQuery (tableName, rows, suffix = ``) {
  const columns = Object.keys(rows[0])
  const values = []
  const tuples = rows.map((row) => {
    const placeholders = columns.map((column) => {
      values.push(row[column] ?? null)
      return `$${values.length}`
    })
    return `(${placeholders.join(`, `)})`
  })
  const text = `
    INSERT INTO public.${quoteIdentifier(tableName)} (${columns.map(quoteIdentifier).join(`, `)})
    VALUES ${tuples.join(`, `)}
    ${suffix}
  `
  return { text, values }
}

/**
 * Runs an INSERT and logs the affected row count. Unique-constraint
 * violations are downgraded to a warning; any other error is rethrown.
 *
 * @param {object} client - Connected pg client.
 * @param {{ text: string, values: Array }} query
 * @param {string} label - Suffix for the log line, e.g. ` (price)`.
 * @returns {Promise<void>}
 */
async function runInsert (client, query, label) {
  try {
    const { rowCount } = await client.query(query)
    console.log(`[save] saved to database${label}: ${JSON.stringify(rowCount)}`)
  } catch (err) {
    if (err.message.includes(`violates unique constraint`)) console.warn(`PostgresSQL: violates unique constraint`)
    else throw err
  }
}

/**
 * Persists scraped records to every configured target: the Apify dataset
 * (when running on Apify and not disabled), local JSON files (macOS only) and
 * the Postgres databases opened by init().
 *
 * @param {object|object[]} objs - One record or a list of records.
 * @returns {Promise<void>} Resolves after all writes have finished.
 * @throws {Error} On dataset or database failures other than
 *   unique-constraint violations.
 */
export async function save (objs) {
  const items = Array.isArray(objs) ? objs : [objs]
  if (items.length === 0) return

  // if run on Apify
  if (process.env.APIFY_IS_AT_HOME && !isEnvFlagEnabled(`APIFY_DONT_STORE_IN_DATASET`)) {
    // Awaited so callers can rely on the data being stored when save() resolves.
    await Dataset.pushData(items)
  }

  // if runs on local machine (MacOS)
  if (os.platform() === `darwin`) {
    const cwd = process.cwd() // ~/Projects/apify-actors-monorepo/actors
    // ~/Projects/apify-actors-monorepo/actors/foo.storage/data
    const dataDir = path.join(cwd, `${actorName}.storage`, `data`)
    fs.mkdirSync(dataDir, { recursive: true })
    for (const objExtended of items.map(extendWithRunMetadata)) {
      // filenamify() throws on non-strings, so records without any id get a random one.
      const id = objExtended.id ?? objExtended.pid ?? randomUUID()
      const fileName = `${filenamify(String(id))}.json`
      const dataFilePath = path.join(dataDir, fileName) // .../foo.storage/data/foo.json
      fs.writeFileSync(dataFilePath, JSON.stringify(objExtended, null, 2))
    }
  }

  const scrapedAt = new Date().toISOString().split(`T`)[0]

  if (pgClient) {
    const objsPg = items.map((obj) => ({
      ...obj,
      // TODO: This is becoming not nice, and not clear
      shop: actorName,
      scrapedAt,
    }))
    await runInsert(pgClient, buildInsertQuery(process.env.PG_DATA_TABLE, objsPg), ``)
  }

  // Only make sense for HlidacShopu
  if (pgClientNormalized) {
    const objsPgData = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      name: obj.name,
      url: obj.url,
      img: obj.img,
    }))

    const objsPgDataPrice = items.map((obj) => ({
      shop: actorName,
      pid: obj.pid,
      scrapedAt,
      currentPrice: obj.currentPrice,
      originalPrice: obj.originalPrice,
      inStock: obj.inStock,
    }))

    await runInsert(
      pgClientNormalized,
      buildInsertQuery(process.env.PG_DATA_TABLE, objsPgData, `ON CONFLICT DO NOTHING`),
      ` (data)`,
    )
    await runInsert(
      pgClientNormalized,
      buildInsertQuery(process.env.PG_DATA_PRICE_TABLE, objsPgDataPrice, `ON CONFLICT DO NOTHING`),
      ` (price)`,
    )
  }

  if (elasticClient) {
    // TODO: Index documents into `actors-monorepo-shops` (consider using actorName).
    // .index creates or updates the document
    // .create creates a new document if it doesn't exist, 409 if it does
    // When implementing, tolerate "requested resource is currently unavailable"
    // errors by skipping instead of aborting, see
    // https://discuss.elastic.co/t/elasticsearch-503-ok-false-message-the-requested-deployment-is-currently-unavailable/200583
  }
}

/**
 * Copies UPPER_CASE keys of the actor input into process.env.
 *
 * Null and undefined values are skipped, because process.env would store them
 * as the strings "null" / "undefined". Only the key names are logged, since
 * the values may contain credentials such as connection strings.
 *
 * @param {object} [input] - Actor input without the `mode` key.
 * @returns {void}
 */
export function parseEnvFromInput (input) {
  const env = {}
  for (const [key, value] of Object.entries(input ?? {})) {
    if (key !== key.toUpperCase()) continue
    if (value === undefined || value === null) continue
    env[key] = value
  }
  console.log(`[parseEnvFromInput] ${JSON.stringify(Object.keys(env))}`)
  Object.assign(process.env, env)
}
Developer
Maintained by Community
Actor stats
  • 5 users
  • 485 runs
  • Modified about 1 year ago
Categories

You might also like these Actors