TSBohemia Scraper
Deprecated
Scrapes all TSBohemia.cz products (see the sample dataset item below for the extracted fields).
Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Total users: 13
Monthly users: 1
Last modified: 2 years ago
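For reference, each product pushed to the dataset by main.js below has roughly the following shape. The field names come from the dataItem object in the code; the values here are only illustrative:

{
    img: 'https://www.tsbohemia.cz/img/example.jpg',
    itemId: '123456',
    itemUrl: 'https://www.tsbohemia.cz/example-product-d123456.html',
    itemName: 'Example product',
    discounted: true,
    currentPrice: '4990',
    originalPrice: '5990'
}

Prices are kept as digit strings with whitespace stripped, and originalPrice is present only for discounted products.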
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.21.10" }, "scripts": { "start": "node main.js" }}
main.js
const Apify = require('apify');

/**
 * Gets an element property (e.g. 'href' or 'textContent') as trimmed text from an ElementHandle.
 * @param {ElementHandle} element - The element to read from.
 * @param {string} attr - Name of the property to read.
 */
async function getAttribute(element, attr){
    try{
        const prop = await element.getProperty(attr);
        return (await prop.jsonValue()).trim();
    }
    catch(e){return null;}
}

/** Main function */
Apify.main(async () => {

    // Open request queue and add the start URL
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: {label: 'start'},
        url: 'https://www.tsbohemia.cz/'
    }));

    // Disable all non-essential requests
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', intercepted => {
            const type = intercepted.resourceType();
            if(type === 'stylesheet'){intercepted.abort();}
            else{intercepted.continue();}
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return await page.goto(request.url, {timeout: 200000});
    };

    // Handle page context
    const handlePageFunction = async ({ page, request }) => {

        // This is the start page
        if(request.userData.label === 'start'){
            await page.waitFor(2000);

            // Download all category links
            const urls = await page.evaluate(async () => {
                let result = [];
                const tcs = $('.i6lt_plus a');
                for(let i = 0; i < tcs.length; i++){
                    const tc = tcs[i];
                    const id = $(tc).attr('data-strid');
                    const resp = await fetch("https://www.tsbohemia.cz/default_jx.asp?show=sptnavigator&strid=" + id, {"credentials":"include","headers":{"accept":"*/*","accept-language":"cs,en-US;q=0.9,en;q=0.8","x-requested-with":"XMLHttpRequest"},"referrer":"https://www.tsbohemia.cz/","referrerPolicy":"no-referrer-when-downgrade","body":null,"method":"GET","mode":"cors"});
                    const text = await resp.text();
                    const subLinks = $('.level6 > li > a', text);
                    result = result.concat(subLinks.toArray().map(sl => $(sl).attr('href')));
                }
                return result;
            });

            // Enqueue category links
            for(const url of urls){
                await requestQueue.addRequest(new Apify.Request({
                    userData: {label: 'page'},
                    url: 'https://www.tsbohemia.cz/' + url
                }));
            }
        }

        // This is the category page
        else if(request.userData.label === 'page'){

            // Enqueue pagination pages
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.numpage a',
                pseudoUrls: null,
                userData: {label: 'page'}
            });

            // Scroll through the whole page to force images to load
            await page.evaluate(async () => {
                const body = document.body;
                const html = document.documentElement;
                const height = Math.max(body.scrollHeight, body.offsetHeight,
                    html.clientHeight, html.scrollHeight, html.offsetHeight);
                let yPos = 0;
                while(yPos < height){
                    yPos += 300;
                    window.scrollTo(0, yPos);
                    await new Promise(r => setTimeout(r, 100));
                }
            });

            // Iterate all products and extract data
            const items = await page.$$('.prodbox');
            for(const item of items){
                // Strip whitespace and keep the leading digits of a price string
                const toNumber = p => p.replace(/\s/g, '').match(/\d+/)[0];
                try{
                    const linkElem = await item.$('h2 a');
                    const imgElem = await item.$('.img img');
                    const oPriceElem = await item.$('.price .mc');
                    const priceElem = await item.$('.price .wvat');
                    const img = await getAttribute(imgElem, 'src');
                    const link = await getAttribute(linkElem, 'href');
                    const name = await getAttribute(linkElem, 'textContent');
                    const price = await getAttribute(priceElem, 'textContent');
                    const dataItem = {
                        img: img,
                        // Numeric product ID taken from the "d<digits>" part of the URL
                        itemId: link.match(/d(\d+)/)[1],
                        itemUrl: link,
                        itemName: name,
                        discounted: oPriceElem ? true : false,
                        currentPrice: price ? toNumber(price) : null
                    };
                    if(oPriceElem){
                        const oPrice = await getAttribute(oPriceElem, 'textContent');
                        dataItem.originalPrice = toNumber(oPrice);
                    }

                    // Save data to dataset
                    await Apify.pushData(dataItem);
                }
                catch(e){console.log(e);}
            }
        }
    };

    // Create crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        gotoFunction,

        // Use proxy
        launchPuppeteerOptions: {
            useApifyProxy: true
        },

        handlePageFunction,

        // If a request fails 4 times, this function is executed
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run crawler
    await crawler.run();
});
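Because prices are stored as digit strings, they need to be converted before doing any arithmetic. A minimal post-processing sketch, assuming the dataset has been exported to a local items.json file (a hypothetical path, not produced by the actor itself), that lists discounted products with an approximate discount percentage:

// Minimal post-processing sketch, not part of the actor.
// Assumes the dataset was exported to items.json (hypothetical file).
const items = require('./items.json');

const discounted = items
    .filter(item => item.discounted && item.originalPrice && item.currentPrice)
    .map(item => ({
        name: item.itemName,
        url: item.itemUrl,
        // Prices are digit strings (see toNumber in main.js), so convert before arithmetic
        discountPercent: Math.round((1 - Number(item.currentPrice) / Number(item.originalPrice)) * 100)
    }));

console.log(discounted);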