TSBohemia Scraper

petr_cermak/tsbohemia-scraper

Deprecated: this Actor is no longer available because the developer has decided to deprecate it.

Scrapes all TSBohemia.cz products.
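Each product found on a category page is stored as one item in the default dataset. The shape below matches the `dataItem` object built in main.js; the values are illustrative, prices are stored as strings of digits, and `originalPrice` is present only when the product is discounted:

{
    "img": "https://www.tsbohemia.cz/img/example-product.jpg",
    "itemId": "123456",
    "itemUrl": "https://www.tsbohemia.cz/example-product-d123456.html",
    "itemName": "Example product",
    "discounted": true,
    "currentPrice": "12990",
    "originalPrice": "14990"
}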

Dockerfile

# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Copy just package.json and package-lock.json first, since they are
# the only files that affect "npm install" in the next step; this speeds up rebuilds
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code to the container.
# Do this in the last step, so the build stays fast when only the source code changes.
COPY --chown=myuser:myuser . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
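A minimal sketch of building and running the image locally. The tag name is arbitrary, and passing `APIFY_PROXY_PASSWORD` is an assumption based on main.js enabling Apify Proxy, which needs a proxy password when running outside the Apify platform:

docker build -t tsbohemia-scraper .
docker run --rm -e APIFY_PROXY_PASSWORD=<your-proxy-password> tsbohemia-scraper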

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

const Apify = require('apify');

/**
 * Gets an attribute as text from an ElementHandle.
 * @param {ElementHandle} element - The element to get the attribute from.
 * @param {string} attr - Name of the attribute to get.
 * @returns {Promise<string|null>} The trimmed attribute value, or null on error.
 */
async function getAttribute(element, attr){
    try{
        const prop = await element.getProperty(attr);
        return (await prop.jsonValue()).trim();
    }
    catch(e){return null;}
}

/** Main function */
Apify.main(async () => {

    // Open the request queue and add the start URL
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: {label: 'start'},
        url: 'https://www.tsbohemia.cz/'
    }));

    // Block stylesheet requests to speed up page loads
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', intercepted => {
            const type = intercepted.resourceType();
            if(type === 'stylesheet'){intercepted.abort();}
            else{intercepted.continue();}
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return await page.goto(request.url, {timeout: 200000});
    };

    // Handle page context
    const handlePageFunction = async ({ page, request }) => {

        // This is the start page
        if(request.userData.label === 'start'){
            await page.waitFor(2000);

            // Download all category links
            const urls = await page.evaluate(async () => {
                let result = [];
                const tcs = $('.i6lt_plus a');
                for(let i = 0; i < tcs.length; i++){
                    const tc = tcs[i];
                    const id = $(tc).attr('data-strid');
                    const resp = await fetch("https://www.tsbohemia.cz/default_jx.asp?show=sptnavigator&strid=" + id, {"credentials":"include","headers":{"accept":"*/*","accept-language":"cs,en-US;q=0.9,en;q=0.8","x-requested-with":"XMLHttpRequest"},"referrer":"https://www.tsbohemia.cz/","referrerPolicy":"no-referrer-when-downgrade","body":null,"method":"GET","mode":"cors"});
                    const text = await resp.text();
                    const subLinks = $('.level6 > li > a', text);
                    result = result.concat(subLinks.toArray().map(sl => $(sl).attr('href')));
                }
                return result;
            });

            // Enqueue category links
            for(const url of urls){
                await requestQueue.addRequest(new Apify.Request({
                    userData: {label: 'page'},
                    url: 'https://www.tsbohemia.cz/' + url
                }));
            }
        }

        // This is a category page
        else if(request.userData.label === 'page'){

            // Enqueue pagination pages
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.numpage a',
                pseudoUrls: null,
                userData: {label: 'page'}
            });

            // Scroll through the whole page to force lazy-loaded images to load
            await page.evaluate(async () => {
                const body = document.body;
                const html = document.documentElement;
                const height = Math.max(body.scrollHeight, body.offsetHeight,
                    html.clientHeight, html.scrollHeight, html.offsetHeight);
                let yPos = 0;
                while(yPos < height){
                    yPos += 300;
                    window.scrollTo(0, yPos);
                    await new Promise(r => setTimeout(r, 100));
                }
            });

            // Iterate over all products and extract their data
            const items = await page.$$('.prodbox');
            for(const item of items){
                // Strips whitespace and keeps the first run of digits, e.g. "12 990 Kč" -> "12990"
                const toNumber = p => p.replace(/\s/g, '').match(/\d+/)[0];
                try{
                    const linkElem = await item.$('h2 a');
                    const imgElem = await item.$('.img img');
                    const oPriceElem = await item.$('.price .mc');
                    const priceElem = await item.$('.price .wvat');
                    const img = await getAttribute(imgElem, 'src');
                    const link = await getAttribute(linkElem, 'href');
                    const name = await getAttribute(linkElem, 'textContent');
                    const price = await getAttribute(priceElem, 'textContent');
                    const dataItem = {
                        img: img,
                        itemId: link.match(/d(\d+)/)[1],
                        itemUrl: link,
                        itemName: name,
                        discounted: oPriceElem ? true : false,
                        currentPrice: price ? toNumber(price) : null
                    };
                    if(oPriceElem){
                        const oPrice = await getAttribute(oPriceElem, 'textContent');
                        dataItem.originalPrice = toNumber(oPrice);
                    }

                    // Save the item to the default dataset
                    await Apify.pushData(dataItem);
                }
                catch(e){console.log(e);}
            }
        }
    };

    // Create the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Use Apify Proxy
        launchPuppeteerOptions: {
            useApifyProxy: true
        },

        gotoFunction,

        handlePageFunction,

        // If a request fails 4 times, this function is executed
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run the crawler
    await crawler.run();
});
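A minimal way to run the scraper locally without Docker, sketched under the assumption that Node.js is installed. Because main.js sets `useApifyProxy: true`, the SDK expects an `APIFY_PROXY_PASSWORD` environment variable when run outside the Apify platform; scraped items then land in the local dataset directory, typically apify_storage/datasets/default:

npm install
APIFY_PROXY_PASSWORD=<your-proxy-password> npm start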