Mironet Scraper avatar

Mironet Scraper

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Mironet Scraper

Mironet Scraper

petr_cermak/mironet-scraper

Scrapes all Mironet.cz products.

Dockerfile

1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# Copy just package.json and package-lock.json first, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=myuser:myuser . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "0.21.10"
9    },
10    "scripts": {
11        "start": "node main.js"
12    }
13}

main.js

1const Apify = require('apify');
2
/**
 * Reads a property from a Puppeteer ElementHandle and returns it as trimmed text.
 *
 * This is a best-effort helper: callers treat `null` as "value not available",
 * so every failure mode is folded into a `null` result instead of throwing.
 *
 * @param {ElementHandle|null} element - The element to read from. May be
 *     null/undefined (e.g. when a selector matched nothing).
 * @param {string} attr - Name of the property to read (e.g. 'href', 'textContent').
 * @returns {Promise<string|null>} The trimmed string value, or null when the
 *     element is missing, the property cannot be read, or its value is not a string.
 */
async function getAttribute(element, attr) {
    // A missing element is an expected case; handle it explicitly instead of
    // relying on a TypeError being thrown and caught below.
    if (element === null || element === undefined) {
        return null;
    }

    let handle = null;
    try {
        handle = await element.getProperty(attr);
        const value = await handle.jsonValue();
        // Only strings can be trimmed; anything else counts as "no value".
        return typeof value === 'string' ? value.trim() : null;
    } catch (e) {
        // Deliberately best-effort: a property that cannot be read yields null.
        return null;
    } finally {
        // Release the property handle so handles do not pile up while
        // iterating over many elements. A failed dispose must not mask the result.
        if (handle && typeof handle.dispose === 'function') {
            try {
                await handle.dispose();
            } catch (disposeError) {
                // Nothing useful can be done if the handle is already gone.
            }
        }
    }
}
15
16/** Main function */
17Apify.main(async () => {
18    
19    // Open request queue and add statrUrl
20    const requestQueue = await Apify.openRequestQueue();
21    await requestQueue.addRequest(new Apify.Request({ 
22        userData: {label: 'start'}, 
23        url: 'https://www.mironet.cz/' 
24    }));
25    
26    // Disable all non-essential requests
27    const gotoFunction = async ({ page, request }) => {
28    	await page.setRequestInterception(true);
29    	page.on('request', intercepted => {
30    	    const type = intercepted.resourceType();
31    	    if(type === 'image' || type === 'stylesheet'){intercepted.abort();}
32    	    else{intercepted.continue();}
33    	});
34    	await Apify.utils.puppeteer.hideWebDriver(page);
35    	return await page.goto(request.url, {timeout: 200000});
36    };
37
38    // Handle page context
39    const handlePageFunction = async ({ page, request }) => {
40        
41        // This is the start page
42        if(request.userData.label === 'start'){
43            
44            // Enqueue category links
45            await Apify.utils.puppeteer.enqueueLinks({
46                page: page, 
47                requestQueue: requestQueue, 
48                selector: '.nadpis a', 
49                pseudoUrls: null, 
50                userData: {label: 'page'}
51            });
52        }
53        
54        // This is the category page
55        else if(request.userData.label === 'page'){
56            
57            // Enqueue pagination pages
58            await Apify.utils.puppeteer.enqueueLinks({
59                page: page, 
60                requestQueue: requestQueue, 
61                selector: 'a.PageNew', 
62                pseudoUrls: null, 
63                userData: {label: 'page'}
64            });
65            
66            // Iterate all products and extract data
67            const items = await page.$$('.item_b');
68            for(const item of items){
69                const toNumber = p => p.replace(/\s/g, '').match(/\d+/)[0];
70                try{
71                    const idElem = await item.$('.item_kod');
72                    const linkElem = await item.$('.nazev a');
73                    const priceElem = await item.$('.item_cena');
74                    const imgElem = await item.$('.item_obr img');
75                    const oPriceElem = await item.$('.item_s_cena span');
76                    const img = await getAttribute(imgElem, 'src');
77                    const link = await getAttribute(linkElem, 'href');
78                    const id = await getAttribute(idElem, 'textContent');
79                    const name = await getAttribute(linkElem, 'textContent');
80                    const price = await getAttribute(priceElem, 'textContent');
81                    const dataItem = {
82                        img: img,
83                        itemId: id.match(/\d+/)[0],
84                        itemUrl: link,
85                        itemName: name,
86                        discounted: oPriceElem ? true : false,
87                        currentPrice: price ? toNumber(price) : null
88                    };
89                    if(oPriceElem){
90                        const oPrice = await getAttribute(oPriceElem, 'textContent');
91                        dataItem.originalPrice = toNumber(oPrice);
92                    }
93                    
94                    // Save data to dataset
95                    await Apify.pushData(dataItem);
96                }
97                catch(e){console.log(e);}
98            }
99        }
100    };
101    
102    // Create crawler
103    const crawler = new Apify.PuppeteerCrawler({
104        requestQueue,
105
106        // Use proxy
107        launchPuppeteerOptions: {
108            useApifyProxy: true
109        },
110
111        gotoFunction,
112
113        handlePageFunction,
114
115        // If request failed 4 times then this function is executed
116        handleFailedRequestFunction: async ({ request }) => {
117            console.log(`Request ${request.url} failed 4 times`);
118        },
119    });
120
121    // Run crawler
122    await crawler.run();
123});
Developer
Maintained by Community
Categories