Mironet Scraper avatar
Mironet Scraper

Deprecated

Pricing

Pay per usage

Go to Store
Mironet Scraper

Mironet Scraper

Deprecated

Developed by

Petr Cermak

Petr Cermak

Maintained by Community

Scrapes all Mironet.cz products.

0.0 (0)

Pricing

Pay per usage

2

Total users

13

Monthly users

1

Last modified

2 years ago

Dockerfile

# Dockerfile for an Apify actor. The base image name is substituted during the
# actor build based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as
# /usr/src/app or /home/user.
# NOTE(review): the pinned tag v0.21.10 appears to match the "apify" version in
# package.json — keep the two in sync when upgrading.
FROM apify/actor-node-chrome:v0.21.10
# Copy only the package manifests first: they are the sole inputs of the
# "npm install" layer below, so code-only changes reuse the cached install.
COPY package*.json ./
# Install production dependencies only, skipping optional and development
# packages to keep the image small. Quiet the progress bar but print the
# dependency tree and tool versions to aid build debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy the actor source code last, so that builds are fast when only the
# source changed.
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local Node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
  "name": "apify-project",
  "version": "0.0.1",
  "description": "",
  "author": "It's not you it's me",
  "license": "ISC",
  "dependencies": {
    "apify": "0.21.10"
  },
  "scripts": {
    "start": "node main.js"
  }
}

main.js

1const Apify = require('apify');
2
/**
 * Reads a property from a Puppeteer ElementHandle and returns its trimmed
 * string value.
 * @param {ElementHandle} element - Handle to read from (may be null/missing).
 * @param {string} attr - Property name, e.g. 'textContent', 'href' or 'src'.
 * @returns {Promise<string|null>} Trimmed value, or null when the element is
 *   missing or the property value cannot be read and trimmed.
 */
async function getAttribute(element, attr) {
    try {
        const handle = await element.getProperty(attr);
        const value = await handle.jsonValue();
        return value.trim();
    } catch (err) {
        // Best-effort lookup: any failure (null element, non-string value,
        // detached handle) deliberately yields null instead of throwing.
        return null;
    }
}
15
/** Main entry point: crawls Mironet.cz categories and pushes product data. */
Apify.main(async () => {

    // Seed the request queue with the homepage.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: { label: 'start' },
        url: 'https://www.mironet.cz/',
    }));

    // Navigation: block images and stylesheets to save bandwidth, hide the
    // webdriver fingerprint, then open the page with a generous timeout.
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', (intercepted) => {
            const type = intercepted.resourceType();
            if (type === 'image' || type === 'stylesheet') {
                intercepted.abort();
            } else {
                intercepted.continue();
            }
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return page.goto(request.url, { timeout: 200000 });
    };

    // Strips all whitespace and keeps the first run of digits,
    // e.g. "12 990 Kč" -> "12990".
    const toNumber = (p) => p.replace(/\s/g, '').match(/\d+/)[0];

    // Page handler: dispatch on the label stored in userData.
    const handlePageFunction = async ({ page, request }) => {
        const { label } = request.userData;

        if (label === 'start') {
            // Homepage: enqueue every category link.
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.nadpis a',
                pseudoUrls: null,
                userData: { label: 'page' },
            });
            return;
        }

        if (label !== 'page') return;

        // Category page: first enqueue the pagination links.
        await Apify.utils.puppeteer.enqueueLinks({
            page: page,
            requestQueue: requestQueue,
            selector: 'a.PageNew',
            pseudoUrls: null,
            userData: { label: 'page' },
        });

        // Then walk every product card and extract its data.
        const productCards = await page.$$('.item_b');
        for (const card of productCards) {
            try {
                const idElem = await card.$('.item_kod');
                const linkElem = await card.$('.nazev a');
                const priceElem = await card.$('.item_cena');
                const imgElem = await card.$('.item_obr img');
                const oPriceElem = await card.$('.item_s_cena span');
                const img = await getAttribute(imgElem, 'src');
                const link = await getAttribute(linkElem, 'href');
                const id = await getAttribute(idElem, 'textContent');
                const name = await getAttribute(linkElem, 'textContent');
                const price = await getAttribute(priceElem, 'textContent');
                const record = {
                    img: img,
                    itemId: id.match(/\d+/)[0],
                    itemUrl: link,
                    itemName: name,
                    discounted: Boolean(oPriceElem),
                    currentPrice: price ? toNumber(price) : null,
                };
                // Original (pre-discount) price is present only on sale items.
                if (oPriceElem) {
                    const oPrice = await getAttribute(oPriceElem, 'textContent');
                    record.originalPrice = toNumber(oPrice);
                }

                // Save the record to the default dataset.
                await Apify.pushData(record);
            } catch (e) {
                // A malformed card must not abort the rest of the page.
                console.log(e);
            }
        }
    };

    // Assemble the crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Route traffic through Apify Proxy.
        launchPuppeteerOptions: {
            useApifyProxy: true,
        },

        gotoFunction,

        handlePageFunction,

        // Invoked after a request has failed 4 times.
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run the crawler and wait for it to finish.
    await crawler.run();
});