TSBohemia Scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
TSBohemia Scraper
petr_cermak/tsbohemia-scraper
Scrapes all TSBohemia.cz products.
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
1const Apify = require('apify');
2
/**
 * Reads a property of an ElementHandle and returns it as trimmed text.
 *
 * This is a best-effort helper: it never throws. It resolves to `null` when
 * the element is missing, when the property value is not a string, or when
 * reading the property fails for any reason.
 *
 * @param {ElementHandle|null} element - The element to read the property from.
 * @param {string} attr - Name of the property to read (e.g. 'href', 'textContent').
 * @returns {Promise<string|null>} The trimmed string value, or `null`.
 */
async function getAttribute(element, attr) {
    // A selector that matched nothing yields a null handle; treat it as "no value"
    // explicitly instead of relying on a thrown TypeError.
    if (!element) {
        return null;
    }
    try {
        const prop = await element.getProperty(attr);
        const value = await prop.jsonValue();
        // Only strings can be trimmed; anything else counts as "no value".
        return typeof value === 'string' ? value.trim() : null;
    } catch (e) {
        // Deliberate best-effort: callers treat null as "attribute unavailable".
        return null;
    }
}
15
/**
 * Main function.
 *
 * Starts at the tsbohemia.cz home page, collects category links, enqueues
 * them (and their pagination pages) as 'page' requests, and pushes one
 * dataset record per product box found on every category page.
 */
Apify.main(async () => {
    const BASE_URL = 'https://www.tsbohemia.cz/';

    /**
     * Extracts the first run of digits from a price text after removing all
     * whitespace. The digits are returned as a string (not a Number), matching
     * the format already stored in the dataset.
     *
     * @param {string|null} text - Raw price text.
     * @returns {string|null} Digit string, or null when there is nothing to extract.
     */
    const toNumber = (text) => {
        if (typeof text !== 'string') {
            return null;
        }
        const match = text.replace(/\s/g, '').match(/\d+/);
        return match ? match[0] : null;
    };

    // Open request queue and add the start URL
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: { label: 'start' },
        url: BASE_URL,
    }));

    // Navigation hook: block stylesheet requests and hide the webdriver flag
    // before opening the page.
    // NOTE(review): the crawler may apply its own navigation timeout on top of
    // the 200 s passed to page.goto here — confirm against the SDK version in use.
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', (intercepted) => {
            if (intercepted.resourceType() === 'stylesheet') {
                intercepted.abort();
            } else {
                intercepted.continue();
            }
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return page.goto(request.url, { timeout: 200000 });
    };

    // Handle page context
    const handlePageFunction = async ({ page, request }) => {
        // This is the start page
        if (request.userData.label === 'start') {
            await page.waitFor(2000);

            // Download all category links. Runs inside the browser and relies
            // on the page's own jQuery ($).
            const urls = await page.evaluate(async () => {
                let result = [];
                const tcs = $('.i6lt_plus a');
                for (let i = 0; i < tcs.length; i++) {
                    const id = $(tcs[i]).attr('data-strid');
                    // Without an id there is no sub-navigation to request.
                    if (!id) {
                        continue;
                    }
                    const resp = await fetch("https://www.tsbohemia.cz/default_jx.asp?show=sptnavigator&strid=" + id, {
                        "credentials": "include",
                        "headers": {
                            "accept": "*/*",
                            "accept-language": "cs,en-US;q=0.9,en;q=0.8",
                            "x-requested-with": "XMLHttpRequest"
                        },
                        "referrer": "https://www.tsbohemia.cz/",
                        "referrerPolicy": "no-referrer-when-downgrade",
                        "body": null,
                        "method": "GET",
                        "mode": "cors"
                    });
                    const text = await resp.text();
                    const subLinks = $('.level6 > li > a', text);
                    result = result.concat(subLinks.toArray().map(sl => $(sl).attr('href')));
                }
                return result;
            });

            // Enqueue category links
            for (const url of urls) {
                // Anchors without an href come back empty; nothing to enqueue.
                if (!url) {
                    continue;
                }
                // Resolve against the site root so relative, root-relative and
                // absolute hrefs all produce a well-formed URL.
                const resolved = new URL(url, BASE_URL);
                if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
                    continue;
                }
                await requestQueue.addRequest(new Apify.Request({
                    userData: { label: 'page' },
                    url: resolved.href,
                }));
            }
        }

        // This is the category page
        else if (request.userData.label === 'page') {
            // Enqueue pagination pages
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.numpage a',
                pseudoUrls: null,
                userData: { label: 'page' },
            });

            // Scroll through whole page to force images to load
            await page.evaluate(async () => {
                const body = document.body;
                const html = document.documentElement;
                const height = Math.max(body.scrollHeight, body.offsetHeight,
                    html.clientHeight, html.scrollHeight, html.offsetHeight);
                let yPos = 0;
                while (yPos < height) {
                    yPos += 300;
                    window.scrollTo(0, yPos);
                    await new Promise(r => setTimeout(r, 100));
                }
            });

            // Iterate all products and extract data
            const items = await page.$$('.prodbox');
            for (const item of items) {
                try {
                    const linkElem = await item.$('h2 a');
                    const imgElem = await item.$('.img img');
                    const oPriceElem = await item.$('.price .mc');
                    const priceElem = await item.$('.price .wvat');
                    const img = await getAttribute(imgElem, 'src');
                    const link = await getAttribute(linkElem, 'href');
                    const name = await getAttribute(linkElem, 'textContent');
                    const price = await getAttribute(priceElem, 'textContent');

                    // The product id is taken from the "d<digits>" part of the link;
                    // a product without it cannot be identified, so skip it.
                    const idMatch = link ? link.match(/d(\d+)/) : null;
                    if (!idMatch) {
                        console.log(`Skipping product without a recognizable id (link: ${link})`);
                        continue;
                    }

                    const dataItem = {
                        img: img,
                        itemId: idMatch[1],
                        itemUrl: link,
                        itemName: name,
                        discounted: Boolean(oPriceElem),
                        currentPrice: toNumber(price),
                    };
                    if (oPriceElem) {
                        const oPrice = await getAttribute(oPriceElem, 'textContent');
                        dataItem.originalPrice = toNumber(oPrice);
                    }

                    // Save data to dataset
                    await Apify.pushData(dataItem);
                } catch (e) {
                    // One broken product box must not abort the rest of the page.
                    console.log(e);
                }
            }
        }
    };

    // Create crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Use proxy
        launchPuppeteerOptions: {
            useApifyProxy: true,
        },

        // Custom navigation (request blocking + webdriver hiding)
        gotoFunction,

        handlePageFunction,

        // If request failed 4 times then this function is executed
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run crawler
    await crawler.run();
});
Developer
Maintained by Community
Categories