TSBohemia Scraper

Deprecated

Developed by Petr Cermak

Maintained by Community
Scrapes all TSBohemia.cz products.

Pricing: Pay per usage

Total users: 13

Monthly users: 1

Last modified: 2 years ago
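
Each scraped product is saved as one dataset record. Based on main.js below, a record has roughly the following shape (the values here are illustrative, prices are extracted as digit strings, and originalPrice is present only for discounted items):

{
    "img": "https://www.tsbohemia.cz/example.jpg",
    "itemId": "123456",
    "itemUrl": "https://www.tsbohemia.cz/example-d123456.html",
    "itemName": "Example product",
    "discounted": true,
    "currentPrice": "12990",
    "originalPrice": "14990"
}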

Dockerfile

# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10
# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}
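
With these scripts, npm start simply runs main.js; note that the apify dependency is pinned to 0.21.10, the same version as the actor-node-chrome base image in the Dockerfile above.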

main.js

const Apify = require('apify');

/**
 * Gets an attribute as text from an ElementHandle.
 * @param {ElementHandle} element - The element to get the attribute from.
 * @param {string} attr - Name of the attribute to get.
 */
async function getAttribute(element, attr){
    try{
        const prop = await element.getProperty(attr);
        return (await prop.jsonValue()).trim();
    }
    catch(e){return null;}
}
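
// Example usage (hypothetical handle): read the "src" property of an image element:
//   const imgElem = await page.$('.img img');
//   const src = await getAttribute(imgElem, 'src');
// Returns null instead of throwing when the element or property is missing.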

/** Main function */
Apify.main(async () => {

    // Open the request queue and add the start URL
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: {label: 'start'},
        url: 'https://www.tsbohemia.cz/'
    }));

    // Block stylesheet requests to speed up page loads
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', intercepted => {
            const type = intercepted.resourceType();
            if(type === 'stylesheet'){intercepted.abort();}
            else{intercepted.continue();}
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return await page.goto(request.url, {timeout: 200000});
    };
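
    // More resource types could be aborted the same way if needed, e.g.:
    //   if(['stylesheet', 'font', 'image'].includes(type)){intercepted.abort();}
    // Images are left enabled here, since the category pages lazy-load product
    // images whose "src" attributes are scraped below.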

    // Handle page context
    const handlePageFunction = async ({ page, request }) => {

        // This is the start page
        if(request.userData.label === 'start'){
            await page.waitFor(2000);

            // Download all category links
            const urls = await page.evaluate(async () => {
                let result = [];
                const tcs = $('.i6lt_plus a');
                for(let i = 0; i < tcs.length; i++){
                    const tc = tcs[i];
                    const id = $(tc).attr('data-strid');
                    const resp = await fetch("https://www.tsbohemia.cz/default_jx.asp?show=sptnavigator&strid=" + id, {"credentials":"include","headers":{"accept":"*/*","accept-language":"cs,en-US;q=0.9,en;q=0.8","x-requested-with":"XMLHttpRequest"},"referrer":"https://www.tsbohemia.cz/","referrerPolicy":"no-referrer-when-downgrade","body":null,"method":"GET","mode":"cors"});
                    const text = await resp.text();
                    const subLinks = $('.level6 > li > a', text);
                    result = result.concat(subLinks.toArray().map(sl => $(sl).attr('href')));
                }
                return result;
            });

            // Enqueue category links
            for(const url of urls){
                await requestQueue.addRequest(new Apify.Request({
                    userData: {label: 'page'},
                    url: 'https://www.tsbohemia.cz/' + url
                }));
            }
        }

        // This is the category page
        else if(request.userData.label === 'page'){

            // Enqueue pagination pages
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.numpage a',
                pseudoUrls: null,
                userData: {label: 'page'}
            });

            // Scroll through the whole page to force lazy-loaded images to load
            await page.evaluate(async () => {
                const body = document.body;
                const html = document.documentElement;
                const height = Math.max(body.scrollHeight, body.offsetHeight,
                    html.clientHeight, html.scrollHeight, html.offsetHeight);
                let yPos = 0;
                while(yPos < height){
                    yPos += 300;
                    window.scrollTo(0, yPos);
                    await new Promise(r => setTimeout(r, 100));
                }
            });

            // Iterate all products and extract data
            const items = await page.$$('.prodbox');
            for(const item of items){
                // Strip whitespace and keep the leading digits, e.g. "12 990 Kč" -> "12990"
                const toNumber = p => p.replace(/\s/g, '').match(/\d+/)[0];
                try{
                    const linkElem = await item.$('h2 a');
                    const imgElem = await item.$('.img img');
                    const oPriceElem = await item.$('.price .mc');
                    const priceElem = await item.$('.price .wvat');
                    const img = await getAttribute(imgElem, 'src');
                    const link = await getAttribute(linkElem, 'href');
                    const name = await getAttribute(linkElem, 'textContent');
                    const price = await getAttribute(priceElem, 'textContent');
                    const dataItem = {
                        img: img,
                        itemId: link.match(/d(\d+)/)[1],
                        itemUrl: link,
                        itemName: name,
                        discounted: oPriceElem ? true : false,
                        currentPrice: price ? toNumber(price) : null
                    };
                    if(oPriceElem){
                        const oPrice = await getAttribute(oPriceElem, 'textContent');
                        dataItem.originalPrice = toNumber(oPrice);
                    }

                    // Save data to dataset
                    await Apify.pushData(dataItem);
                }
                catch(e){console.log(e);}
            }
        }
    };

    // Create the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Use Apify Proxy
        launchPuppeteerOptions: {
            useApifyProxy: true
        },

        // Use the custom gotoFunction defined above
        gotoFunction,

        handlePageFunction,

        // If a request fails 4 times, this function is executed
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run the crawler
    await crawler.run();
});
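
The scraped items end up in the run's default dataset. With the same apify 0.21.x SDK they could be read back roughly like this (a minimal sketch, assuming the standard local or platform storage is configured):

const Apify = require('apify');

Apify.main(async () => {
    // Open the default dataset of the current run and print a few items
    const dataset = await Apify.openDataset();
    const { items } = await dataset.getData({ limit: 10 });
    items.forEach(item => console.log(item.itemName, item.currentPrice));
});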