Cyklobazar (cyklobazar.cz) scraper RSS avatar
Cyklobazar (cyklobazar.cz) scraper RSS

Pricing

Pay per usage

Go to Store
Cyklobazar (cyklobazar.cz) scraper RSS

Cyklobazar (cyklobazar.cz) scraper RSS

Developed by

Pavel Dolecek

Pavel Dolecek

Maintained by Community

Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.

0.0 (0)

Pricing

Pay per usage

2

Total users

21

Monthly users

4

Runs succeeded

>99%

Last modified

a year ago

Dockerfile

# Build on the apify/actor-node image, tag 18.
# NOTE(review): no CMD/ENTRYPOINT is declared in this file; presumably the base
# image supplies the start command (package.json defines "start") — confirm.
FROM apify/actor-node:18
# Copy only the package manifest first, before the rest of the sources.
COPY package.json ./
# Disable npm's progress output, then install production dependencies only,
# skipping optional packages.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional
# Copy the remaining project files into the image.
COPY . ./

INPUT_SCHEMA.json

{
"title": "Cyklobazar (cyklobazar.cz) scraper RSS",
"description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
"type": "object",
"schemaVersion": 1,
"properties": {
"urls": {
"title": "Urls",
"description": "",
"type": "array",
"editor": "requestListSources",
"prefill": [
{
"url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon"
}
]
},
"APIFY_USE_MEMORY_REQUEST_QUEUE": {
"sectionCaption": "Advanced",
"sectionDescription": "Advanced options, use only if you know what you're doing.",
"title": "Use in-memory request queue instead of the native one",
"description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
"type": "boolean",
"default": false,
"editor": "checkbox"
}
},
"required": [
"urls"
]
}

apify.json

{
"name": "cyklobazar-cyklobazar-cz-scraper-rss",
"version": "0.1",
"buildTag": "latest",
"env": null,
"defaultRunOptions": {
"build": "latest",
"timeoutSecs": 3600,
"memoryMbytes": 1024
}
}

main.js

1import Apify from "apify2";
2
const BASE_URL = `https://www.cyklobazar.cz`;

/**
 * Actor entry point.
 *
 * Reads `urls` from the actor input, validates them, crawls every listing
 * page (following the paginator from each first page) and pushes one
 * RSS-shaped record per offer to the default dataset:
 * `{ title, description, link, guid, pubDate }`.
 *
 * @throws {Error} `Invalid input` when any URL is missing, does not start with
 *   BASE_URL, or already contains the `vp-page=` pagination parameter.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  /** True only for Date objects that hold a real timestamp (not `Invalid Date`). */
  const isValidDate = (value) =>
    value instanceof Date && !Number.isNaN(value.getTime());

  /* Validate input */
  let invalidInput = false;
  for (const { url } of urls) {
    // Sources without a plain string `url` would otherwise crash with a TypeError below.
    if (typeof url !== `string`) {
      console.error(`Input source is missing a "url" string`);
      invalidInput = true;
      continue;
    }
    if (!url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
    }
    if (url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // strict class match to avoid `.paginator__item .paginator__item--next`
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        // When no paginator is present totalPages is NaN and the loop is skipped.
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, page.toString());
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      // Records are collected first and pushed with a single awaited call, so
      // the handler does not finish before the data is stored and storage
      // errors surface as a failed request instead of being dropped.
      const items = [];
      const offerEls = $(`.layout__main .cb-offer-list .cb-offer`).toArray();
      for (const el of offerEls) {
        const $el = $(el);

        // Pinned offers are skipped.
        // NOTE(review): an earlier comment mentioned deduplicating pinned
        // offers later, but no deduplication exists in this file.
        if ($el.hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, $el.find(`h4`).text());
          continue;
        }

        const title = $el.find(`h4`).text().trim();
        const urlRel = $el.attr(`href`);
        if (!urlRel) {
          console.log(`No href found on offer element`, { title });
          continue;
        }
        // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6 -> 621592
        const id = urlRel.split(`/`)[2];

        const dateRaw = $el
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          ?.replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);

        if (!isValidDate(date)) {
          // on "card" view, which is used on profile pages, we have to take date from uploaded image
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-
          const imgSrc = $el.find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            continue;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1,
            Number.parseInt(day, 10)
          );
        }

        // An `Invalid Date` is truthy, so test the timestamp explicitly;
        // otherwise `toISOString()` below would throw a RangeError.
        if (!isValidDate(date)) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          continue;
        }

        const desc = $el.find(`.cb-offer__desc`).text();
        const price = $el.find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = $el
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          .trim();
        const brand = $el.find(`.cb-offer__tag-brand`).text().trim();
        let user = $el.find(`.cb-offer__tag-user`).text().trim();

        if (!user) {
          // Fall back to the user slug in the request URL, e.g.
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek -> jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1];
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      }

      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
119
/**
 * Parses a timestamp in the `D. M. YYYY, HH:MM` form into a Date.
 *
 * Example: `31. 5. 2022, 14:36` -> 31 May 2022, 14:36.
 * The fields are passed to the multi-argument `Date` constructor, so the
 * result is expressed in the local timezone of the running process (it equals
 * `2022-05-31T14:36:00.000Z` only when that timezone is UTC).
 *
 * @param {string | null | undefined} dateString - Raw text to parse. The time
 *   part after the comma is optional and defaults to 00:00.
 * @returns {Date | null} The parsed date, or `null` when the input is empty,
 *   not a string, or does not contain integer day, month and year fields.
 */
function dateFromString(dateString) {
  if (!dateString || typeof dateString !== `string`) return null;

  const [datePart, timePart = ``] = dateString
    .split(`,`)
    .map((part) => part.trim());

  const [day, month, year] = datePart
    .split(`.`)
    .map((part) => Number.parseInt(part, 10));

  // A missing time (or missing minutes) falls back to zero instead of throwing.
  const [hour = 0, minute = 0] = timePart
    ? timePart.split(`:`).map((part) => Number.parseInt(part, 10))
    : [];

  // Rejects both NaN (unparseable text) and undefined (missing field).
  const fields = [day, month, year, hour, minute];
  if (!fields.every((field) => Number.isInteger(field))) return null;

  const parsed = new Date(year, month - 1, day, hour, minute);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}

package.json

{
"name": "cyklobazar-cyklobazar-cz-scraper-rss",
"description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
"type": "module",
"scripts": {
"start": "node ./main.js",
"push-to-apify-platform": "npx apify push"
},
"dependencies": {
"apify2": "npm:apify@^2.3.2",
"apify": "^2.3.2"
},
"apify": {
"title": "Cyklobazar (cyklobazar.cz) scraper RSS",
"description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
"isPublic": true,
"isDeprecated": false,
"isAnonymouslyRunnable": true,
"notice": "",
"pictureUrl": "",
"seoTitle": "",
"seoDescription": "",
"categories": [
"ECOMMERCE"
]
}
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "cyklobazar-cyklobazar-cz-scraper-rss",
"title": "Cyklobazar (cyklobazar.cz) scraper RSS",
"description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
"version": "0.1.0",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "Cyklobazar (cyklobazar.cz) scraper RSS",
"description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
"views": {
"overview": {
"title": "Overview",
"description": "Overview of the most important fields",
"transformation": {
"fields": [
"title",
"description",
"link",
"guid",
"pubDate"
]
},
"display": {
"component": "table",
"columns": [
{
"label": "Title",
"field": "title",
"format": "text"
},
{
"label": "Description",
"field": "description",
"format": "text"
},
{
"label": "Link",
"field": "link",
"format": "text"
},
{
"label": "Guid",
"field": "guid",
"format": "text"
},
{
"label": "Pub Date",
"field": "pubDate",
"format": "text"
}
]
}
}
}
}
}
}

.actor/logo.png