
Cyklobazar (cyklobazar.cz) scraper RSS
Pricing
Pay per usage
Go to Store


Cyklobazar (cyklobazar.cz) scraper RSS
Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.
0.0 (0)
Pricing
Pay per usage
2
Total users
21
Monthly users
4
Runs succeeded
>99%
Last modified
a year ago
Dockerfile
FROM apify/actor-node:18
COPY package.json ./
RUN npm --quiet set progress=false \ && npm install --only=prod --no-optional
COPY . ./
INPUT_SCHEMA.json
{ "title": "Cyklobazar (cyklobazar.cz) scraper RSS", "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.", "type": "object", "schemaVersion": 1, "properties": { "urls": { "title": "Urls", "description": "", "type": "array", "editor": "requestListSources", "prefill": [ { "url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon" } ] }, "APIFY_USE_MEMORY_REQUEST_QUEUE": { "sectionCaption": "Advanced", "sectionDescription": "Advanced options, use only if you know what you're doing.", "title": "Use in-memory request queue instead of the native one", "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.", "type": "boolean", "default": false, "editor": "checkbox" } }, "required": [ "urls" ]}
apify.json
{ "name": "cyklobazar-cyklobazar-cz-scraper-rss", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
1import Apify from "apify2";2
/* Every accepted start URL must begin with this origin; relative offer links are resolved against it. */
const BASE_URL = `https://www.cyklobazar.cz`;

/*
 * Actor entry point.
 *
 * 1. Reads the actor input and validates every start URL.
 * 2. Enqueues the start URLs; while crawling a first page (a URL without
 *    `vp-page=`) it also enqueues pages 2..N found in the paginator.
 * 3. Extracts every non-pinned offer on each page and stores it as a
 *    { title, description, link, guid, pubDate } record in the default dataset.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  /* Validate input */
  let invalidInput = false;
  for (const source of urls) {
    const url = source?.url;
    // A source without a string `url` would otherwise crash on `.startsWith`
    // with an unhelpful TypeError, so report it as invalid input instead.
    if (typeof url !== `string`) {
      console.error(`Input source ${JSON.stringify(source)} has no "url"`);
      invalidInput = true;
      continue;
    }
    if (!url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
    }
    if (url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  // True only for Date objects holding a real timestamp; an "Invalid Date"
  // is truthy, so a plain `!date` check would let it through.
  const isValidDate = (value) =>
    value instanceof Date && !Number.isNaN(value.getTime());

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // Strict class match to avoid `.paginator__item .paginator__item--next`.
        // When there is no paginator the text is empty, totalPages is NaN and
        // the loop below does not run.
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, String(page));
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      const items = [];
      const offerEls = $(`.layout__main .cb-offer-list .cb-offer`).toArray();
      for (const el of offerEls) {
        const $offer = $(el);
        const title = $offer.find(`h4`).text().trim();

        // Pinned offers are skipped (presumably they also appear in the
        // regular listing - verify against the site).
        if ($offer.hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, $offer.find(`h4`).text());
          continue;
        }

        const urlRel = $offer.attr(`href`);
        if (!urlRel) {
          console.log(`Skipping offer without link`, { title });
          continue;
        }
        // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6 -> 621592
        const id = urlRel.split(`/`)[2];

        const dateRaw = $offer
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          ?.replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);

        if (!isValidDate(date)) {
          // On the "card" view, which is used on profile pages, the date is
          // taken from the uploaded image path:
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-...
          const imgSrc = $offer.find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            continue;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1,
            Number.parseInt(day, 10)
          );
        }

        if (!isValidDate(date)) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          continue;
        }

        const desc = $offer.find(`.cb-offer__desc`).text();
        const price = $offer.find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = $offer
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          .trim();
        const brand = $offer.find(`.cb-offer__tag-brand`).text().trim();
        let user = $offer.find(`.cb-offer__tag-user`).text().trim();

        if (!user) {
          // Fall back to the slug of the crawled profile URL, e.g.
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek -> jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1];
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      }

      // Await the write so the page is not marked as handled before its data
      // is stored, and so a failed write surfaces as a request error.
      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
/**
 * Parses a timestamp in the site's format, e.g. `31. 5. 2022, 14:36`
 * (day. month. year, hour:minute), into a Date.
 *
 * The components are passed to the multi-argument Date constructor, so they
 * are interpreted in the runtime's local time zone. A missing time part is
 * treated as 00:00.
 *
 * @param {string | null | undefined} dateString - Raw timestamp text.
 * @returns {Date | null} The parsed Date, or null when the input is empty or
 *   cannot be parsed (never an "Invalid Date" object).
 */
function dateFromString(dateString) {
  if (!dateString) return null;

  const [datePart, timePart] = dateString.split(`,`).map((s) => s.trim());
  const [day, month, year] = datePart
    .split(`.`)
    .map((s) => Number.parseInt(s, 10));

  // Without a comma there is no time part; default to midnight instead of
  // throwing on `undefined.split`.
  let hour = 0;
  let minute = 0;
  if (timePart) {
    [hour, minute] = timePart.split(`:`).map((s) => Number.parseInt(s, 10));
  }

  const parsed = new Date(year, month - 1, day, hour, minute);
  // An Invalid Date is truthy and would slip past `if (!date)` checks in
  // callers, so report unparseable input as null.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}
package.json
{ "name": "cyklobazar-cyklobazar-cz-scraper-rss", "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "apify2": "npm:apify@^2.3.2", "apify": "^2.3.2" }, "apify": { "title": "Cyklobazar (cyklobazar.cz) scraper RSS", "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "ECOMMERCE" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "cyklobazar-cyklobazar-cz-scraper-rss", "title": "Cyklobazar (cyklobazar.cz) scraper RSS", "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "Cyklobazar (cyklobazar.cz) scraper RSS", "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "title", "description", "link", "guid", "pubDate" ] }, "display": { "component": "table", "columns": [ { "label": "Title", "field": "title", "format": "text" }, { "label": "Description", "field": "description", "format": "text" }, { "label": "Link", "field": "link", "format": "text" }, { "label": "Guid", "field": "guid", "format": "text" }, { "label": "Pub Date", "field": "pubDate", "format": "text" } ] } } } } }}