Cyklobazar (cyklobazar.cz) scraper RSS avatar

Cyklobazar (cyklobazar.cz) scraper RSS

Try for free

No credit card required

View all Actors
Cyklobazar (cyklobazar.cz) scraper RSS

Cyklobazar (cyklobazar.cz) scraper RSS

strajk/cyklobazar-cyklobazar-cz-scraper-rss
Try for free

No credit card required

Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.

Dockerfile

1FROM apify/actor-node:18
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6  && npm install --only=prod --no-optional
7
8COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
3  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "urls": {
8      "title": "Urls",
9      "description": "",
10      "type": "array",
11      "editor": "requestListSources",
12      "prefill": [
13        {
14          "url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon"
15        }
16      ]
17    },
18    "APIFY_USE_MEMORY_REQUEST_QUEUE": {
19      "sectionCaption": "Advanced",
20      "sectionDescription": "Advanced options, use only if you know what you're doing.",
21      "title": "Use in-memory request queue instead of the native one",
22      "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
23      "type": "boolean",
24      "default": false,
25      "editor": "checkbox"
26    }
27  },
28  "required": [
29    "urls"
30  ]
31}

apify.json

1{
2  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1import Apify from "apify2";
2
3const BASE_URL = `https://www.cyklobazar.cz`;
4
/**
 * Actor entry point.
 *
 * 1. Reads the `urls` input (falls back to a sample profile URL) and validates
 *    that every URL starts with BASE_URL and is a first page (no `vp-page=`).
 * 2. Enqueues the URLs; when a first page is crawled, enqueues pages 2..N taken
 *    from the paginator.
 * 3. On every page, converts each non-pinned offer into an RSS-like record
 *    (`title`, `description`, `link`, `guid`, `pubDate`) and stores the records
 *    in the default dataset.
 *
 * Throws `Error("Invalid input")` when any input URL fails validation.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  // An Invalid Date object is truthy, so `!date` alone is not a sufficient check.
  const isValidDate = (value) =>
    value instanceof Date && !Number.isNaN(value.getTime());

  /* Validate input - report every offending URL before failing */
  let invalidInput = false;
  for (const { url } of urls) {
    if (typeof url !== `string` || !url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
      continue; // the remaining check needs a string
    }
    if (url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // strict class match to avoid `.paginator__item .paginator__item--next`
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        // When there is no paginator, totalPages is NaN and the loop is a no-op.
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, page.toString());
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      // Collected here and pushed once below, so the push is awaited and any
      // storage error fails (and retries) the request instead of being lost.
      const items = [];
      $(`.layout__main .cb-offer-list .cb-offer`).each((i, el) => {
        const offer = $(el);

        // Pinned offers are skipped entirely.
        if (offer.hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, offer.find(`h4`).text());
          return;
        }

        const title = offer.find(`h4`).text()?.trim();
        const urlRel = offer.attr(`href`);
        if (!urlRel) {
          // Without a link there is neither an id nor a target; skip the element
          // rather than let `.split` throw and fail the whole page.
          console.log(`No href found, skipping`, { title });
          return;
        }
        const id = urlRel // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6
          .split(`/`)[2]; // 621592

        const dateRaw = offer
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          ?.replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);

        if (!isValidDate(date)) {
          // on "card" view, which is used on profile pages, we have to take date from uploaded image
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-
          const imgSrc = offer.find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            return;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1, // Date months are 0-based
            Number.parseInt(day, 10)
          );
        }

        if (!isValidDate(date)) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          return;
        }

        const desc = offer.find(`.cb-offer__desc`).text();
        const price = offer.find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = offer
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          ?.trim();
        const brand = offer.find(`.cb-offer__tag-brand`).text()?.trim();
        let user = offer.find(`.cb-offer__tag-user`).text()?.trim();

        if (!user) {
          // Profile pages carry no user tag; take the slug from the page URL, e.g.
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1];
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      });

      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
119
/**
 * Parses a listing timestamp in the `D. M. YYYY, HH:MM` form used by the site.
 *
 * The Date is built from numeric components, so it is interpreted in the
 * process's local time zone, e.g.
 * `31. 5. 2022, 14:36` -> 2022-05-31T14:36:00.000Z when the process runs in UTC.
 *
 * @param {string | null | undefined} dateString - Raw timestamp text.
 * @returns {Date | null} The parsed date, or `null` when the input is empty or
 *   does not yield a valid date (missing time part, non-numeric pieces, ...).
 */
function dateFromString(dateString) {
  if (!dateString) return null;
  // Default both parts to empty strings so a missing comma yields NaN components
  // (and therefore `null`) instead of a TypeError on `undefined.split`.
  const [datePart = ``, timePart = ``] = dateString
    .split(`,`)
    .map((s) => s.trim());
  const [day, month, year] = datePart
    .split(`.`)
    .map((s) => Number.parseInt(s, 10));
  const [hour, minute] = timePart
    .split(`:`)
    .map((s) => Number.parseInt(s, 10));
  const parsed = new Date(year, month - 1, day, hour, minute); // months are 0-based
  // An Invalid Date is truthy; return null so callers can test with `!date`.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}

package.json

1{
2  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify2": "npm:apify@^2.3.2",
11    "apify": "^2.3.2"
12  },
13  "apify": {
14    "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
15    "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
16    "isPublic": true,
17    "isDeprecated": false,
18    "isAnonymouslyRunnable": true,
19    "notice": "",
20    "pictureUrl": "",
21    "seoTitle": "",
22    "seoDescription": "",
23    "categories": [
24      "ECOMMERCE"
25    ]
26  }
27}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
4  "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
5  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
11      "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "title",
19              "description",
20              "link",
21              "guid",
22              "pubDate"
23            ]
24          },
25          "display": {
26            "component": "table",
27            "columns": [
28              {
29                "label": "Title",
30                "field": "title",
31                "format": "text"
32              },
33              {
34                "label": "Description",
35                "field": "description",
36                "format": "text"
37              },
38              {
39                "label": "Link",
40                "field": "link",
41                "format": "text"
42              },
43              {
44                "label": "Guid",
45                "field": "guid",
46                "format": "text"
47              },
48              {
49                "label": "Pub Date",
50                "field": "pubDate",
51                "format": "text"
52              }
53            ]
54          }
55        }
56      }
57    }
58  }
59}

.actor/logo.png

Developer
Maintained by Community

Actor Metrics

  • 3 monthly users

  • 2 stars

  • >99% runs succeeded

  • Created in Aug 2022

  • Modified 7 months ago

Categories