Obchodni Rejstrik Downloader avatar
Obchodni Rejstrik Downloader

Pricing

Pay per usage

Go to Store
Obchodni Rejstrik Downloader

Obchodni Rejstrik Downloader

Developed by

Josef Válek

Maintained by Community

Downloads data from the Czech company registry (https://or.justice.cz/).

0.0 (0)

Pricing

Pay per usage

0

Monthly users

1

Runs succeeded

>99%

Last modified

a month ago

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20 AS builder
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install all dependencies. Don't audit to speed up the installation.
14RUN npm install --include=dev --audit=false
15
16# Next, copy the source files using the user set
17# in the base image.
18COPY . ./
19
20# Build the project: compile the TypeScript sources into dist/.
21# Dependencies were already installed in the step above.
22RUN npm run build
23
24# Create final image
25FROM apify/actor-node:20
26
27# Check preinstalled packages
28RUN npm ls crawlee apify puppeteer playwright
29
30# Copy just package.json and package-lock.json
31# to speed up the build using Docker layer cache.
32COPY package*.json ./
33
34# Install NPM packages, skip optional and development dependencies to
35# keep the image small. Avoid logging too much and print the dependency
36# tree for debugging
37RUN npm --quiet set progress=false \
38    && npm install --omit=dev --omit=optional \
39    && echo "Installed NPM packages:" \
40    && (npm list --omit=dev --all || true) \
41    && echo "Node.js version:" \
42    && node --version \
43    && echo "NPM version:" \
44    && npm --version \
45    && rm -r ~/.npm
46
47# Copy built JS files from builder image
48COPY --from=builder /usr/src/app/dist ./dist
49
50# Next, copy the remaining files and directories with the source code.
51# Since we do this after NPM install, quick build will be really fast
52# for most source file changes.
53COPY . ./
54
55
56# Run the image.
57CMD npm run start:prod --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-25",
4    "title": "Project Cheerio Crawler Typescript",
5    "description": "Crawlee and Cheerio project in typescript.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "ts-crawlee-cheerio"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "Obchodni rejstrik downloader",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "companyIco": {
7            "title": "ICO of the company you want to scrape",
8            "type": "string",
9            "editor": "textfield",
10            "description": "ICO of the company you want to scrape",
11            "prefill": "04788290"
12        }
13    },
14    "required": ["companyIco"]
15}

src/main.ts

1import { Actor, log } from 'apify';
2import { CheerioCrawler, Dataset } from 'crawlee';
3
/** Shape of the input object this script reads through `Actor.getInput<Input>()`. */
interface Input {
    /** Company ICO; non-digit characters are stripped before it is sent as the `ico` query parameter. */
    companyIco: string;
}
7
// Initialize the Actor before the rest of the script calls Actor.getInput(),
// Actor.openKeyValueStore(), Actor.setValue() and Actor.pushData().
await Actor.init();
9
/**
 * Looks up a header value in a flat raw-header list of the form
 * `[name1, value1, name2, value2, ...]`.
 *
 * Only the name positions (even indices) are compared, so a header *value*
 * that happens to equal `name` is never mistaken for a header name.
 *
 * @param rawHeaders Alternating header names and values.
 * @param name Header name to find; compared case-insensitively.
 * @returns The value of the first matching header, or `undefined` when the
 * header is not present (previously a missing header returned `rawHeaders[0]`).
 */
const getHeaderValue = (rawHeaders: string[], name: string): string | null | undefined => {
    const wanted = name.toLowerCase();
    for (let i = 0; i + 1 < rawHeaders.length; i += 2) {
        if (rawHeaders[i]?.toLowerCase() === wanted) {
            return rawHeaders[i + 1];
        }
    }
    return undefined;
};
13
/**
 * Derives the content type and a storage-safe file name for a downloaded
 * document from its raw response headers.
 *
 * @param prefix Text placed at the start of the generated file name.
 * @param rawHeaders Raw response headers, passed through to `getHeaderValue`.
 * @returns `contentType` (defaults to `text/plain` when the lookup yields
 * nothing) and `filename` in the form `<prefix>-<timestamp ms>-<sanitized name>`,
 * where the name falls back to `unknown` if no file name can be extracted.
 */
const getFileMetadata = (prefix: string, rawHeaders: string[]) => {
    const contentType = getHeaderValue(rawHeaders, 'content-type') ?? 'text/plain';
    const disposition = getHeaderValue(rawHeaders, 'content-disposition') ?? '';

    // Stop at the closing quote so trailing parameters (e.g. `; size=...`) are
    // not swallowed into the name, and also accept an unquoted `filename=value`.
    const match = disposition.match(/filename="([^"]*)"|filename=([^";\s]+)/i);
    const filename = match?.[1] || match?.[2] || 'unknown';

    // Replace every character outside [a-zA-Z0-9_.-] with a dash.
    const safeName = filename.replace(/[^a-zA-Z0-9_.-]/g, '-');

    return {
        contentType,
        filename: `${prefix}-${Date.now()}-${safeName}`,
    };
};
23
24
// Read the Actor input and fail early with a clear message when the ICO is
// missing, instead of crashing later on `companyIco.replace(...)`.
const input = await Actor.getInput<Input>();
const companyIco = input?.companyIco;
if (typeof companyIco !== 'string' || companyIco.trim() === '') {
    throw new Error('Missing required input field "companyIco" (expected a non-empty string).');
}

// Key-value store handle; used by the request handler to build public file URLs.
const kvs = await Actor.openKeyValueStore();

// Request labels that tell the request handler which kind of page it is processing.
const LABELS = {
    START: 'START',
    SBIRKA_LISTIN: 'SBIRKA_LISTIN',
    LISTINA: 'LISTINA',
} as const;
36
// Crawler that walks: search page -> document list -> document detail, and on
// each detail page downloads every linked file into the key-value store and
// pushes one dataset record per file.
const crawler = new CheerioCrawler({
    // TODO: Is this using proxy properly?
    proxyConfiguration: await Actor.createProxyConfiguration(),
    maxConcurrency: 10,
    requestHandler: async ({ enqueueLinks, request, $, sendRequest }) => {
        if (request.label === LABELS.START) {
            log.info('Enqueuing urls from search page...');
            await enqueueLinks({
                selector: 'a[href^="./vypis-sl"]',
                label: LABELS.SBIRKA_LISTIN,
            });
            return;
        }

        if (request.label === LABELS.SBIRKA_LISTIN) {
            log.info('Enqueuing URLs from document list...');
            await enqueueLinks({
                selector: 'a[href^="./vypis-sl-detail"]',
                label: LABELS.LISTINA,
            });
            return;
        }

        if (request.label !== LABELS.LISTINA) {
            return;
        }

        const links = $('a[href^=/ias/content/download]').toArray();
        log.info(`Found ${links.length} links`);

        // Downloads stay best-effort (one failure does not abort the others),
        // but each failure is logged with its URL instead of being dropped silently.
        await Promise.all(links.map(async (link) => {
            const downloadUrl = `https://or.justice.cz${link.attribs.href}`;
            try {
                log.info('Downloading document...');
                const response = await sendRequest({ url: downloadUrl });
                // For some reason, we can only access raw headers
                const { contentType, filename } = getFileMetadata('file', response.rawHeaders);
                await Actor.setValue(filename, response.rawBody, { contentType });
                await Actor.pushData({
                    url: request.url,
                    filename,
                    fileUrl: kvs.getPublicUrl(filename),
                });
            } catch (error: unknown) {
                log.error(`Failed to download document from ${downloadUrl}`, {
                    pageUrl: request.url,
                    error: error instanceof Error ? error.message : String(error),
                });
            }
        }));
    },
});
75
// Build the registry search URL for the requested company.
const startUrl = new URL('https://or.justice.cz/ias/ui/rejstrik-$firma');
// Strip ALL non-digit characters; without the `g` flag only the first one was
// removed, so an input such as "047 882 90" still contained a space.
startUrl.searchParams.set('ico', companyIco.replace(/[^0-9]/g, ''));

// Start the crawl from the search page.
await crawler.run([
    { url: startUrl.toString(), label: LABELS.START },
]);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();

.dockerignore

1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "root": true,
3    "env": {
4        "browser": true,
5        "es2020": true,
6        "node": true
7    },
8    "extends": [
9        "@apify/eslint-config-ts"
10    ],
11    "parserOptions": {
12        "project": "./tsconfig.json",
13        "ecmaVersion": 2020
14    },
15    "ignorePatterns": [
16        "node_modules",
17        "dist",
18        "**/*.d.ts"
19    ]
20}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5.vscode
6dist
7node_modules
8apify_storage
9storage

package.json

1{
2    "name": "crawlee-cheerio-typescript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is a boilerplate of an Apify actor.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.2.6",
11        "crawlee": "^3.11.5"
12    },
13    "devDependencies": {
14        "@apify/eslint-config-ts": "^0.3.0",
15        "@apify/tsconfig": "^0.1.0",
16        "@typescript-eslint/eslint-plugin": "^7.18.0",
17        "@typescript-eslint/parser": "^7.18.0",
18        "eslint": "^8.50.0",
19        "tsx": "^4.6.2",
20        "typescript": "^5.3.3"
21    },
22    "scripts": {
23        "start": "npm run start:dev",
24        "start:prod": "node dist/main.js",
25        "start:dev": "tsx src/main.ts",
26        "build": "tsc",
27        "lint": "eslint ./src --ext .ts",
28        "lint:fix": "eslint ./src --ext .ts --fix",
29        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
30    },
31    "author": "It's not you it's me",
32    "license": "ISC"
33}

tsconfig.json

1{
2    "extends": "@apify/tsconfig",
3    "compilerOptions": {
4        "module": "NodeNext",
5        "moduleResolution": "NodeNext",
6        "target": "ES2022",
7        "outDir": "dist",
8        "noUnusedLocals": false,
9        "skipLibCheck": true,
10        "lib": ["DOM"]
11    },
12    "include": [
13        "./src/**/*"
14    ]
15}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.