import { Actor, log } from 'apify';
import { CheerioCrawler } from 'crawlee';
import * as cheerio from 'cheerio';

// Expected actor input, e.g. { "companyIco": "<8-digit IČO>" }.
interface Input {
    companyIco: string;
}

await Actor.init();

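// Raw response headers arrive as a flat [name, value, name, value, ...] array,
// so the value of a header sits right after its (case-insensitive) name.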
const getHeaderValue = (rawHeaders: string[], name: string): string | undefined => {
    const index = rawHeaders.findIndex((h) => h.toLowerCase() === name);
    // findIndex returns -1 for a missing header; without this guard the lookup would wrongly return rawHeaders[0].
    return index === -1 ? undefined : rawHeaders[index + 1];
};

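// Derives the content type and a storage-safe filename from the response headers.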
const getFileMetadata = (prefix: string, rawHeaders: string[]) => {
    const contentType = getHeaderValue(rawHeaders, 'content-type') ?? 'text/plain';
    const filename = getHeaderValue(rawHeaders, 'content-disposition')?.match(/filename="(.*)"/)?.[1] || 'unknown';

    return {
        contentType,
        // Key-value store keys only allow a restricted character set, so sanitize the filename.
        filename: `${prefix}-${filename.replace(/[^a-zA-Z0-9_.-]/g, '-')}`,
    };
};

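// Downloads a single document, stores it in the key-value store,
// and pushes a record with its public URL to the default dataset.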
const downloadPdf = async (
    fileIdentifier: string,
    downloadUrl: string,
    fileCategory: string,
    documentUrl: string,
    { sendRequest }: Record<string, any>,
) => {
    log.info('Downloading document', { downloadUrl, fileCategory });
    const response = await sendRequest({ url: downloadUrl });

    const { contentType, filename } = getFileMetadata(fileIdentifier, response.rawHeaders);

    // Instead of the file, the server sometimes returns an HTML page mentioning
    // "časová platnost" (the time validity of the link has expired).
    // Throwing here makes the crawler retry the request.
    if (contentType.startsWith('text/html')) {
        const $res = cheerio.load(response.rawBody);
        if ($res('h2').text().includes('časová platnost')) {
            throw new Error('Failed to get the document in time. Will retry.');
        }
    }

    // Skip files that were already stored by a previous run or retry.
    const doc = await Actor.getValue(filename);
    if (doc) {
        return;
    }

    await Actor.setValue(filename, response.rawBody, { contentType });
    await Actor.pushData({
        url: documentUrl,
        filename,
        fileUrl: kvs.getPublicUrl(filename),
        fileCategory,
    });
};

const { companyIco } = await Actor.getInput<Input>() ?? {} as Input;

if (!companyIco) {
    throw new Error('Missing required input field "companyIco".');
}

// Default key-value store; the same store Actor.setValue() writes to, used to build public file URLs.
const kvs = await Actor.openKeyValueStore();

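// The crawl has three stages: the search result page (START), the list of filed
// documents (SBIRKA_LISTIN, "sbírka listin"), and a single document detail page (DOKUMENT).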
const LABELS = {
    START: 'START',
    SBIRKA_LISTIN: 'SBIRKA_LISTIN',
    DOKUMENT: 'DOKUMENT',
} as const;

const DOCUMENT_CATEGORY = {
    LISTINA: 'LISTINA',
    UPLNY_VYPIS: 'UPLNY_VYPIS',
    VYPIS_PLATNYCH: 'VYPIS_PLATNYCH',
} as const;

const requestQueue = await Actor.openRequestQueue();

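// Requests are routed to one of three handlers by their label (see LABELS above).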
const crawler = new CheerioCrawler({
    proxyConfiguration: await Actor.createProxyConfiguration(),
    maxConcurrency: 10,
    maxRequestRetries: 7,
    requestHandler: async ({ request, $, sendRequest }) => {
        if (request.label === LABELS.START) {
            log.info('Determining subjektId of the organization...');

            // Grab the first link pointing to the current ("PLATNY") extract and
            // read the subjektId from its query string.
            const [link] = $('a[href*="PLATNY"]').toArray();
            const subjektId = link
                ? new URLSearchParams(link.attribs.href.split('?')[1]).get('subjektId')
                : null;

            if (!subjektId) {
                log.error('subjektId not found, cannot continue.');
                return;
            }

            log.info('Found subjektId', { subjektId });

            // Enqueue the document collection (sbírka listin) page of the company.
            const documentsUrl = new URL('https://or.justice.cz/ias/ui/vypis-sl-firma');
            documentsUrl.searchParams.set('subjektId', subjektId);
            await requestQueue.addRequests([{ url: documentsUrl.toString(), label: LABELS.SBIRKA_LISTIN }]);

            // Download the extract of current records and the full extract as PDFs.
            const baseUrl = new URL('https://or.justice.cz/ias/ui/print-pdf');
            baseUrl.searchParams.set('subjektId', subjektId);

            baseUrl.searchParams.set('typVypisu', 'PLATNY');
            await downloadPdf(
                'vypis-platnych',
                baseUrl.toString(),
                DOCUMENT_CATEGORY.VYPIS_PLATNYCH,
                request.url,
                { sendRequest },
            );

            baseUrl.searchParams.set('typVypisu', 'UPLNY');
            await downloadPdf(
                'uplny-vypis',
                baseUrl.toString(),
                DOCUMENT_CATEGORY.UPLNY_VYPIS,
                request.url,
                { sendRequest },
            );
        } else if (request.label === LABELS.SBIRKA_LISTIN) {
            // Each filed document has its own detail page; rebuild clean detail URLs
            // from the relative links and enqueue them.
            const documentsLinks = $('a[href^="./vypis-sl-detail"]').toArray().map((link) => {
                const params = new URLSearchParams(link.attribs.href.split('?')[1]);
                const url = new URL('https://or.justice.cz/ias/ui/vypis-sl-detail');
                const dokumentId = params.get('dokument')!;
                url.searchParams.set('dokument', dokumentId);
                url.searchParams.set('subjektId', params.get('subjektId')!);
                url.searchParams.set('spis', params.get('spis')!);
                return {
                    url: url.toString(),
                    userData: {
                        label: LABELS.DOKUMENT,
                        dokumentId,
                    },
                };
            });

            log.info('Found documents', { count: documentsLinks.length });

            await requestQueue.addRequests(documentsLinks);
        } else if (request.label === LABELS.DOKUMENT) {
            // A single filing can consist of several files; download all of them in parallel.
            const links = $('a[href^="/ias/content/download"]').toArray();
            log.info('Found document links', { count: links.length });

            const { dokumentId } = request.userData;

            await Promise.all(links.map(async (link, idx) => {
                const downloadUrl = `https://or.justice.cz${link.attribs.href}`;
                await downloadPdf(
                    `doc-${dokumentId}-part-${idx}`,
                    downloadUrl,
                    DOCUMENT_CATEGORY.LISTINA,
                    request.url,
                    { sendRequest },
                );
            }));
        }
    },
});

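// The search endpoint looks the company up by its identification number (IČO).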
const startUrl = new URL('https://or.justice.cz/ias/ui/rejstrik-$firma');
startUrl.searchParams.set('ico', companyIco.replace(/[^0-9]/g, ''));

await crawler.run([
    { url: startUrl.toString(), label: LABELS.START },
]);

await Actor.exit();