Rulemaking Proposals
Deprecated · View all Actors
![Rulemaking Proposals](https://apify.com/img/store/actor_picture.svg)
This Actor is deprecated
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Rulemaking Proposals
nondescript_cord/rulemaking-proposals
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv
crawlee.json
1{
2 "purgeOnStart": false
3}
Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
package.json
1{
2 "name": "consumerfinance",
3 "version": "0.0.1",
4 "type": "commonjs",
5 "description": "consumerfinance rules extractor",
6 "dependencies": {
7 "22": "^0.0.0",
8 "apify": "^3.1.10",
9 "crawlee": "^3.5.4",
10 "fs": "^0.0.1-security",
11 "node-fetch": "^3.3.2",
12 "path": "^0.12.7",
13 "pdf-parse": "^1.1.1",
14 "playwright": "^1.43.1"
15 },
16 "devDependencies": {
17 "@apify/eslint-config": "^0.4.0",
18 "@types/pdf-parse": "^1.1.4",
19 "eslint": "^8.50.0",
20 "@playwright/test": "^1.43.1"
21 },
22 "scripts": {
23 "start": "node src/linksdata.js && node src/main.js",
24 "lint": "eslint ./src --ext .js,.jsx",
25 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
26 "test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1",
27 "postinstall": "npx crawlee install-playwright-browsers"
28 },
29 "author": "Moazzam Malek"
30}
playwright.config.js
1// @ts-check
2const { defineConfig, devices } = require('@playwright/test');
3
4/**
5 * Read environment variables from file.
6 * https://github.com/motdotla/dotenv
7 */
8// require('dotenv').config();
9
10/**
11 * @see https://playwright.dev/docs/test-configuration
12 */
13module.exports = defineConfig({
14 testDir: './tests',
15 /* Run tests in files in parallel */
16 fullyParallel: true,
17 /* Fail the build on CI if you accidentally left test.only in the source code. */
18 forbidOnly: !!process.env.CI,
19 /* Retry on CI only */
20 retries: process.env.CI ? 2 : 0,
21 /* Opt out of parallel tests on CI. */
22 workers: process.env.CI ? 1 : undefined,
23 /* Reporter to use. See https://playwright.dev/docs/test-reporters */
24 reporter: 'html',
25 /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */
26 use: {
27 /* Base URL to use in actions like `await page.goto('/')`. */
28 // baseURL: 'http://127.0.0.1:3000',
29
30 /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */
31 trace: 'on-first-retry',
32 },
33
34 /* Configure projects for major browsers */
35 projects: [
36 {
37 name: 'chromium',
38 use: { ...devices['Desktop Chrome'] },
39 },
40
41 {
42 name: 'firefox',
43 use: { ...devices['Desktop Firefox'] },
44 },
45
46 {
47 name: 'webkit',
48 use: { ...devices['Desktop Safari'] },
49 },
50
51 /* Test against mobile viewports. */
52 // {
53 // name: 'Mobile Chrome',
54 // use: { ...devices['Pixel 5'] },
55 // },
56 // {
57 // name: 'Mobile Safari',
58 // use: { ...devices['iPhone 12'] },
59 // },
60
61 /* Test against branded browsers. */
62 // {
63 // name: 'Microsoft Edge',
64 // use: { ...devices['Desktop Edge'], channel: 'msedge' },
65 // },
66 // {
67 // name: 'Google Chrome',
68 // use: { ...devices['Desktop Chrome'], channel: 'chrome' },
69 // },
70 ],
71
72 /* Run your local dev server before starting the tests */
73 // webServer: {
74 // command: 'npm run start',
75 // url: 'http://127.0.0.1:3000',
76 // reuseExistingServer: !process.env.CI,
77 // },
78
79 globalTimeout: 60*1000,
80});
start.bat
Download
start.sh
Download
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "Rulemaking-proposals",
4 "title": "Project Playwright Crawler JavaScript",
5 "description": "Crawlee and Playwright project in JavaScript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-playwright-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/input_schema.json
1{
2 "title": "PlaywrightCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with.",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://apify.com"
14 }
15 ]
16 }
17 }
18}
src/get.js
1const fs = require('fs').promises;
2const path = require('path');
3
/**
 * Load the link records stored in `links.json` next to this module.
 *
 * This is a best-effort loader: if the file is missing, unreadable, not valid
 * JSON, or does not contain a JSON array, the problem is logged and an empty
 * array is returned so the caller can decide how to proceed.
 *
 * @returns {Promise<Array>} The parsed array from `links.json`, or `[]` on any failure.
 */
async function getLinks() {
    const filePath = path.join(__dirname, 'links.json');
    try {
        const raw = await fs.readFile(filePath, 'utf-8');
        const parsed = JSON.parse(raw);
        // Callers iterate over the result, so anything but an array is
        // treated the same way as an unreadable file.
        if (!Array.isArray(parsed)) {
            console.error('Error reading links.json:', new TypeError('expected a JSON array'));
            return [];
        }
        return parsed;
    } catch (error) {
        console.error('Error reading links.json:', error);
        return [];
    }
}
15
16module.exports = { getLinks };
src/links.json
1[
2 {
3 "href": "https://www.regulations.gov/document/NCUA-2024-0026-0001"
4 },
5 {
6 "href": "https://www.regulations.gov/document/NCUA-2023-0070-0007"
7 },
8 {
9 "href": "https://www.regulations.gov/document/NCUA-2024-0008-0001"
10 },
11 {
12 "href": "https://www.regulations.gov/document/NCUA-2023-0142-0001"
13 },
14 {
15 "href": "https://www.regulations.gov/document/NCUA-2023-0137-0003"
16 },
17 {
18 "href": "https://www.regulations.gov/document/NCUA-2023-0117-0001"
19 },
20 {
21 "href": "https://www.regulations.gov/document/NCUA-2023-0023-0001"
22 },
23 {
24 "href": "https://www.regulations.gov/document/NCUA-2023-0082-0001"
25 },
26 {
27 "href": "https://www.regulations.gov/document/NCUA-2023-0118-0003"
28 },
29 {
30 "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0347"
31 },
32 {
33 "href": "https://www.regulations.gov/document/NCUA-2023-0072-0001"
34 },
35 {
36 "href": "https://www.regulations.gov/document/NCUA-2023-0070-0001"
37 },
38 {
39 "href": "https://www.regulations.gov/document/NCUA-2023-0061-0001"
40 },
41 {
42 "href": "https://www.regulations.gov/document/NCUA-2023-0019-0001"
43 },
44 {
45 "href": "https://www.regulations.gov/document/NCUA-2023-0043-0009"
46 },
47 {
48 "href": "https://www.regulations.gov/document/NCUA-2023-0045-0001"
49 },
50 {
51 "href": "https://www.regulations.gov/document/NCUA-2022-0138-0023"
52 },
53 {
54 "href": "https://www.regulations.gov/docket/NCUA-2022-0179/"
55 },
56 {
57 "href": "https://www.regulations.gov/docket/NCUA-2022-0099"
58 },
59 {
60 "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0310"
61 },
62 {
63 "href": "https://www.regulations.gov/docket/NCUA-2022-0185"
64 },
65 {
66 "href": "https://www.regulations.gov/docket/NCUA-2022-0145"
67 },
68 {
69 "href": "https://www.regulations.gov/docket/NCUA-2022-0138"
70 },
71 {
72 "href": "https://www.regulations.gov/docket/NCUA-2022-0132"
73 },
74 {
75 "href": "https://www.regulations.gov/docket/NCUA-2022-0123"
76 },
77 {
78 "href": "https://www.regulations.gov/docket/NCUA-2022-0099"
79 },
80 {
81 "href": "https://www.regulations.gov/document/NCUA-2022-0008-0007"
82 },
83 {
84 "href": "https://www.regulations.gov/docket/NCUA-2022-0008"
85 },
86 {
87 "href": "https://www.regulations.gov/docket/NCUA-2022-0005"
88 },
89 {
90 "href": "https://www.regulations.gov/docket/NCUA-2022-0016"
91 },
92 {
93 "href": "https://www.regulations.gov/docket/NCUA-2022-0041"
94 },
95 {
96 "href": "https://www.regulations.gov/docket/NCUA-2022-0038"
97 },
98 {
99 "href": "https://www.regulations.gov/docket/NCUA-2021-0072"
100 },
101 {
102 "href": "https://www.regulations.gov/docket/NCUA-2020-0125"
103 },
104 {
105 "href": "https://www.regulations.gov/docket/NCUA-2022-0040"
106 },
107 {
108 "href": "https://www.regulations.gov/docket/NCUA-2021-0100"
109 },
110 {
111 "href": "https://www.regulations.gov/docket/NCUA-2021-0149"
112 },
113 {
114 "href": "https://www.regulations.gov/docket/NCUA-2021-0079"
115 },
116 {
117 "href": "https://www.regulations.gov/docket/NCUA-2021-0036"
118 },
119 {
120 "href": "https://www.regulations.gov/docket/NCUA-2021-0127"
121 },
122 {
123 "href": "https://www.regulations.gov/docket/NCUA-2021-0102"
124 },
125 {
126 "href": "https://www.regulations.gov/docket/NCUA-2021-0072"
127 },
128 {
129 "href": "https://www.regulations.gov/docket/NCUA-2020-0074"
130 },
131 {
132 "href": "https://www.regulations.gov/docket/NCUA-2020-0114"
133 },
134 {
135 "href": "https://www.regulations.gov/docket/NCUA-2020-0107"
136 },
137 {
138 "href": "https://www.regulations.gov/docket/NCUA-2021-0038"
139 },
140 {
141 "href": "https://www.regulations.gov/docket/NCUA-2021-0056"
142 },
143 {
144 "href": "https://www.regulations.gov/docket/NCUA-2021-0036"
145 },
146 {
147 "href": "https://www.regulations.gov/docket/NCUA-2021-0037"
148 },
149 {
150 "href": "https://www.regulations.gov/docket/NCUA-2021-0045"
151 },
152 {
153 "href": "https://www.regulations.gov/docket/NCUA-2021-0042"
154 },
155 {
156 "href": "https://www.regulations.gov/docket/NCUA-2020-0056"
157 },
158 {
159 "href": "https://www.regulations.gov/docket/NCUA-2020-0098"
160 },
161 {
162 "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
163 },
164 {
165 "href": "https://www.regulations.gov/docket/NCUA-2021-0028"
166 },
167 {
168 "href": "https://www.regulations.gov/docket/NCUA-2021-0010"
169 },
170 {
171 "href": "https://www.regulations.gov/docket/NCUA-2021-0030"
172 },
173 {
174 "href": "https://www.regulations.gov/docket/NCUA-2021-0006"
175 },
176 {
177 "href": "https://www.regulations.gov/docket/NCUA-2022-0042"
178 },
179 {
180 "href": "https://www.regulations.gov/docket/NCUA-2020-0080"
181 },
182 {
183 "href": "https://www.regulations.gov/docket/NCUA-2020-0081"
184 },
185 {
186 "href": "https://www.regulations.gov/docket/NCUA-2020-0016"
187 },
188 {
189 "href": "https://www.regulations.gov/docket/NCUA-2022-0043"
190 },
191 {
192 "href": "https://www.regulations.gov/docket/NCUA-2020-0125"
193 },
194 {
195 "href": "https://www.regulations.gov/docket/NCUA-2021-0019"
196 },
197 {
198 "href": "https://www.regulations.gov/docket/NCUA-2021-0021"
199 },
200 {
201 "href": "https://www.regulations.gov/docket/NCUA-2021-0001"
202 },
203 {
204 "href": "https://www.regulations.gov/docket/NCUA-2020-0114"
205 },
206 {
207 "href": "https://www.regulations.gov/docket/NCUA-2020-0116"
208 },
209 {
210 "href": "https://www.regulations.gov/docket/NCUA-2020-0098"
211 },
212 {
213 "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
214 },
215 {
216 "href": "https://www.regulations.gov/docket/NCUA-2020-0107"
217 },
218 {
219 "href": "https://www.regulations.gov/docket/NCUA-2022-0036"
220 },
221 {
222 "href": "https://www.regulations.gov/docket/NCUA-2022-0026"
223 },
224 {
225 "href": "https://www.regulations.gov/docket/NCUA-2020-0080"
226 },
227 {
228 "href": "https://www.regulations.gov/docket/NCUA-2020-0081"
229 },
230 {
231 "href": "https://www.regulations.gov/docket/NCUA-2020-0074"
232 },
233 {
234 "href": "https://www.regulations.gov/docket/NCUA-2022-0037"
235 },
236 {
237 "href": "https://www.regulations.gov/docket/NCUA-2020-0064"
238 },
239 {
240 "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0143"
241 },
242 {
243 "href": "https://www.regulations.gov/docket/NCUA-2020-0054"
244 },
245 {
246 "href": "https://www.regulations.gov/docket/NCUA-2020-0056"
247 },
248 {
249 "href": "https://www.regulations.gov/docket/NCUA-2020-0043"
250 },
251 {
252 "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
253 },
254 {
255 "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
256 },
257 {
258 "href": "https://www.regulations.gov/docket/NCUA-2022-0035"
259 },
260 {
261 "href": "https://www.regulations.gov/docket/NCUA-2020-0044"
262 },
263 {
264 "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
265 },
266 {
267 "href": "https://www.regulations.gov/docket/NCUA-2020-0016"
268 },
269 {
270 "href": "https://www.regulations.gov/docket/NCUA-2020-0015"
271 },
272 {
273 "href": "https://www.regulations.gov/docket/NCUA-2022-0034"
274 },
275 {
276 "href": "https://www.regulations.gov/docket/NCUA-2022-0025"
277 },
278 {
279 "href": "https://www.regulations.gov/docket/NCUA-2022-0033"
280 },
281 {
282 "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
283 },
284 {
285 "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0076"
286 },
287 {
288 "href": "https://www.regulations.gov/docket/NCUA-2022-0028"
289 },
290 {
291 "href": "https://www.regulations.gov/docket/NCUA-2022-0026"
292 },
293 {
294 "href": "https://www.regulations.gov/docket/NCUA-2019-0121"
295 },
296 {
297 "href": "https://www.regulations.gov/docket/NCUA-2022-0027"
298 },
299 {
300 "href": "https://www.regulations.gov/docket/NCUA-2018-0030"
301 },
302 {
303 "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
304 },
305 {
306 "href": "https://www.regulations.gov/docket/NCUA-2019-0120"
307 },
308 {
309 "href": "https://www.regulations.gov/docket/NCUA-2022-0044"
310 },
311 {
312 "href": "https://www.regulations.gov/docket/NCUA-2022-0030"
313 },
314 {
315 "href": "https://www.regulations.gov/docket/NCUA-2018-0051"
316 },
317 {
318 "href": "https://www.regulations.gov/docket/NCUA-2022-0033"
319 },
320 {
321 "href": "https://www.regulations.gov/docket/NCUA-2022-0025"
322 },
323 {
324 "href": "https://www.regulations.gov/docket/NCUA-2022-0028"
325 },
326 {
327 "href": "https://www.regulations.gov/docket/NCUA-2022-0029"
328 },
329 {
330 "href": "https://www.regulations.gov/docket/NCUA-2018-0041"
331 },
332 {
333 "href": "https://www.regulations.gov/docket/NCUA-2022-0027"
334 },
335 {
336 "href": "https://www.regulations.gov/docket/NCUA-2016-0075"
337 },
338 {
339 "href": "https://www.regulations.gov/docket/NCUA-2022-0032"
340 },
341 {
342 "href": "https://www.regulations.gov/docket/NCUA-2022-0031"
343 },
344 {
345 "href": "https://www.regulations.gov/docket/NCUA-2022-0030"
346 },
347 {
348 "href": "https://www.regulations.gov/docket/NCUA-2018-0039"
349 },
350 {
351 "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
352 },
353 {
354 "href": "https://www.regulations.gov/docket/NCUA-2018-0050"
355 },
356 {
357 "href": "https://www.regulations.gov/docket/NCUA-2018-0051"
358 },
359 {
360 "href": "https://www.regulations.gov/docket/NCUA-2018-0040"
361 },
362 {
363 "href": "https://www.regulations.gov/docket/NCUA-2018-0041"
364 },
365 {
366 "href": "https://www.regulations.gov/docket/NCUA-2018-0039"
367 },
368 {
369 "href": "https://www.regulations.gov/docket/NCUA-2018-0033"
370 },
371 {
372 "href": "https://www.regulations.gov/docket/NCUA-2018-0034"
373 },
374 {
375 "href": "https://www.regulations.gov/docket/NCUA-2018-0031"
376 },
377 {
378 "href": "https://www.regulations.gov/docket/NCUA-2018-0027"
379 },
380 {
381 "href": "https://www.regulations.gov/docket/NCUA-2018-0030"
382 },
383 {
384 "href": "https://www.regulations.gov/docket/NCUA-2018-0024"
385 },
386 {
387 "href": "https://www.regulations.gov/docket/NCUA-2018-0023"
388 },
389 {
390 "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
391 },
392 {
393 "href": "https://www.regulations.gov/docket/NCUA-2018-0016"
394 },
395 {
396 "href": "https://www.regulations.gov/docket/NCUA-2018-0009"
397 },
398 {
399 "href": "https://www.regulations.gov/docket/NCUA-2018-0006"
400 },
401 {
402 "href": "https://www.regulations.gov/docket/NCUA-2018-0005"
403 },
404 {
405 "href": "https://www.regulations.gov/docket/NCUA-2018-0003"
406 },
407 {
408 "href": "https://www.regulations.gov/docket/NCUA-2017-0055"
409 },
410 {
411 "href": "https://www.regulations.gov/docket/NCUA-2017-0054"
412 },
413 {
414 "href": "https://www.regulations.gov/docket/NCUA-2017-0049"
415 },
416 {
417 "href": "https://www.regulations.gov/docket/NCUA-2017-0045"
418 },
419 {
420 "href": "https://www.regulations.gov/docket/NCUA-2017-0043"
421 },
422 {
423 "href": "https://www.regulations.gov/docket/NCUA-2017-0044"
424 },
425 {
426 "href": "https://www.regulations.gov/docket/NCUA-2017-0042"
427 },
428 {
429 "href": "/about/pages/budget-strategic-planning/budget-comments-2018.aspx"
430 },
431 {
432 "href": "https://www.regulations.gov/docket/NCUA-2017-0036"
433 },
434 {
435 "href": "https://www.regulations.gov/docket/NCUA-2017-0038"
436 },
437 {
438 "href": "https://www.regulations.gov/docket/NCUA-2017-0032"
439 },
440 {
441 "href": "https://www.regulations.gov/docket/NCUA-2017-0031"
442 },
443 {
444 "href": "https://www.regulations.gov/docket/NCUA-2017-0030"
445 },
446 {
447 "href": "https://www.regulations.gov/docket/NCUA-2017-0025"
448 },
449 {
450 "href": "https://www.regulations.gov/docket/NCUA-2017-0022"
451 },
452 {
453 "href": "https://www.regulations.gov/docket/NCUA-2017-0024"
454 },
455 {
456 "href": "https://www.regulations.gov/docket/NCUA-2017-0026"
457 },
458 {
459 "href": "https://www.regulations.gov/docket/NCUA-2017-0023"
460 },
461 {
462 "href": "https://www.regulations.gov/docket/NCUA-2017-0017"
463 },
464 {
465 "href": "https://www.regulations.gov/docket/NCUA-2017-0018"
466 },
467 {
468 "href": "https://www.regulations.gov/docket/NCUA-2017-0016"
469 },
470 {
471 "href": "https://www.regulations.gov/docket/NCUA-2017-0007"
472 },
473 {
474 "href": "https://www.regulations.gov/docket/NCUA-2017-0002"
475 },
476 {
477 "href": "https://www.regulations.gov/docket/NCUA-2016-0089"
478 },
479 {
480 "href": "https://www.regulations.gov/docket/NCUA-2016-0088"
481 },
482 {
483 "href": "https://www.regulations.gov/docket/NCUA-2016-0081"
484 },
485 {
486 "href": "https://www.regulations.gov/docket/NCUA-2016-0085"
487 },
488 {
489 "href": "https://www.regulations.gov/docket/NCUA-2016-0039"
490 },
491 {
492 "href": "https://www.regulations.gov/docket/NCUA-2016-0073"
493 },
494 {
495 "href": "https://www.regulations.gov/docket/NCUA-2016-0075"
496 },
497 {
498 "href": "https://www.regulations.gov/docket/NCUA-2016-0076"
499 },
500 {
501 "href": "/About/Pages/budget-strategic-planning/budget-comments-2017.aspx"
502 },
503 {
504 "href": "https://www.regulations.gov/docket/NCUA-2016-0044"
505 },
506 {
507 "href": "https://www.regulations.gov/docket/NCUA-2016-0039"
508 },
509 {
510 "href": "https://www.regulations.gov/docket/NCUA-2016-0040"
511 },
512 {
513 "href": "https://www.regulations.gov/docket/NCUA-2016-0033"
514 },
515 {
516 "href": "https://www.regulations.gov/docket/NCUA-2016-0024"
517 },
518 {
519 "href": "https://www.regulations.gov/docket/NCUA-2016-0016"
520 },
521 {
522 "href": "https://www.regulations.gov/docket/NCUA-2016-0013"
523 },
524 {
525 "href": "https://www.regulations.gov/docket/NCUA-2016-0004"
526 },
527 {
528 "href": "https://www.regulations.gov/docket/NCUA-2016-0006"
529 },
530 {
531 "href": "https://www.regulations.gov/docket/NCUA-2016-0003"
532 },
533 {
534 "href": "https://www.regulations.gov/docket/NCUA-2016-0005"
535 },
536 {
537 "href": "https://www.regulations.gov/docket/NCUA-2015-0060"
538 },
539 {
540 "href": "https://www.regulations.gov/docket/NCUA-2015-0059"
541 },
542 {
543 "href": "https://www.regulations.gov/docket/NCUA-2015-0055"
544 },
545 {
546 "href": "https://www.regulations.gov/docket/NCUA-2015-0049"
547 },
548 {
549 "href": "https://www.regulations.gov/docket/NCUA-2015-0048"
550 },
551 {
552 "href": "https://www.regulations.gov/docket/NCUA-2015-0044"
553 },
554 {
555 "href": "https://www.regulations.gov/docket/NCUA-2015-0042"
556 },
557 {
558 "href": "https://www.regulations.gov/docket/NCUA-2015-0043"
559 },
560 {
561 "href": "https://www.regulations.gov/docket/NCUA-2015-0038"
562 },
563 {
564 "href": "https://www.regulations.gov/docket/NCUA-2015-0037"
565 },
566 {
567 "href": "https://www.regulations.gov/docket/NCUA-2015-0034"
568 },
569 {
570 "href": "https://www.regulations.gov/docket/NCUA-2015-0030"
571 },
572 {
573 "href": "https://www.regulations.gov/docket/NCUA-2015-0031"
574 },
575 {
576 "href": "https://www.regulations.gov/docket/NCUA-2015-0029"
577 },
578 {
579 "href": "https://www.regulations.gov/docket/NCUA-2015-0019"
580 },
581 {
582 "href": "https://www.regulations.gov/docket/NCUA-2015-0018"
583 },
584 {
585 "href": "https://www.regulations.gov/docket/NCUA-2015-0020"
586 },
587 {
588 "href": "https://www.regulations.gov/docket/NCUA-2015-0021"
589 },
590 {
591 "href": "https://www.regulations.gov/docket/NCUA-2015-0015"
592 },
593 {
594 "href": "https://www.regulations.gov/docket/NCUA-2015-0013"
595 },
596 {
597 "href": "https://www.regulations.gov/docket/NCUA-2015-0010"
598 },
599 {
600 "href": "https://www.regulations.gov/docket/NCUA-2015-0011"
601 },
602 {
603 "href": "https://www.regulations.gov/docket/NCUA-2014-0043"
604 },
605 {
606 "href": "https://www.regulations.gov/docket/NCUA-2014-0037"
607 },
608 {
609 "href": "https://www.regulations.gov/docket/NCUA-2014-0036"
610 },
611 {
612 "href": "https://www.regulations.gov/docket/NCUA-2014-0034"
613 },
614 {
615 "href": "https://www.regulations.gov/docket/NCUA-2014-0031"
616 },
617 {
618 "href": "https://www.regulations.gov/docket/NCUA-2014-0027"
619 },
620 {
621 "href": "https://www.regulations.gov/docket/NCUA-2014-0026"
622 },
623 {
624 "href": "https://www.regulations.gov/docket/NCUA-2014-0028"
625 },
626 {
627 "href": "https://www.regulations.gov/docket/NCUA-2014-0025"
628 },
629 {
630 "href": "https://www.regulations.gov/docket/NCUA-2014-0020"
631 },
632 {
633 "href": "https://www.regulations.gov/docket/NCUA-2014-0016"
634 },
635 {
636 "href": "https://www.regulations.gov/docket/NCUA-2014-0017"
637 },
638 {
639 "href": "https://www.regulations.gov/docket/NCUA-2014-0045"
640 },
641 {
642 "href": "https://www.regulations.gov/docket/NCUA-2014-0008"
643 },
644 {
645 "href": "https://www.regulations.gov/docket/NCUA-2014-0004"
646 },
647 {
648 "href": "https://www.regulations.gov/docket/NCUA-2014-0007"
649 },
650 {
651 "href": "/About/Pages/budget-strategic-planning/comments.aspx"
652 },
653 {
654 "href": "https://www.regulations.gov/docket/NCUA-2013-0123"
655 },
656 {
657 "href": "https://www.regulations.gov/docket/NCUA-2013-0124"
658 },
659 {
660 "href": "https://www.regulations.gov/docket/NCUA-2013-0122"
661 },
662 {
663 "href": "https://www.regulations.gov/docket/NCUA-2013-0125"
664 },
665 {
666 "href": "https://www.regulations.gov/docket/FRS-2013-0386"
667 },
668 {
669 "href": "https://www.regulations.gov/docket/NCUA-2013-0120"
670 },
671 {
672 "href": "https://www.regulations.gov/docket/NCUA-2013-0112"
673 },
674 {
675 "href": "https://www.regulations.gov/docket/NCUA-2013-0113"
676 },
677 {
678 "href": "https://www.regulations.gov/docket/NCUA-2013-0116"
679 },
680 {
681 "href": "https://www.regulations.gov/docket/NCUA-2013-0114"
682 },
683 {
684 "href": "https://www.regulations.gov/docket/NCUA-2013-0111"
685 },
686 {
687 "href": "https://www.regulations.gov/docket/NCUA-2013-0095"
688 },
689 {
690 "href": "https://www.regulations.gov/docket/NCUA-2013-0096"
691 },
692 {
693 "href": "https://www.regulations.gov/docket/NCUA-2013-0060"
694 },
695 {
696 "href": "https://www.regulations.gov/docket/NCUA-2013-0062"
697 },
698 {
699 "href": "https://www.regulations.gov/docket/FRS-2013-0227"
700 },
701 {
702 "href": "https://www.regulations.gov/docket/NCUA-2013-0037"
703 },
704 {
705 "href": "https://www.regulations.gov/docket/NCUA-2013-0034"
706 },
707 {
708 "href": "https://www.regulations.gov/docket/NCUA-2013-0030"
709 },
710 {
711 "href": "https://www.regulations.gov/docket/NCUA-2013-0029"
712 },
713 {
714 "href": "https://www.regulations.gov/docket/NCUA-2013-0021"
715 },
716 {
717 "href": "https://www.regulations.gov/docket/NCUA-2013-0014"
718 },
719 {
720 "href": "https://www.regulations.gov/docket/NCUA-2013-0013"
721 },
722 {
723 "href": "https://www.regulations.gov/docket/NCUA-2013-0011"
724 },
725 {
726 "href": "https://www.regulations.gov/docket/NCUA-2013-0003"
727 },
728 {
729 "href": "https://www.regulations.gov/docket/NCUA-2013-0005"
730 },
731 {
732 "href": "https://www.regulations.gov/docket/NCUA-2013-0004"
733 },
734 {
735 "href": "https://www.regulations.gov/docket/NCUA-2013-0006"
736 },
737 {
738 "href": "https://www.regulations.gov/docket/NCUA-2012-0041"
739 },
740 {
741 "href": "https://www.regulations.gov/docket/NCUA-2012-0040"
742 },
743 {
744 "href": "https://www.regulations.gov/docket/NCUA-2012-0038"
745 },
746 {
747 "href": "https://www.regulations.gov/docket/NCUA-2012-0015"
748 },
749 {
750 "href": "https://www.regulations.gov/docket/NCUA-2012-0026"
751 },
752 {
753 "href": "https://www.regulations.gov/docket/NCUA-2012-0016"
754 },
755 {
756 "href": "https://www.regulations.gov/docket/NCUA-2012-0017"
757 },
758 {
759 "href": "https://www.regulations.gov/docket/NCUA-2012-0010"
760 },
761 {
762 "href": "https://www.regulations.gov/docket/NCUA-2012-0005"
763 },
764 {
765 "href": "https://www.regulations.gov/docket/NCUA-2011-0073"
766 },
767 {
768 "href": "https://www.regulations.gov/docket/NCUA-2011-0076"
769 },
770 {
771 "href": "https://www.regulations.gov/docket/NCUA-2011-0067"
772 },
773 {
774 "href": "https://www.regulations.gov/docket/NCUA-2011-0066"
775 },
776 {
777 "href": "https://www.regulations.gov/docket/NCUA-2011-0063"
778 },
779 {
780 "href": "https://www.regulations.gov/docket/NCUA-2011-0055"
781 },
782 {
783 "href": "https://www.regulations.gov/docket/NCUA-2011-0048"
784 },
785 {
786 "href": "https://www.regulations.gov/docket/NCUA-2011-0039"
787 },
788 {
789 "href": "https://www.regulations.gov/docket/NCUA-2011-0041"
790 },
791 {
792 "href": "https://www.regulations.gov/docket/NCUA-2011-0034"
793 },
794 {
795 "href": "https://www.regulations.gov/docket/NCUA-2011-0035"
796 },
797 {
798 "href": "https://www.regulations.gov/docket/NCUA-2011-0032"
799 },
800 {
801 "href": "https://www.regulations.gov/docket/NCUA-2011-0028"
802 },
803 {
804 "href": "https://www.regulations.gov/docket/NCUA-2011-0029"
805 },
806 {
807 "href": "https://www.regulations.gov/docket/NCUA-2011-0016"
808 },
809 {
810 "href": "https://www.regulations.gov/docket/NCUA-2011-0014"
811 },
812 {
813 "href": "https://www.regulations.gov/docket/NCUA-2011-0011"
814 },
815 {
816 "href": "https://www.regulations.gov/docket/NCUA-2011-0003"
817 },
818 {
819 "href": "https://www.regulations.gov/docket/NCUA-2010-0060"
820 },
821 {
822 "href": "https://www.regulations.gov/docket/NCUA-2010-0062"
823 },
824 {
825 "href": "https://www.regulations.gov/docket/NCUA-2010-0061"
826 },
827 {
828 "href": "https://www.regulations.gov/docket/NCUA-2010-0051"
829 },
830 {
831 "href": "https://www.regulations.gov/docket/NCUA-2010-0048"
832 },
833 {
834 "href": "https://www.regulations.gov/docket/NCUA-2010-0047"
835 },
836 {
837 "href": "https://www.regulations.gov/docket/NCUA-2010-0045"
838 },
839 {
840 "href": "https://www.regulations.gov/docket/NCUA-2010-0040"
841 },
842 {
843 "href": "https://www.regulations.gov/docket/NCUA-2010-0039"
844 },
845 {
846 "href": "https://www.regulations.gov/docket/NCUA-2010-0035"
847 },
848 {
849 "href": "https://www.regulations.gov/docket/NCUA-2010-0031"
850 },
851 {
852 "href": "https://www.regulations.gov/docket/NCUA-2010-0022"
853 },
854 {
855 "href": "https://www.regulations.gov/docket/NCUA-2010-0020"
856 },
857 {
858 "href": "https://www.regulations.gov/docket/NCUA-2010-0028"
859 },
860 {
861 "href": "https://www.regulations.gov/docket/NCUA-2010-0028"
862 },
863 {
864 "href": "https://www.regulations.gov/docket/NCUA-2010-0005"
865 },
866 {
867 "href": "https://www.regulations.gov/docket/NCUA-2010-0003"
868 }
869]
src/linksdata.js
1const { chromium } = require("playwright");
2const fs = require('fs').promises;
3const path = require('path');
4
/**
 * Scrape the rulemaking links from the NCUA "rulemakings and proposals for
 * comment" page and save them as `links.json` next to this script.
 *
 * The output is written into this script's own directory (`__dirname`), which
 * is the same location the loader in `get.js` reads from, so the two stay in
 * sync regardless of the current working directory.
 *
 * Each saved record has the shape `{ href }`, where `href` is the raw
 * attribute value (it may be a relative URL or null).
 *
 * Scraping or writing failures are logged and not rethrown; the browser is
 * always closed.
 *
 * @returns {Promise<void>}
 */
async function scrapeLinks() {
    const browser = await chromium.launch();

    try {
        // Created inside the try block so the browser is closed even if
        // opening the page fails.
        const page = await browser.newPage();
        await page.goto("https://ncua.gov/regulation-supervision/rulemakings-proposals-comment");

        const links = await page.$$eval(".dynamic-table-main tbody tr td a", (elements) => {
            return elements.map((element) => ({
                href: element.getAttribute("href"),
            }));
        });

        const filePath = path.join(__dirname, 'links.json');

        await fs.writeFile(filePath, JSON.stringify(links, null, 2));
        console.log(`Links saved to ${filePath}`);
    } catch (error) {
        console.error("Error:", error);
    } finally {
        await browser.close();
    }
}
34
35scrapeLinks();
src/main.js
1const { Actor } = require("apify");
2const { PlaywrightCrawler, Dataset } = require("crawlee");
3const { getLinks } = require("./get");
4const fs = require('fs').promises;
5const path = require('path');
6
// Placeholder text stored for pages that yield no paragraphs; it is the
// wording of a "document or docket not found" error page.
const NOT_FOUND_MESSAGE = "We're sorry, an error has occurred\n\nA general error occurred while processing your request.\n\nWhat could have caused this?\n\nThe link may be outdated or incorrect.\n\nThe document or docket you are looking for was not found.\n\nWhat can you do?\n\nPlease try typing in the URL again or return to the home page.\n\nIf you believe you should not be getting this error page or the problem persists, please contact the Help Desk";

/**
 * Actor entry point.
 *
 * Loads the link records via getLinks(), keeps only absolute http(s) links,
 * visits each one with a PlaywrightCrawler and pushes one dataset item per
 * page:
 *   - `{ url, paragraphs }` on success, where `paragraphs` is an array of the
 *     trimmed text of every <p> element, or a single-element array holding
 *     NOT_FOUND_MESSAGE when no paragraphs were collected;
 *   - `{ url, errorMessage }` when processing the page throws.
 *
 * Throws if no valid link is available.
 */
Actor.main(async () => {
    const linkRecords = await getLinks();

    // Relative or empty hrefs (e.g. "/about/pages/...") cannot be crawled.
    const validLinks = linkRecords
        .map((record) => record.href)
        .filter((link) => typeof link === 'string' && link.trim() !== '' && link.startsWith('http'));

    console.log(validLinks.length);
    if (validLinks.length === 0) {
        throw new Error("No valid links found");
    }

    const crawler = new PlaywrightCrawler({
        async requestHandler({ request, page, log }) {
            // Declared outside the try block so the catch handler can use it.
            const { url } = request;

            try {
                log.info(`Processing ${url}`);

                // No page.goto() here: the crawler has already navigated
                // `page` to the request URL before calling this handler.
                const selectorExists = await page.waitForSelector('//*[@class="px-2"]')
                    .then(() => true)
                    .catch(() => false);

                let paragraphs = [];
                if (selectorExists) {
                    paragraphs = await page.$$eval('p', (elements) =>
                        elements.map((element) => element.innerText.trim())
                    );
                }

                // Apply the fallback BEFORE building the dataset item so that
                // it actually ends up in the stored record. It is wrapped in
                // an array to keep the `paragraphs` field type stable.
                if (paragraphs.length === 0) {
                    paragraphs = [NOT_FOUND_MESSAGE];
                }

                await Dataset.pushData({ url, paragraphs });
                log.info(`Data pushed to dataset for URL: ${url}`);
            } catch (error) {
                log.error(`An error occurred while processing URL ${url}: ${error}`);

                // Record the failure in the dataset instead of rethrowing, so
                // every input URL produces exactly one item.
                await Dataset.pushData({ url, errorMessage: error.toString() });

                log.error(`Error data pushed to dataset for URL: ${url}`);
            }
        },
    });

    await crawler.addRequests(validLinks);

    // Run the crawler
    await crawler.run();
});
src/routes.js
1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5const fs = require("fs");
6
// Default route: logs the page title, then enqueues the section links found
// under `.field-type-text_with_summary h2`, tagging each group with the label
// of the handler that should process it.
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    // The former `x = title;` assignment was removed: it created an implicit
    // global that nothing in this file reads.

    await enqueueLinks({
        selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
        label: "detail5",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
        label: "detail6",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
        label: "detail7",
    });
});
74
// "detail4" route: logs the page title and records the request URL.
// Page-content extraction is disabled for this route, so the pushed record
// carries no scraped fields.
router.addHandler("detail4", async ({ request, page, log }) => {
    // Marker text looked for in error messages to detect a size-limit failure.
    const TOO_LARGE = "Data item is too large";

    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // `errorMessages` is an array, so Array.prototype.includes would need
        // an element equal to the marker; search inside each message instead.
        const failedOnSize = request.errorMessages.some((message) =>
            String(message).includes(TOO_LARGE)
        );

        if (failedOnSize) {
            // This branch used to read `result`, `PDFs` and `Links`, none of
            // which exist here, and therefore always threw a ReferenceError.
            // With extraction disabled both lists are empty.
            await Dataset.pushData({
                url: request.url,
                PDFs: [],
                Links: [],
            });
        } else {
            // NOTE(review): key is `url2` here but `url` in the branch above
            // and in the detail6/detail7 routes - confirm which is intended.
            await Dataset.pushData({
                url2: request.url,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
        // Let size-limit failures propagate so the request can be retried and
        // take the reduced branch above; every other error stays best-effort.
        // NOTE(review): relies on Crawlee recording the thrown message in
        // request.errorMessages before retrying - confirm for the version in use.
        if (String(error.message).includes(TOO_LARGE)) {
            throw error;
        }
    }
});
143
144
// "detail5" route: extracts the page title, the main text and the links of
// the `.field-type-text_with_summary` section, then pushes them to the
// dataset. PDF links are listed separately from other links; PDF contents
// are not downloaded on this route.
router.addHandler("detail5", async ({ request, page, log }) => {
    // Marker text looked for in error messages to detect a size-limit failure.
    const TOO_LARGE = "Data item is too large";
    const MANUAL_NOTE = "Please retrieve manually due to size limitations";

    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Runs in the browser context; only serialisable data is returned.
        const result = await page.evaluate(() => {
            const extracted = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };

                // Skip anchors whose text converts to a number, mailto links
                // and anchors without a target.
                const isNumericText = !Number.isNaN(Number(obj.linkText));
                if (isNumericText || obj.link.includes("mailto") || obj.link === "") {
                    continue;
                }

                if (obj.link.endsWith(".pdf")) {
                    extracted.PDFs.push(obj);
                } else {
                    extracted.Links.push(obj);
                }
            }

            return extracted;
        });

        // `errorMessages` is an array, so Array.prototype.includes would need
        // an element equal to the marker; search inside each message instead.
        const failedOnSize = request.errorMessages.some((message) =>
            String(message).includes(TOO_LARGE)
        );

        if (failedOnSize) {
            // Previously read the undefined names `PDFs` and `Links`, which
            // threw a ReferenceError; the lists live on `result`.
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: result.PDFs.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
            });
        } else {
            // NOTE(review): key is `url2` here but `url` in the branch above
            // and in the detail6/detail7 routes - confirm which is intended.
            await Dataset.pushData({
                url2: request.url,
                ...result,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
        // Let size-limit failures propagate so the request can be retried and
        // take the reduced branch above; every other error stays best-effort.
        // NOTE(review): relies on Crawlee recording the thrown message in
        // request.errorMessages before retrying - confirm for the version in use.
        if (String(error.message).includes(TOO_LARGE)) {
            throw error;
        }
    }
});
214
// "detail6" route: extracts the page title, the main text and the links of
// the `.field-type-text_with_summary` section, downloads every linked PDF and
// stores its parsed text, then pushes the record to the dataset.
router.addHandler("detail6", async ({ request, page, log }) => {
    // Marker text looked for in error messages to detect a size-limit failure.
    const TOO_LARGE = "Data item is too large";
    const MANUAL_NOTE = "Please retrieve manually due to size limitations";

    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Runs in the browser context; only serialisable data is returned.
        const result = await page.evaluate(() => {
            const extracted = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };

                // Skip anchors whose text converts to a number, mailto links
                // and anchors without a target.
                const isNumericText = !Number.isNaN(Number(obj.linkText));
                if (isNumericText || obj.link.includes("mailto") || obj.link === "") {
                    continue;
                }

                if (obj.link.endsWith(".pdf")) {
                    extracted.PDFs.push(obj);
                } else {
                    extracted.Links.push(obj);
                }
            }

            return extracted;
        });

        // Download and parse every PDF concurrently. A failure is recorded on
        // the individual entry as `error`, so one bad PDF never rejects the
        // whole batch.
        const PDFs = await Promise.all(
            result.PDFs.map(async (pdf) => {
                try {
                    const pdfResponse = await page.request.fetch(pdf.link);

                    // Pass the Buffer itself: its `.buffer` property is the
                    // backing ArrayBuffer, which is not guaranteed to match
                    // the Buffer's own bytes exactly.
                    const pdfText = await pdfParse(await pdfResponse.body());

                    return {
                        ...pdf,
                        text: pdfText.text,
                    };
                } catch (e) {
                    return {
                        ...pdf,
                        error: e.message || e.code || true,
                    };
                }
            })
        );

        // `errorMessages` is an array, so Array.prototype.includes would need
        // an element equal to the marker; search inside each message instead.
        const failedOnSize = request.errorMessages.some((message) =>
            String(message).includes(TOO_LARGE)
        );

        // If the request has large data errors, mark the data for manual processing
        if (failedOnSize) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
                // Previously read the undefined name `Links`, which threw a
                // ReferenceError; the list lives on `result`.
                Links: result.Links.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
        // Let size-limit failures propagate so the request can be retried and
        // take the reduced branch above; every other error stays best-effort.
        // NOTE(review): relies on Crawlee recording the thrown message in
        // request.errorMessages before retrying - confirm for the version in use.
        if (String(error.message).includes(TOO_LARGE)) {
            throw error;
        }
    }
});
435
// "detail7" route: extracts the page title, the main text and the links of
// the `.field-type-text_with_summary` section, downloads every linked PDF and
// stores its parsed text, then pushes the record to the dataset.
router.addHandler("detail7", async ({ request, page, log }) => {
    // Marker text looked for in error messages to detect a size-limit failure.
    const TOO_LARGE = "Data item is too large";
    const MANUAL_NOTE = "Please retrieve manually due to size limitations";

    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Runs in the browser context; only serialisable data is returned.
        const result = await page.evaluate(() => {
            const extracted = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };

                // Skip anchors whose text converts to a number, mailto links
                // and anchors without a target.
                const isNumericText = !Number.isNaN(Number(obj.linkText));
                if (isNumericText || obj.link.includes("mailto") || obj.link === "") {
                    continue;
                }

                if (obj.link.endsWith(".pdf")) {
                    extracted.PDFs.push(obj);
                } else {
                    extracted.Links.push(obj);
                }
            }

            return extracted;
        });

        // Download and parse every PDF concurrently. A failure is recorded on
        // the individual entry as `error`, so one bad PDF never rejects the
        // whole batch.
        const PDFs = await Promise.all(
            result.PDFs.map(async (pdf) => {
                try {
                    const pdfResponse = await page.request.fetch(pdf.link);

                    // Pass the Buffer itself: its `.buffer` property is the
                    // backing ArrayBuffer, which is not guaranteed to match
                    // the Buffer's own bytes exactly.
                    const pdfText = await pdfParse(await pdfResponse.body());

                    return {
                        ...pdf,
                        text: pdfText.text,
                    };
                } catch (e) {
                    return {
                        ...pdf,
                        error: e.message || e.code || true,
                    };
                }
            })
        );

        // `errorMessages` is an array, so Array.prototype.includes would need
        // an element equal to the marker; search inside each message instead.
        const failedOnSize = request.errorMessages.some((message) =>
            String(message).includes(TOO_LARGE)
        );

        if (failedOnSize) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
                // Previously read the undefined name `Links`, which threw a
                // ReferenceError; the list lives on `result`.
                Links: result.Links.map((item) => ({
                    ...item,
                    text: MANUAL_NOTE,
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
        // Let size-limit failures propagate so the request can be retried and
        // take the reduced branch above; every other error stays best-effort.
        // NOTE(review): relies on Crawlee recording the thrown message in
        // request.errorMessages before retrying - confirm for the version in use.
        if (String(error.message).includes(TOO_LARGE)) {
            throw error;
        }
    }
});
537
538module.exports = { router };
Developer
Maintained by Community
Categories