Rulemaking Proposals avatar
Rulemaking Proposals
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Rulemaking Proposals

Rulemaking Proposals

nondescript_cord/rulemaking-proposals

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

crawlee.json

1{
2	"purgeOnStart": false
3}

Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY --chown=myuser . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

package.json

1{
2	"name": "consumerfinance",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumerfinance rules extractor",
6	"dependencies": {
7		"22": "^0.0.0",
8		"apify": "^3.1.10",
9		"crawlee": "^3.5.4",
10		"fs": "^0.0.1-security",
11		"node-fetch": "^3.3.2",
12		"path": "^0.12.7",
13		"pdf-parse": "^1.1.1",
14		"playwright": "^1.43.1"
15	},
16	"devDependencies": {
17		"@apify/eslint-config": "^0.4.0",
18		"@types/pdf-parse": "^1.1.4",
19		"eslint": "^8.50.0",
20		"@playwright/test": "^1.43.1"
21	},
22	"scripts": {
23		"start": "node src/linksdata.js && node src/main.js",
24		"lint": "eslint ./src --ext .js,.jsx",
25		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
26		"test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
27		"postinstall": "npx crawlee install-playwright-browsers"
28	},
29	"author": "Moazzam Malek"
30}

playwright.config.js

1// @ts-check
2const { defineConfig, devices } = require('@playwright/test');
3
4/**
5 * Read environment variables from file.
6 * https://github.com/motdotla/dotenv
7 */
8// require('dotenv').config();
9
10/**
11 * @see https://playwright.dev/docs/test-configuration
12 */
13module.exports = defineConfig({
14  testDir: './tests',
15  /* Run tests in files in parallel */
16  fullyParallel: true,
17  /* Fail the build on CI if you accidentally left test.only in the source code. */
18  forbidOnly: !!process.env.CI,
19  /* Retry on CI only */
20  retries: process.env.CI ? 2 : 0,
21  /* Opt out of parallel tests on CI. */
22  workers: process.env.CI ? 1 : undefined,
23  /* Reporter to use. See https://playwright.dev/docs/test-reporters */
24  reporter: 'html',
25  /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */
26  use: {
27    /* Base URL to use in actions like `await page.goto('/')`. */
28    // baseURL: 'http://127.0.0.1:3000',
29
30    /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */
31    trace: 'on-first-retry',
32  },
33
34  /* Configure projects for major browsers */
35  projects: [
36    {
37      name: 'chromium',
38      use: { ...devices['Desktop Chrome'] },
39    },
40
41    {
42      name: 'firefox',
43      use: { ...devices['Desktop Firefox'] },
44    },
45
46    {
47      name: 'webkit',
48      use: { ...devices['Desktop Safari'] },
49    },
50
51    /* Test against mobile viewports. */
52    // {
53    //   name: 'Mobile Chrome',
54    //   use: { ...devices['Pixel 5'] },
55    // },
56    // {
57    //   name: 'Mobile Safari',
58    //   use: { ...devices['iPhone 12'] },
59    // },
60
61    /* Test against branded browsers. */
62    // {
63    //   name: 'Microsoft Edge',
64    //   use: { ...devices['Desktop Edge'], channel: 'msedge' },
65    // },
66    // {
67    //   name: 'Google Chrome',
68    //   use: { ...devices['Desktop Chrome'], channel: 'chrome' },
69    // },
70  ],
71
72  /* Run your local dev server before starting the tests */
73  // webServer: {
74  //   command: 'npm run start',
75  //   url: 'http://127.0.0.1:3000',
76  //   reuseExistingServer: !process.env.CI,
77  // },
78
79  globalTimeout: 60*1000,
80});

start.bat

Download

start.sh

Download

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "Rulemaking-proposals",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/get.js

1const fs = require('fs').promises;
2const path = require('path');
3
/**
 * Loads the link records stored in `links.json`, located next to this module.
 *
 * This is a best-effort read: if the file cannot be read, does not contain
 * valid JSON, or its top-level value is not an array, the problem is logged
 * and an empty array is returned instead of throwing.
 *
 * @returns {Promise<Array>} The parsed array from `links.json` (entries are
 *   `{ href }` objects when the file was produced by src/linksdata.js), or
 *   `[]` on any failure.
 */
async function getLinks() {
  const filePath = path.join(__dirname, 'links.json');
  try {
    const data = await fs.readFile(filePath, 'utf-8');
    // Size of the raw file contents in characters (diagnostic output).
    console.log(data.length)
    const parsed = JSON.parse(data);
    // Callers iterate over the result, so anything other than an array
    // (e.g. `{}` or `null`) is treated like an unreadable file.
    if (!Array.isArray(parsed)) {
      throw new TypeError('links.json must contain a JSON array');
    }
    return parsed;
  } catch (error) {
    console.error('Error reading links.json:', error);
    return [];
  }
}
15
16module.exports = { getLinks };

src/links.json

1[
2  {
3    "href": "https://www.regulations.gov/document/NCUA-2024-0026-0001"
4  },
5  {
6    "href": "https://www.regulations.gov/document/NCUA-2023-0070-0007"
7  },
8  {
9    "href": "https://www.regulations.gov/document/NCUA-2024-0008-0001"
10  },
11  {
12    "href": "https://www.regulations.gov/document/NCUA-2023-0142-0001"
13  },
14  {
15    "href": "https://www.regulations.gov/document/NCUA-2023-0137-0003"
16  },
17  {
18    "href": "https://www.regulations.gov/document/NCUA-2023-0117-0001"
19  },
20  {
21    "href": "https://www.regulations.gov/document/NCUA-2023-0023-0001"
22  },
23  {
24    "href": "https://www.regulations.gov/document/NCUA-2023-0082-0001"
25  },
26  {
27    "href": "https://www.regulations.gov/document/NCUA-2023-0118-0003"
28  },
29  {
30    "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0347"
31  },
32  {
33    "href": "https://www.regulations.gov/document/NCUA-2023-0072-0001"
34  },
35  {
36    "href": "https://www.regulations.gov/document/NCUA-2023-0070-0001"
37  },
38  {
39    "href": "https://www.regulations.gov/document/NCUA-2023-0061-0001"
40  },
41  {
42    "href": "https://www.regulations.gov/document/NCUA-2023-0019-0001"
43  },
44  {
45    "href": "https://www.regulations.gov/document/NCUA-2023-0043-0009"
46  },
47  {
48    "href": "https://www.regulations.gov/document/NCUA-2023-0045-0001"
49  },
50  {
51    "href": "https://www.regulations.gov/document/NCUA-2022-0138-0023"
52  },
53  {
54    "href": "https://www.regulations.gov/docket/NCUA-2022-0179/"
55  },
56  {
57    "href": "https://www.regulations.gov/docket/NCUA-2022-0099"
58  },
59  {
60    "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0310"
61  },
62  {
63    "href": "https://www.regulations.gov/docket/NCUA-2022-0185"
64  },
65  {
66    "href": "https://www.regulations.gov/docket/NCUA-2022-0145"
67  },
68  {
69    "href": "https://www.regulations.gov/docket/NCUA-2022-0138"
70  },
71  {
72    "href": "https://www.regulations.gov/docket/NCUA-2022-0132"
73  },
74  {
75    "href": "https://www.regulations.gov/docket/NCUA-2022-0123"
76  },
77  {
78    "href": "https://www.regulations.gov/docket/NCUA-2022-0099"
79  },
80  {
81    "href": "https://www.regulations.gov/document/NCUA-2022-0008-0007"
82  },
83  {
84    "href": "https://www.regulations.gov/docket/NCUA-2022-0008"
85  },
86  {
87    "href": "https://www.regulations.gov/docket/NCUA-2022-0005"
88  },
89  {
90    "href": "https://www.regulations.gov/docket/NCUA-2022-0016"
91  },
92  {
93    "href": "https://www.regulations.gov/docket/NCUA-2022-0041"
94  },
95  {
96    "href": "https://www.regulations.gov/docket/NCUA-2022-0038"
97  },
98  {
99    "href": "https://www.regulations.gov/docket/NCUA-2021-0072"
100  },
101  {
102    "href": "https://www.regulations.gov/docket/NCUA-2020-0125"
103  },
104  {
105    "href": "https://www.regulations.gov/docket/NCUA-2022-0040"
106  },
107  {
108    "href": "https://www.regulations.gov/docket/NCUA-2021-0100"
109  },
110  {
111    "href": "https://www.regulations.gov/docket/NCUA-2021-0149"
112  },
113  {
114    "href": "https://www.regulations.gov/docket/NCUA-2021-0079"
115  },
116  {
117    "href": "https://www.regulations.gov/docket/NCUA-2021-0036"
118  },
119  {
120    "href": "https://www.regulations.gov/docket/NCUA-2021-0127"
121  },
122  {
123    "href": "https://www.regulations.gov/docket/NCUA-2021-0102"
124  },
125  {
126    "href": "https://www.regulations.gov/docket/NCUA-2021-0072"
127  },
128  {
129    "href": "https://www.regulations.gov/docket/NCUA-2020-0074"
130  },
131  {
132    "href": "https://www.regulations.gov/docket/NCUA-2020-0114"
133  },
134  {
135    "href": "https://www.regulations.gov/docket/NCUA-2020-0107"
136  },
137  {
138    "href": "https://www.regulations.gov/docket/NCUA-2021-0038"
139  },
140  {
141    "href": "https://www.regulations.gov/docket/NCUA-2021-0056"
142  },
143  {
144    "href": "https://www.regulations.gov/docket/NCUA-2021-0036"
145  },
146  {
147    "href": "https://www.regulations.gov/docket/NCUA-2021-0037"
148  },
149  {
150    "href": "https://www.regulations.gov/docket/NCUA-2021-0045"
151  },
152  {
153    "href": "https://www.regulations.gov/docket/NCUA-2021-0042"
154  },
155  {
156    "href": "https://www.regulations.gov/docket/NCUA-2020-0056"
157  },
158  {
159    "href": "https://www.regulations.gov/docket/NCUA-2020-0098"
160  },
161  {
162    "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
163  },
164  {
165    "href": "https://www.regulations.gov/docket/NCUA-2021-0028"
166  },
167  {
168    "href": "https://www.regulations.gov/docket/NCUA-2021-0010"
169  },
170  {
171    "href": "https://www.regulations.gov/docket/NCUA-2021-0030"
172  },
173  {
174    "href": "https://www.regulations.gov/docket/NCUA-2021-0006"
175  },
176  {
177    "href": "https://www.regulations.gov/docket/NCUA-2022-0042"
178  },
179  {
180    "href": "https://www.regulations.gov/docket/NCUA-2020-0080"
181  },
182  {
183    "href": "https://www.regulations.gov/docket/NCUA-2020-0081"
184  },
185  {
186    "href": "https://www.regulations.gov/docket/NCUA-2020-0016"
187  },
188  {
189    "href": "https://www.regulations.gov/docket/NCUA-2022-0043"
190  },
191  {
192    "href": "https://www.regulations.gov/docket/NCUA-2020-0125"
193  },
194  {
195    "href": "https://www.regulations.gov/docket/NCUA-2021-0019"
196  },
197  {
198    "href": "https://www.regulations.gov/docket/NCUA-2021-0021"
199  },
200  {
201    "href": "https://www.regulations.gov/docket/NCUA-2021-0001"
202  },
203  {
204    "href": "https://www.regulations.gov/docket/NCUA-2020-0114"
205  },
206  {
207    "href": "https://www.regulations.gov/docket/NCUA-2020-0116"
208  },
209  {
210    "href": "https://www.regulations.gov/docket/NCUA-2020-0098"
211  },
212  {
213    "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
214  },
215  {
216    "href": "https://www.regulations.gov/docket/NCUA-2020-0107"
217  },
218  {
219    "href": "https://www.regulations.gov/docket/NCUA-2022-0036"
220  },
221  {
222    "href": "https://www.regulations.gov/docket/NCUA-2022-0026"
223  },
224  {
225    "href": "https://www.regulations.gov/docket/NCUA-2020-0080"
226  },
227  {
228    "href": "https://www.regulations.gov/docket/NCUA-2020-0081"
229  },
230  {
231    "href": "https://www.regulations.gov/docket/NCUA-2020-0074"
232  },
233  {
234    "href": "https://www.regulations.gov/docket/NCUA-2022-0037"
235  },
236  {
237    "href": "https://www.regulations.gov/docket/NCUA-2020-0064"
238  },
239  {
240    "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0143"
241  },
242  {
243    "href": "https://www.regulations.gov/docket/NCUA-2020-0054"
244  },
245  {
246    "href": "https://www.regulations.gov/docket/NCUA-2020-0056"
247  },
248  {
249    "href": "https://www.regulations.gov/docket/NCUA-2020-0043"
250  },
251  {
252    "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
253  },
254  {
255    "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
256  },
257  {
258    "href": "https://www.regulations.gov/docket/NCUA-2022-0035"
259  },
260  {
261    "href": "https://www.regulations.gov/docket/NCUA-2020-0044"
262  },
263  {
264    "href": "https://www.regulations.gov/docket/NCUA-2020-0033"
265  },
266  {
267    "href": "https://www.regulations.gov/docket/NCUA-2020-0016"
268  },
269  {
270    "href": "https://www.regulations.gov/docket/NCUA-2020-0015"
271  },
272  {
273    "href": "https://www.regulations.gov/docket/NCUA-2022-0034"
274  },
275  {
276    "href": "https://www.regulations.gov/docket/NCUA-2022-0025"
277  },
278  {
279    "href": "https://www.regulations.gov/docket/NCUA-2022-0033"
280  },
281  {
282    "href": "https://www.regulations.gov/docket/NCUA-2019-0112"
283  },
284  {
285    "href": "https://www.regulations.gov/document/NCUA_FRDOC_0001-0076"
286  },
287  {
288    "href": "https://www.regulations.gov/docket/NCUA-2022-0028"
289  },
290  {
291    "href": "https://www.regulations.gov/docket/NCUA-2022-0026"
292  },
293  {
294    "href": "https://www.regulations.gov/docket/NCUA-2019-0121"
295  },
296  {
297    "href": "https://www.regulations.gov/docket/NCUA-2022-0027"
298  },
299  {
300    "href": "https://www.regulations.gov/docket/NCUA-2018-0030"
301  },
302  {
303    "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
304  },
305  {
306    "href": "https://www.regulations.gov/docket/NCUA-2019-0120"
307  },
308  {
309    "href": "https://www.regulations.gov/docket/NCUA-2022-0044"
310  },
311  {
312    "href": "https://www.regulations.gov/docket/NCUA-2022-0030"
313  },
314  {
315    "href": "https://www.regulations.gov/docket/NCUA-2018-0051"
316  },
317  {
318    "href": "https://www.regulations.gov/docket/NCUA-2022-0033"
319  },
320  {
321    "href": "https://www.regulations.gov/docket/NCUA-2022-0025"
322  },
323  {
324    "href": "https://www.regulations.gov/docket/NCUA-2022-0028"
325  },
326  {
327    "href": "https://www.regulations.gov/docket/NCUA-2022-0029"
328  },
329  {
330    "href": "https://www.regulations.gov/docket/NCUA-2018-0041"
331  },
332  {
333    "href": "https://www.regulations.gov/docket/NCUA-2022-0027"
334  },
335  {
336    "href": "https://www.regulations.gov/docket/NCUA-2016-0075"
337  },
338  {
339    "href": "https://www.regulations.gov/docket/NCUA-2022-0032"
340  },
341  {
342    "href": "https://www.regulations.gov/docket/NCUA-2022-0031"
343  },
344  {
345    "href": "https://www.regulations.gov/docket/NCUA-2022-0030"
346  },
347  {
348    "href": "https://www.regulations.gov/docket/NCUA-2018-0039"
349  },
350  {
351    "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
352  },
353  {
354    "href": "https://www.regulations.gov/docket/NCUA-2018-0050"
355  },
356  {
357    "href": "https://www.regulations.gov/docket/NCUA-2018-0051"
358  },
359  {
360    "href": "https://www.regulations.gov/docket/NCUA-2018-0040"
361  },
362  {
363    "href": "https://www.regulations.gov/docket/NCUA-2018-0041"
364  },
365  {
366    "href": "https://www.regulations.gov/docket/NCUA-2018-0039"
367  },
368  {
369    "href": "https://www.regulations.gov/docket/NCUA-2018-0033"
370  },
371  {
372    "href": "https://www.regulations.gov/docket/NCUA-2018-0034"
373  },
374  {
375    "href": "https://www.regulations.gov/docket/NCUA-2018-0031"
376  },
377  {
378    "href": "https://www.regulations.gov/docket/NCUA-2018-0027"
379  },
380  {
381    "href": "https://www.regulations.gov/docket/NCUA-2018-0030"
382  },
383  {
384    "href": "https://www.regulations.gov/docket/NCUA-2018-0024"
385  },
386  {
387    "href": "https://www.regulations.gov/docket/NCUA-2018-0023"
388  },
389  {
390    "href": "https://www.regulations.gov/docket/NCUA-2018-0017"
391  },
392  {
393    "href": "https://www.regulations.gov/docket/NCUA-2018-0016"
394  },
395  {
396    "href": "https://www.regulations.gov/docket/NCUA-2018-0009"
397  },
398  {
399    "href": "https://www.regulations.gov/docket/NCUA-2018-0006"
400  },
401  {
402    "href": "https://www.regulations.gov/docket/NCUA-2018-0005"
403  },
404  {
405    "href": "https://www.regulations.gov/docket/NCUA-2018-0003"
406  },
407  {
408    "href": "https://www.regulations.gov/docket/NCUA-2017-0055"
409  },
410  {
411    "href": "https://www.regulations.gov/docket/NCUA-2017-0054"
412  },
413  {
414    "href": "https://www.regulations.gov/docket/NCUA-2017-0049"
415  },
416  {
417    "href": "https://www.regulations.gov/docket/NCUA-2017-0045"
418  },
419  {
420    "href": "https://www.regulations.gov/docket/NCUA-2017-0043"
421  },
422  {
423    "href": "https://www.regulations.gov/docket/NCUA-2017-0044"
424  },
425  {
426    "href": "https://www.regulations.gov/docket/NCUA-2017-0042"
427  },
428  {
429    "href": "/about/pages/budget-strategic-planning/budget-comments-2018.aspx"
430  },
431  {
432    "href": "https://www.regulations.gov/docket/NCUA-2017-0036"
433  },
434  {
435    "href": "https://www.regulations.gov/docket/NCUA-2017-0038"
436  },
437  {
438    "href": "https://www.regulations.gov/docket/NCUA-2017-0032"
439  },
440  {
441    "href": "https://www.regulations.gov/docket/NCUA-2017-0031"
442  },
443  {
444    "href": "https://www.regulations.gov/docket/NCUA-2017-0030"
445  },
446  {
447    "href": "https://www.regulations.gov/docket/NCUA-2017-0025"
448  },
449  {
450    "href": "https://www.regulations.gov/docket/NCUA-2017-0022"
451  },
452  {
453    "href": "https://www.regulations.gov/docket/NCUA-2017-0024"
454  },
455  {
456    "href": "https://www.regulations.gov/docket/NCUA-2017-0026"
457  },
458  {
459    "href": "https://www.regulations.gov/docket/NCUA-2017-0023"
460  },
461  {
462    "href": "https://www.regulations.gov/docket/NCUA-2017-0017"
463  },
464  {
465    "href": "https://www.regulations.gov/docket/NCUA-2017-0018"
466  },
467  {
468    "href": "https://www.regulations.gov/docket/NCUA-2017-0016"
469  },
470  {
471    "href": "https://www.regulations.gov/docket/NCUA-2017-0007"
472  },
473  {
474    "href": "https://www.regulations.gov/docket/NCUA-2017-0002"
475  },
476  {
477    "href": "https://www.regulations.gov/docket/NCUA-2016-0089"
478  },
479  {
480    "href": "https://www.regulations.gov/docket/NCUA-2016-0088"
481  },
482  {
483    "href": "https://www.regulations.gov/docket/NCUA-2016-0081"
484  },
485  {
486    "href": "https://www.regulations.gov/docket/NCUA-2016-0085"
487  },
488  {
489    "href": "https://www.regulations.gov/docket/NCUA-2016-0039"
490  },
491  {
492    "href": "https://www.regulations.gov/docket/NCUA-2016-0073"
493  },
494  {
495    "href": "https://www.regulations.gov/docket/NCUA-2016-0075"
496  },
497  {
498    "href": "https://www.regulations.gov/docket/NCUA-2016-0076"
499  },
500  {
501    "href": "/About/Pages/budget-strategic-planning/budget-comments-2017.aspx"
502  },
503  {
504    "href": "https://www.regulations.gov/docket/NCUA-2016-0044"
505  },
506  {
507    "href": "https://www.regulations.gov/docket/NCUA-2016-0039"
508  },
509  {
510    "href": "https://www.regulations.gov/docket/NCUA-2016-0040"
511  },
512  {
513    "href": "https://www.regulations.gov/docket/NCUA-2016-0033"
514  },
515  {
516    "href": "https://www.regulations.gov/docket/NCUA-2016-0024"
517  },
518  {
519    "href": "https://www.regulations.gov/docket/NCUA-2016-0016"
520  },
521  {
522    "href": "https://www.regulations.gov/docket/NCUA-2016-0013"
523  },
524  {
525    "href": "https://www.regulations.gov/docket/NCUA-2016-0004"
526  },
527  {
528    "href": "https://www.regulations.gov/docket/NCUA-2016-0006"
529  },
530  {
531    "href": "https://www.regulations.gov/docket/NCUA-2016-0003"
532  },
533  {
534    "href": "https://www.regulations.gov/docket/NCUA-2016-0005"
535  },
536  {
537    "href": "https://www.regulations.gov/docket/NCUA-2015-0060"
538  },
539  {
540    "href": "https://www.regulations.gov/docket/NCUA-2015-0059"
541  },
542  {
543    "href": "https://www.regulations.gov/docket/NCUA-2015-0055"
544  },
545  {
546    "href": "https://www.regulations.gov/docket/NCUA-2015-0049"
547  },
548  {
549    "href": "https://www.regulations.gov/docket/NCUA-2015-0048"
550  },
551  {
552    "href": "https://www.regulations.gov/docket/NCUA-2015-0044"
553  },
554  {
555    "href": "https://www.regulations.gov/docket/NCUA-2015-0042"
556  },
557  {
558    "href": "https://www.regulations.gov/docket/NCUA-2015-0043"
559  },
560  {
561    "href": "https://www.regulations.gov/docket/NCUA-2015-0038"
562  },
563  {
564    "href": "https://www.regulations.gov/docket/NCUA-2015-0037"
565  },
566  {
567    "href": "https://www.regulations.gov/docket/NCUA-2015-0034"
568  },
569  {
570    "href": "https://www.regulations.gov/docket/NCUA-2015-0030"
571  },
572  {
573    "href": "https://www.regulations.gov/docket/NCUA-2015-0031"
574  },
575  {
576    "href": "https://www.regulations.gov/docket/NCUA-2015-0029"
577  },
578  {
579    "href": "https://www.regulations.gov/docket/NCUA-2015-0019"
580  },
581  {
582    "href": "https://www.regulations.gov/docket/NCUA-2015-0018"
583  },
584  {
585    "href": "https://www.regulations.gov/docket/NCUA-2015-0020"
586  },
587  {
588    "href": "https://www.regulations.gov/docket/NCUA-2015-0021"
589  },
590  {
591    "href": "https://www.regulations.gov/docket/NCUA-2015-0015"
592  },
593  {
594    "href": "https://www.regulations.gov/docket/NCUA-2015-0013"
595  },
596  {
597    "href": "https://www.regulations.gov/docket/NCUA-2015-0010"
598  },
599  {
600    "href": "https://www.regulations.gov/docket/NCUA-2015-0011"
601  },
602  {
603    "href": "https://www.regulations.gov/docket/NCUA-2014-0043"
604  },
605  {
606    "href": "https://www.regulations.gov/docket/NCUA-2014-0037"
607  },
608  {
609    "href": "https://www.regulations.gov/docket/NCUA-2014-0036"
610  },
611  {
612    "href": "https://www.regulations.gov/docket/NCUA-2014-0034"
613  },
614  {
615    "href": "https://www.regulations.gov/docket/NCUA-2014-0031"
616  },
617  {
618    "href": "https://www.regulations.gov/docket/NCUA-2014-0027"
619  },
620  {
621    "href": "https://www.regulations.gov/docket/NCUA-2014-0026"
622  },
623  {
624    "href": "https://www.regulations.gov/docket/NCUA-2014-0028"
625  },
626  {
627    "href": "https://www.regulations.gov/docket/NCUA-2014-0025"
628  },
629  {
630    "href": "https://www.regulations.gov/docket/NCUA-2014-0020"
631  },
632  {
633    "href": "https://www.regulations.gov/docket/NCUA-2014-0016"
634  },
635  {
636    "href": "https://www.regulations.gov/docket/NCUA-2014-0017"
637  },
638  {
639    "href": "https://www.regulations.gov/docket/NCUA-2014-0045"
640  },
641  {
642    "href": "https://www.regulations.gov/docket/NCUA-2014-0008"
643  },
644  {
645    "href": "https://www.regulations.gov/docket/NCUA-2014-0004"
646  },
647  {
648    "href": "https://www.regulations.gov/docket/NCUA-2014-0007"
649  },
650  {
651    "href": "/About/Pages/budget-strategic-planning/comments.aspx"
652  },
653  {
654    "href": "https://www.regulations.gov/docket/NCUA-2013-0123"
655  },
656  {
657    "href": "https://www.regulations.gov/docket/NCUA-2013-0124"
658  },
659  {
660    "href": "https://www.regulations.gov/docket/NCUA-2013-0122"
661  },
662  {
663    "href": "https://www.regulations.gov/docket/NCUA-2013-0125"
664  },
665  {
666    "href": "https://www.regulations.gov/docket/FRS-2013-0386"
667  },
668  {
669    "href": "https://www.regulations.gov/docket/NCUA-2013-0120"
670  },
671  {
672    "href": "https://www.regulations.gov/docket/NCUA-2013-0112"
673  },
674  {
675    "href": "https://www.regulations.gov/docket/NCUA-2013-0113"
676  },
677  {
678    "href": "https://www.regulations.gov/docket/NCUA-2013-0116"
679  },
680  {
681    "href": "https://www.regulations.gov/docket/NCUA-2013-0114"
682  },
683  {
684    "href": "https://www.regulations.gov/docket/NCUA-2013-0111"
685  },
686  {
687    "href": "https://www.regulations.gov/docket/NCUA-2013-0095"
688  },
689  {
690    "href": "https://www.regulations.gov/docket/NCUA-2013-0096"
691  },
692  {
693    "href": "https://www.regulations.gov/docket/NCUA-2013-0060"
694  },
695  {
696    "href": "https://www.regulations.gov/docket/NCUA-2013-0062"
697  },
698  {
699    "href": "https://www.regulations.gov/docket/FRS-2013-0227"
700  },
701  {
702    "href": "https://www.regulations.gov/docket/NCUA-2013-0037"
703  },
704  {
705    "href": "https://www.regulations.gov/docket/NCUA-2013-0034"
706  },
707  {
708    "href": "https://www.regulations.gov/docket/NCUA-2013-0030"
709  },
710  {
711    "href": "https://www.regulations.gov/docket/NCUA-2013-0029"
712  },
713  {
714    "href": "https://www.regulations.gov/docket/NCUA-2013-0021"
715  },
716  {
717    "href": "https://www.regulations.gov/docket/NCUA-2013-0014"
718  },
719  {
720    "href": "https://www.regulations.gov/docket/NCUA-2013-0013"
721  },
722  {
723    "href": "https://www.regulations.gov/docket/NCUA-2013-0011"
724  },
725  {
726    "href": "https://www.regulations.gov/docket/NCUA-2013-0003"
727  },
728  {
729    "href": "https://www.regulations.gov/docket/NCUA-2013-0005"
730  },
731  {
732    "href": "https://www.regulations.gov/docket/NCUA-2013-0004"
733  },
734  {
735    "href": "https://www.regulations.gov/docket/NCUA-2013-0006"
736  },
737  {
738    "href": "https://www.regulations.gov/docket/NCUA-2012-0041"
739  },
740  {
741    "href": "https://www.regulations.gov/docket/NCUA-2012-0040"
742  },
743  {
744    "href": "https://www.regulations.gov/docket/NCUA-2012-0038"
745  },
746  {
747    "href": "https://www.regulations.gov/docket/NCUA-2012-0015"
748  },
749  {
750    "href": "https://www.regulations.gov/docket/NCUA-2012-0026"
751  },
752  {
753    "href": "https://www.regulations.gov/docket/NCUA-2012-0016"
754  },
755  {
756    "href": "https://www.regulations.gov/docket/NCUA-2012-0017"
757  },
758  {
759    "href": "https://www.regulations.gov/docket/NCUA-2012-0010"
760  },
761  {
762    "href": "https://www.regulations.gov/docket/NCUA-2012-0005"
763  },
764  {
765    "href": "https://www.regulations.gov/docket/NCUA-2011-0073"
766  },
767  {
768    "href": "https://www.regulations.gov/docket/NCUA-2011-0076"
769  },
770  {
771    "href": "https://www.regulations.gov/docket/NCUA-2011-0067"
772  },
773  {
774    "href": "https://www.regulations.gov/docket/NCUA-2011-0066"
775  },
776  {
777    "href": "https://www.regulations.gov/docket/NCUA-2011-0063"
778  },
779  {
780    "href": "https://www.regulations.gov/docket/NCUA-2011-0055"
781  },
782  {
783    "href": "https://www.regulations.gov/docket/NCUA-2011-0048"
784  },
785  {
786    "href": "https://www.regulations.gov/docket/NCUA-2011-0039"
787  },
788  {
789    "href": "https://www.regulations.gov/docket/NCUA-2011-0041"
790  },
791  {
792    "href": "https://www.regulations.gov/docket/NCUA-2011-0034"
793  },
794  {
795    "href": "https://www.regulations.gov/docket/NCUA-2011-0035"
796  },
797  {
798    "href": "https://www.regulations.gov/docket/NCUA-2011-0032"
799  },
800  {
801    "href": "https://www.regulations.gov/docket/NCUA-2011-0028"
802  },
803  {
804    "href": "https://www.regulations.gov/docket/NCUA-2011-0029"
805  },
806  {
807    "href": "https://www.regulations.gov/docket/NCUA-2011-0016"
808  },
809  {
810    "href": "https://www.regulations.gov/docket/NCUA-2011-0014"
811  },
812  {
813    "href": "https://www.regulations.gov/docket/NCUA-2011-0011"
814  },
815  {
816    "href": "https://www.regulations.gov/docket/NCUA-2011-0003"
817  },
818  {
819    "href": "https://www.regulations.gov/docket/NCUA-2010-0060"
820  },
821  {
822    "href": "https://www.regulations.gov/docket/NCUA-2010-0062"
823  },
824  {
825    "href": "https://www.regulations.gov/docket/NCUA-2010-0061"
826  },
827  {
828    "href": "https://www.regulations.gov/docket/NCUA-2010-0051"
829  },
830  {
831    "href": "https://www.regulations.gov/docket/NCUA-2010-0048"
832  },
833  {
834    "href": "https://www.regulations.gov/docket/NCUA-2010-0047"
835  },
836  {
837    "href": "https://www.regulations.gov/docket/NCUA-2010-0045"
838  },
839  {
840    "href": "https://www.regulations.gov/docket/NCUA-2010-0040"
841  },
842  {
843    "href": "https://www.regulations.gov/docket/NCUA-2010-0039"
844  },
845  {
846    "href": "https://www.regulations.gov/docket/NCUA-2010-0035"
847  },
848  {
849    "href": "https://www.regulations.gov/docket/NCUA-2010-0031"
850  },
851  {
852    "href": "https://www.regulations.gov/docket/NCUA-2010-0022"
853  },
854  {
855    "href": "https://www.regulations.gov/docket/NCUA-2010-0020"
856  },
857  {
858    "href": "https://www.regulations.gov/docket/NCUA-2010-0028"
859  },
860  {
861    "href": "https://www.regulations.gov/docket/NCUA-2010-0028"
862  },
863  {
864    "href": "https://www.regulations.gov/docket/NCUA-2010-0005"
865  },
866  {
867    "href": "https://www.regulations.gov/docket/NCUA-2010-0003"
868  }
869]

src/linksdata.js

1const { chromium } = require("playwright");
2const fs = require('fs').promises;
3const path = require('path');
4
/**
 * Scrapes the NCUA "rulemakings and proposals for comment" page and stores the
 * `href` attribute of every link found in the main table as an array of
 * `{ href }` objects in `links.json`.
 *
 * The file is written next to this script (resolved via `__dirname`), which is
 * the same location src/get.js reads it from, so the result does not depend
 * on the directory the process was started in.
 *
 * Failures while navigating, extracting or writing are logged and swallowed;
 * the browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function scrapeLinks() {
  const browser = await chromium.launch();
  const page = await browser.newPage();

  try {
    await page.goto("https://ncua.gov/regulation-supervision/rulemakings-proposals-comment");

    // Raw attribute values are kept as-is, so relative links stay relative.
    const links = await page.$$eval(".dynamic-table-main tbody tr td a", (elements) => {
      return elements.map((element) => ({
        href: element.getAttribute("href"),
      }));
    });

    // The directory of this script always exists, so no mkdir is needed.
    const filePath = path.join(__dirname, 'links.json');

    await fs.writeFile(filePath, JSON.stringify(links, null, 2));
    console.log(`Links saved to ${filePath}`);
  } catch (error) {
    console.error("Error:", error);
  } finally {
    await browser.close();
  }
}
34
35scrapeLinks();

src/main.js

1const { Actor } = require("apify");
2const { PlaywrightCrawler, Dataset } = require("crawlee");
3const { getLinks } = require("./get");
4const fs = require('fs').promises;
5const path = require('path');
6
/**
 * Actor entry point: loads the docket links via `getLinks()`, keeps the ones
 * that are absolute http(s) URLs, visits each with a PlaywrightCrawler and
 * pushes `{ url, paragraphs }` (or `{ url, errorMessage }` on failure) to the
 * default dataset.
 *
 * @throws {Error} When no usable link is found.
 */
Actor.main(async () => {
    // Stored as the only paragraph when a page yields no paragraph text.
    const NOT_FOUND_TEXT = "We're sorry, an error has occurred\n\nA general error occurred while processing your request.\n\nWhat could have caused this?\n\nThe link may be outdated or incorrect.\n\nThe document or docket you are looking for was not found.\n\nWhat can you do?\n\nPlease try typing in the URL again or return to the home page.\n\nIf you believe you should not be getting this error page or the problem persists, please contact the Help Desk";

    const initialArrayOfObjects = await getLinks();
    const links = initialArrayOfObjects.map((obj) => obj.href);

    const validLinks = links.filter(
        (link) => typeof link === 'string' && link.trim() !== '' && link.startsWith('http'),
    );
    console.log(validLinks.length);
    if (validLinks.length === 0) {
        throw new Error("No valid links found");
    }

    const crawler = new PlaywrightCrawler({
        async requestHandler({ request, page, log }) {
            // Declared outside the try so the catch block below can use it.
            const { url } = request;

            try {
                log.info(`Processing ${url}`);

                // No explicit page.goto(): the crawler has already navigated
                // `page` to the request URL before invoking this handler.

                // A failed wait (e.g. timeout) means the selector is absent.
                let selectorExists = true;
                try {
                    await page.waitForSelector('//*[@class="px-2"]');
                } catch {
                    selectorExists = false;
                }

                let paragraphs = [];
                if (selectorExists) {
                    paragraphs = await page.$$eval('p', (elements) =>
                        elements.map((element) => element.innerText.trim()));
                }

                // Apply the fallback BEFORE building the record so it is
                // actually stored; kept as an array so the field type is stable.
                if (paragraphs.length === 0) {
                    paragraphs = [NOT_FOUND_TEXT];
                }

                // Push data to the dataset
                await Dataset.pushData({ url, paragraphs });
                log.info(`Data pushed to dataset for URL: ${url}`);
            } catch (error) {
                log.error(`An error occurred while processing URL ${url}: ${error}`);

                // Push error data to the dataset
                const errorData = { url, errorMessage: error.toString() };
                await Dataset.pushData(errorData);

                log.error(`Error data pushed to dataset for URL: ${url}`);
            }
        },
    });
    await crawler.addRequests(validLinks);

    // Run the crawler
    await crawler.run();
});

src/routes.js

1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5const fs = require("fs");
6
/**
 * Default route: logs the page title and enqueues the section links found in
 * `.field-type-text_with_summary h2`, tagging each with the label of the
 * handler that should process it.
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // One entry per section; the label selects the handler for the enqueued pages.
    const sections = [
        {
            selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
            label: "detail5",
        },
        {
            selector: ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
            label: "detail6",
        },
        {
            selector: ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
            label: "detail7",
        },
    ];

    // Enqueued one after another, in the same order as before.
    for (const { selector, label } of sections) {
        await enqueueLinks({ selector, label });
    }
});
74
// Substring looked for in a request's recorded error messages.
const OVERSIZE_MARKER = "Data item is too large";
// Placeholder stored instead of the text when a record has to be slimmed down.
const MANUAL_RETRIEVAL_TEXT = "Please retrieve manually due to size limitations";

/**
 * Extracts the page summary and its links. This function is handed to
 * `page.evaluate`, so it executes in the browser and must not reference
 * anything from the Node.js scope.
 *
 * @returns {{Category: string, Title: string, MainParagraphText: string,
 *            Links: Array<{linkText: string, link: string}>,
 *            PDFs: Array<{linkText: string, link: string}>}}
 */
function collectPageData() {
    const result = {
        Category: "Rules and Regulations",
        Title: document.querySelector(".page-title")?.innerText || "N/A",
        MainParagraphText:
            document.querySelector(".field-type-text_with_summary")?.innerText || "N/A",
        Links: [],
        PDFs: [],
    };

    for (const el of document.querySelectorAll(".field-type-text_with_summary a")) {
        const entry = {
            linkText: el.innerText || "N/A",
            link: el.href || "",
        };

        // Skip anchors whose text parses as a number (presumably footnote
        // markers), mail links, and anchors without an href.
        const hasNumericText = !Number.isNaN(Number(entry.linkText));
        if (hasNumericText || entry.link.includes("mailto") || entry.link === "") {
            continue;
        }

        if (entry.link.endsWith(".pdf")) {
            result.PDFs.push(entry);
        } else {
            result.Links.push(entry);
        }
    }

    return result;
}

/**
 * Downloads and parses every PDF in `pdfs`. A failure for one PDF is recorded
 * on that entry as `error` and does not affect the others.
 *
 * @param {object} page - Playwright page whose request context is used to fetch.
 * @param {Array<{linkText: string, link: string}>} pdfs
 * @returns {Promise<Array<object>>} The entries, each extended with `text` or `error`.
 */
async function fetchPdfTexts(page, pdfs) {
    return Promise.all(
        pdfs.map(async (pdf) => {
            try {
                const pdfResponse = await page.request.fetch(pdf.link);
                // Pass the Buffer itself: its `.buffer` is the backing
                // ArrayBuffer, which may be larger than the response body.
                const pdfText = await pdfParse(await pdfResponse.body());
                return { ...pdf, text: pdfText.text };
            } catch (e) {
                return { ...pdf, error: e.message || e.code || true };
            }
        }),
    );
}

/**
 * Tells whether any error recorded on the request mentions an oversized
 * dataset item. Uses a substring search because a recorded message may hold
 * more than the bare marker text.
 *
 * NOTE(review): the handlers below log their own errors without rethrowing;
 * confirm that a failed `pushData` really ends up in `request.errorMessages`
 * before relying on this path.
 *
 * @param {object} request - Crawlee request.
 * @returns {boolean}
 */
function hasOversizeError(request) {
    return request.errorMessages.some((message) => message.includes(OVERSIZE_MARKER));
}

/**
 * Returns copies of `items` whose `text` is replaced by the manual-retrieval notice.
 *
 * @param {Array<object>} items
 * @returns {Array<object>}
 */
function withManualNotice(items) {
    return items.map((item) => ({ ...item, text: MANUAL_RETRIEVAL_TEXT }));
}

/**
 * Shared implementation of the "detail6" and "detail7" routes: extracts the
 * page data, fetches the text of every linked PDF and stores the record.
 */
async function handlePageWithPdfs({ request, page, log }) {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(collectPageData);
        const PDFs = await fetchPdfTexts(page, result.PDFs);

        // If the request has large data errors, mark the data for manual processing
        if (hasOversizeError(request)) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: withManualNotice(PDFs),
                Links: withManualNotice(result.Links),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
}

// "detail4": page extraction is disabled for this route, so only the URL is
// stored. No enqueueLinks call in this file assigns the "detail4" label.
router.addHandler("detail4", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        await Dataset.pushData({
            url2: request.url,
        });
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});

// "detail5": stores the extracted page data without downloading the PDFs.
router.addHandler("detail5", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(collectPageData);

        if (hasOversizeError(request)) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: withManualNotice(result.PDFs),
                Links: withManualNotice(result.Links),
            });
        } else {
            await Dataset.pushData({
                url2: request.url,
                ...result,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});

router.addHandler("detail6", handlePageWithPdfs);

router.addHandler("detail7", handlePageWithPdfs);

module.exports = { router };
Developer
Maintained by Community
Categories