R R I avatar

R R I

Try for free

No credit card required

Go to Store
R R I

R R I

nondescript_cord/r-r-i
Try for free

No credit card required

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

crawlee.json

1{
2	"purgeOnStart": false
3}

Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, rebuilds will be really fast
24# for most source file changes.
25COPY --chown=myuser . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

package.json

1{
2	"name": "consumerfinance",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumerfinance rules extractor",
6	"dependencies": {
7		"22": "^0.0.0",
8		"apify": "^3.1.10",
9		"crawlee": "^3.5.4",
10		"fs": "^0.0.1-security",
11		"node-fetch": "^3.3.2",
12		"path": "^0.12.7",
13		"pdf-parse": "^1.1.1",
14		"playwright": "^1.43.1"
15	},
16	"devDependencies": {
17		"@apify/eslint-config": "^0.4.0",
18		"@types/pdf-parse": "^1.1.4",
19		"eslint": "^8.50.0",
20		"@playwright/test": "^1.43.1"
21	},
22	"scripts": {
23		"start": "node src/main.js",
24		"lint": "eslint ./src --ext .js,.jsx",
25		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
26		"test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1",
27		"postinstall": "npx crawlee install-playwright-browsers"
28	},
29	"author": "Moazzam Malek"
30}

start.bat

Download

start.sh

Download

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "R-R-I",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, rebuilds will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/main.js

1const { Actor } = require("apify");
2const { PlaywrightCrawler, Dataset } = require("crawlee");
3
// Entry point: builds a PlaywrightCrawler around the router exported by
// ./routes.js, crawls from a single seed URL, then exports the default
// dataset under the key "OUTPUT" as both CSV and JSON.
Actor.main(async () => {
    const { router: requestHandler } = require("./routes.js");

    // The crawl is seeded with one hard-coded page; actor input is not read here.
    const startUrls = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // No proxy configuration is passed to the crawler.
    const crawlerOptions = {
        maxConcurrency: 3,
        maxRequestRetries: 5,
        launchContext: {
            // NOTE(review): presumably meant to load pages with JavaScript
            // turned off; confirm this option is honoured at launch time.
            launchOptions: { javaScriptEnabled: false },
        },
        requestHandler,
        // Both limits are expressed in seconds.
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    };

    const crawler = new PlaywrightCrawler(crawlerOptions);
    await crawler.run(startUrls);

    // Export only after the crawl has finished so every pushed item is included.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});

src/routes.js

1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5const fs = require("fs");
6
/**
 * Default handler for requests that carry no label (the seed page).
 *
 * Logs the page title and enqueues the section links found in the page's main
 * text field, tagging each with the label of the handler that should process
 * it.
 *
 * @param {object} context - Crawling context supplied by the router.
 * @param {object} context.request - The request being handled.
 * @param {object} context.page - The Playwright page for the request.
 * @param {Function} context.enqueueLinks - Helper that enqueues links matching a selector.
 * @param {object} context.log - Logger for this request.
 * @returns {Promise<void>}
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    // The original also did `x = title`, which leaked an implicit global that
    // nothing visible in this file reads; it has been removed.

    // href fragment of each section link -> label of the handler that scrapes it.
    const sectionRoutes = [
        { hrefFragment: "regulatory-review", label: "detail5" },
        { hrefFragment: "regulatory-reform-agenda", label: "detail6" },
        { hrefFragment: "interpretive-rulings-policy-statements", label: "detail7" },
    ];

    // Enqueue one section at a time, in the same order as before.
    for (const { hrefFragment, label } of sectionRoutes) {
        await enqueueLinks({
            selector: `.field-type-text_with_summary h2 a[href*=${hrefFragment}]`,
            label,
        });
    }
});
29
/** Substring that identifies the dataset's "item exceeds the size limit" error. */
const ITEM_TOO_LARGE_MARKER = "Data item is too large";

/** Placeholder written in place of bulky text when an item has to be slimmed down. */
const MANUAL_RETRIEVAL_NOTE = "Please retrieve manually due to size limitations";

/**
 * Runs INSIDE the browser page (passed to `page.evaluate`), so it must not
 * reference anything from the enclosing Node.js module.
 *
 * Reads the page title and main text field, and sorts the anchors found in
 * that field into regular links and links ending in ".pdf".
 *
 * @returns {{Category: string, Title: string, MainParagraphText: string,
 *            Links: Array<{linkText: string, link: string}>,
 *            PDFs: Array<{linkText: string, link: string}>}}
 */
function scrapeRegulationPage() {
    const result = {
        Category: "Rules and Regulations",
        Title: document.querySelector(".page-title")?.innerText || "N/A",
        MainParagraphText:
            document.querySelector(".field-type-text_with_summary")?.innerText || "N/A",
        Links: [],
        PDFs: [],
    };

    for (const el of document.querySelectorAll(".field-type-text_with_summary a")) {
        const entry = {
            linkText: el.innerText || "N/A",
            link: el.href || "",
        };

        // Skip anchors whose text is purely numeric, mailto links, and
        // anchors without a target.
        const hasNumericText = !Number.isNaN(Number(entry.linkText));
        if (hasNumericText || entry.link.includes("mailto") || entry.link === "") {
            continue;
        }

        if (entry.link.endsWith(".pdf")) {
            result.PDFs.push(entry);
        } else {
            result.Links.push(entry);
        }
    }

    return result;
}

/**
 * Downloads one PDF through the page's request context and extracts its text.
 * Never rejects: any failure is reported on the returned object instead, so a
 * single broken PDF cannot fail the whole page.
 *
 * @param {object} page - Playwright page whose request context is used for the download.
 * @param {{linkText: string, link: string}} pdf - Link entry to download.
 * @returns {Promise<object>} The entry plus either `text` or `error`.
 */
async function fetchPdfText(page, pdf) {
    try {
        const response = await page.request.fetch(pdf.link);
        if (!response.ok()) {
            throw new Error(`HTTP ${response.status()} ${response.statusText()}`);
        }

        // Hand pdf-parse the Buffer itself. Its `.buffer` property is the
        // backing ArrayBuffer, which can be a larger shared allocation with
        // the PDF bytes sitting at an offset.
        const parsed = await pdfParse(await response.body());
        return { ...pdf, text: parsed.text };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * Returns a copy of `item` with the extracted PDF text replaced by a
 * placeholder, and the same placeholder attached to every link entry.
 *
 * @param {object} item - Dataset item with `PDFs` and `Links` arrays.
 * @returns {object} Slimmed-down copy; `item` itself is not modified.
 */
function withoutBulkyText(item) {
    const annotate = (entry) => ({ ...entry, text: MANUAL_RETRIEVAL_NOTE });
    return {
        ...item,
        PDFs: item.PDFs.map(annotate),
        Links: item.Links.map(annotate),
    };
}

/**
 * Pushes `item` to the dataset, falling back to the slimmed-down version when
 * the dataset rejects it for being too large (either now, or on an earlier
 * attempt recorded in `request.errorMessages`).
 *
 * @param {object} request - The request being handled.
 * @param {object} item - Full dataset item.
 * @returns {Promise<void>}
 * @throws Re-throws any push failure that is not a size-limit error.
 */
async function pushWithSizeFallback(request, item) {
    // errorMessages holds whole messages, so look for the marker inside each
    // one rather than testing the array for an exact element.
    const failedOnSizeBefore = (request.errorMessages ?? []).some((message) =>
        String(message).includes(ITEM_TOO_LARGE_MARKER)
    );
    if (failedOnSizeBefore) {
        await Dataset.pushData(withoutBulkyText(item));
        return;
    }

    try {
        await Dataset.pushData(item);
    } catch (error) {
        if (!String(error?.message).includes(ITEM_TOO_LARGE_MARKER)) {
            throw error;
        }
        await Dataset.pushData(withoutBulkyText(item));
    }
}

/**
 * Shared handler for the "detail5", "detail6" and "detail7" labels: scrapes
 * the page, extracts text from every linked PDF, and stores one dataset item.
 *
 * Errors are logged and swallowed (as in the original handlers), so a failing
 * page is not retried by the crawler.
 *
 * @param {object} context - Crawling context supplied by the router.
 * @param {object} context.request - The request being handled.
 * @param {object} context.page - The Playwright page for the request.
 * @param {object} context.log - Logger for this request.
 * @returns {Promise<void>}
 */
async function handleDetailPage({ request, page, log }) {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(scrapeRegulationPage);

        // fetchPdfText never rejects, so Promise.all cannot fail fast here.
        const PDFs = await Promise.all(
            result.PDFs.map((pdf) => fetchPdfText(page, pdf))
        );

        await pushWithSizeFallback(request, {
            url: request.url,
            ...result,
            PDFs,
        });
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
}

// The three section pages are scraped identically, so they share one handler.
for (const label of ["detail5", "detail6", "detail7"]) {
    router.addHandler(label, handleDetailPage);
}
337
338module.exports = { router };
Developer
Maintained by Community

Actor Metrics

  • 2 monthly users

  • 1 star

  • >99% runs succeeded

  • Created in May 2024

  • Modified 8 months ago