Letters To Credit Unions And Other Guidance avatar
Letters To Credit Unions And Other Guidance
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Letters To Credit Unions And Other Guidance

Letters To Credit Unions And Other Guidance

nondescript_cord/letters-to-credit-unions-and-other-guidance

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

crawlee.json

1{
2	"purgeOnStart": false
3}

package.json

1{
2	"name": "consumerfinance",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumerfinance rules extractor",
6	"dependencies": {
7		"22": "^0.0.0",
8		"apify": "^3.1.10",
9		"crawlee": "^3.5.4",
10		"node-fetch": "^3.3.2",
11		"pdf-parse": "^1.1.1",
12		"playwright": "*"
13	},
14	"devDependencies": {
15		"@apify/eslint-config": "^0.4.0",
16		"@types/pdf-parse": "^1.1.4",
17		"eslint": "^8.50.0"
18	},
19	"scripts": {
20		"start": "node src/main.js",
21		"lint": "eslint ./src --ext .js,.jsx",
22		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
23		"test": "echo \"Error: no tests yet\" && exit 1",
24		"postinstall": "npx crawlee install-playwright-browsers"
25	},
26	"author": "Moazzam Malek"
27}

start.bat

Download

start.sh

Download

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "Letters-to-Credit-Unions-and-Other-Guidance",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");

// First page of the NCUA "Letters to Credit Unions & Other Guidance" listing,
// sorted by date descending. Pagination is followed by the default route handler.
const START_URLS = [
    "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=",
];

/**
 * Actor entry point.
 *
 * Builds a PlaywrightCrawler wired to the router exported by ./routes.js,
 * crawls the listing starting at START_URLS, and then exports the default
 * dataset to the default key-value store in both CSV and JSON form.
 */
Actor.main(async () => {
    // Loaded lazily inside Actor.main, as in the original script.
    const { router } = require("./routes.js");

    // Proxy support is currently disabled; to enable it, create a configuration with
    // `await Actor.createProxyConfiguration()` and pass it as `proxyConfiguration` below.
    const crawler = new PlaywrightCrawler({
        maxConcurrency: 3,
        launchContext: {
            // NOTE(review): `javaScriptEnabled` is documented by Playwright as a browser
            // *context* option; confirm it takes effect when passed via launchOptions.
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(START_URLS);

    // Each export is stored as a single key-value store record under the given key.
    // Writing both formats to "OUTPUT" made the JSON export replace the CSV one, so the
    // CSV now gets its own key while the JSON export keeps the original "OUTPUT" key.
    await Dataset.exportToCSV("OUTPUT_CSV");
    await Dataset.exportToJSON("OUTPUT");
});

src/routes.js

1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5// const fetch = require("node-fetch");
6// const { default: fetch } = await import("node-fetch");
7const fs = require("fs");
8const { promisify } = require("util");
9
/**
 * Downloads one PDF, extracts its text with pdf-parse and pushes the outcome
 * to the default dataset.
 *
 * On success the item is `{ url, pdftext }`; on any download/parse failure the
 * item is `{ link, error }` (key names kept as in the original output).
 *
 * @param {string} pdfLink Absolute URL of the PDF to download.
 * @returns {Promise<void>} Resolves once the dataset item has been pushed.
 */
async function fetchPDF(pdfLink) {
    try {
        // node-fetch v3 is ESM-only, so it has to be loaded with a dynamic import
        // from this CommonJS module.
        const { default: fetch } = await import("node-fetch");
        const response = await fetch(pdfLink);
        if (!response.ok) {
            // Avoid feeding an HTML error page to the PDF parser.
            throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
        // pdf-parse is documented to take a Node Buffer, so wrap the ArrayBuffer.
        const pdfBuffer = Buffer.from(await response.arrayBuffer());
        const pdfText = await pdfParse(pdfBuffer);
        await Dataset.pushData({
            url: pdfLink,
            pdftext: pdfText.text,
        });
    } catch (error) {
        await Dataset.pushData({
            link: pdfLink,
            error: `Error fetching or parsing PDF from ${pdfLink} and ${error}`,
        });
    }
}

/**
 * Default handler (listing pages).
 *
 * - Enqueues the "next page" link so the whole listing is walked.
 * - Enqueues every link found in the results table with the "detail" label.
 * - Table links that point straight at a PDF are not enqueued; they are
 *   downloaded and parsed here via fetchPDF instead.
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Pagination.
    await enqueueLinks({
        selector: "#edit-next-container a",
    });

    // PDF URLs are only collected inside the synchronous transform callback;
    // the downloads are started and awaited afterwards so that no promise is
    // left floating when the handler returns.
    const pdfUrls = [];
    await enqueueLinks({
        selector: "tbody td a",
        label: "detail",
        transformRequestFunction(candidate) {
            if (candidate.url.endsWith(".pdf")) {
                log.info(`PDF ${candidate.url}`);
                pdfUrls.push(candidate.url);
                return false; // do not open PDFs in the browser
            }
            return candidate;
        },
    });

    // Wait for every PDF to be processed before the request is considered done;
    // otherwise the crawl (and the final export) could finish before the data is pushed.
    await Promise.all(pdfUrls.map((pdfUrl) => fetchPDF(pdfUrl)));
});
85
// Substring of the storage error raised when a dataset item exceeds the size limit.
const DATA_TOO_LARGE_MARKER = "Data item is too large";
// Placeholder written instead of bulky text when an item has to be trimmed.
const MANUAL_RETRIEVAL_NOTE = "Please retrieve manually due to size limitations";

/**
 * Turns an href found on a fetched page into an absolute URL.
 *
 * @param {string} href Raw href attribute value.
 * @param {string} pageUrl URL of the page the href was found on.
 * @returns {string} Absolute URL (hrefs already starting with "http" are returned as-is).
 */
function toAbsoluteUrl(href, pageUrl) {
    if (href.startsWith("http")) {
        return href;
    }
    try {
        // Resolve against the page the link came from, so relative links on
        // non-ncua.gov pages are not attributed to ncua.gov.
        return new URL(href, pageUrl).href;
    } catch {
        // Unparseable input: fall back to the original site-prefix behaviour.
        return `https://ncua.gov${href}`;
    }
}

/**
 * Downloads a PDF through the page's request context and returns its text.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {string} pdfUrl URL of the PDF.
 * @returns {Promise<string>} Text extracted by pdf-parse.
 */
async function fetchPdfText(page, pdfUrl) {
    const response = await page.request.fetch(pdfUrl);
    // body() resolves to a Node Buffer. Pass it as-is: its `.buffer` property is the
    // backing ArrayBuffer, which may be larger than (or offset from) the actual bytes.
    const parsed = await pdfParse(await response.body());
    return parsed.text;
}

/**
 * Expands one non-PDF link from the detail page: fetches it and collects the
 * paragraph text plus the links found inside `.layout-content`.
 *
 * Never rejects; on failure the returned record carries an `error` field.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {{linkText: string, link: string}} link Link record from the detail page.
 * @param {object} log Crawlee logger of the current request.
 * @returns {Promise<object>} `{ ...link, innerText, innerLinks, PDFLinks }` or `{ ...link, error }`.
 */
async function expandLink(page, link, log) {
    // Local defaults: links that are skipped below still get a well-formed record
    // instead of picking up values left behind by another, concurrently processed link.
    let innerText = "";
    let innerLinks = [];
    let PDFLinks = [];
    try {
        if (!link.link.includes(".pdf")) {
            const response = await page.request.fetch(link.link);
            const $ = load(await response.text());
            const hrefs = $(".layout-content")
                .find("a")
                .map((i, el) => $(el).attr("href"))
                .get()
                .filter((href) => typeof href === "string");
            const absoluteLinks = [
                ...new Set(hrefs.map((href) => toAbsoluteUrl(href, link.link))),
            ];
            PDFLinks = absoluteLinks.filter((href) => href.endsWith(".pdf"));
            // Drop PDFs (reported separately), NCUA mail addresses and footnote anchors.
            innerLinks = absoluteLinks.filter(
                (href) =>
                    !href.endsWith(".pdf") &&
                    !href.endsWith("@ncua.gov") &&
                    !href.includes("#ftn")
            );
            innerText = $("p").text();
        }
        return { ...link, innerText, innerLinks, PDFLinks };
    } catch (e) {
        // The stored error text is kept unchanged for output compatibility;
        // the real cause is logged so it is not silently lost.
        log.warning(`Failed to expand link ${link.link}: ${e.message || e.code}`);
        return { ...link, error: "404 page not found" };
    }
}

/**
 * Downloads and parses every PDF listed in `linkRecord.PDFLinks`, one after another.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {object} linkRecord Record produced by expandLink.
 * @returns {Promise<object>} `{ ...linkRecord, pdfDataArray }`, one `{ link, text }` or
 *   `{ link, error }` entry per PDF.
 */
async function parseInnerPdfs(page, linkRecord) {
    const pdfDataArray = [];
    // Records that failed to expand have no PDFLinks; treat that as "no PDFs"
    // rather than replacing their original error with an iteration TypeError.
    for (const pdfLink of linkRecord.PDFLinks ?? []) {
        try {
            pdfDataArray.push({
                link: pdfLink,
                text: await fetchPdfText(page, pdfLink),
            });
        } catch (innerError) {
            pdfDataArray.push({
                link: pdfLink,
                error: innerError.message || innerError.code || true,
            });
        }
    }
    return { ...linkRecord, pdfDataArray };
}

/**
 * Downloads and parses a PDF linked directly from the detail page.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {{linkText: string, link: string}} pdf PDF link record from the detail page.
 * @returns {Promise<object>} `{ ...pdf, text }` or `{ ...pdf, error }`.
 */
async function parseDirectPdf(page, pdf) {
    try {
        return { ...pdf, text: await fetchPdfText(page, pdf.link) };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * Replaces the bulky text fields of a record with the manual-retrieval note.
 *
 * @param {object} item Link or PDF record.
 * @returns {object} Copy of the record with `text` (and `innerText`, if present) replaced.
 */
function withManualRetrievalNote(item) {
    const trimmed = { ...item, text: MANUAL_RETRIEVAL_NOTE };
    if ("innerText" in item) {
        // innerText holds the fetched page text, i.e. the part that makes link records large.
        trimmed.innerText = MANUAL_RETRIEVAL_NOTE;
    }
    return trimmed;
}

/**
 * "detail" handler: scrapes one letter/guidance page, expands the links and
 * PDFs found in its body, and pushes a single record to the default dataset.
 *
 * Errors are logged and swallowed (best effort), so a failing page does not
 * fail the request.
 */
router.addHandler("detail", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Runs in the browser: reads the page fields and splits the body links
        // into PDF links and other links.
        const result = await page.evaluate(() => {
            const textOf = (selector) =>
                document.querySelector(selector)?.innerText || "N/A";
            const scraped = {
                Docket: textOf("span[class*=docket]"),
                Date: textOf("span[class*=date]"),
                Category: textOf("a[href*=letters-credit-unions-other-guidance]"),
                Title: textOf(".pseudo-title"),
                MainParagraphText: textOf(".row.no-gutters .body"),
                Links: [],
                PDFs: [],
            };

            for (const el of document.querySelectorAll(".row.no-gutters .body a")) {
                const entry = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Skip links whose text is purely numeric, mail links and anchors without href.
                const isNumericLabel = !Number.isNaN(Number(entry.linkText));
                if (isNumericLabel || entry.link.includes("mailto") || entry.link === "") {
                    continue;
                }
                if (entry.link.endsWith(".pdf")) {
                    scraped.PDFs.push(entry);
                } else {
                    scraped.Links.push(entry);
                }
            }
            return scraped;
        });

        // The helpers below never reject, so Promise.all cannot short-circuit here.
        const Links = await Promise.all(
            result.Links.map((link) => expandLink(page, link, log))
        );
        const InnerPDFs = await Promise.all(
            Links.map((linkRecord) => parseInnerPdfs(page, linkRecord))
        );
        const PDFs = await Promise.all(
            result.PDFs.map((pdf) => parseDirectPdf(page, pdf))
        );

        // errorMessages holds full messages, so look for the marker as a substring.
        const failedOnSizeBefore = (request.errorMessages ?? []).some((message) =>
            String(message).includes(DATA_TOO_LARGE_MARKER)
        );

        if (!failedOnSizeBefore) {
            try {
                await Dataset.pushData({
                    url: request.url,
                    ...result,
                    Links,
                    PDFs,
                    InnerPDFs,
                });
                return;
            } catch (pushError) {
                if (!String(pushError?.message).includes(DATA_TOO_LARGE_MARKER)) {
                    throw pushError;
                }
                // Errors are swallowed by the outer catch and the request is never
                // retried, so the trimmed record has to be stored right away.
                log.warning(`Item for ${request.url} is too large; storing a trimmed record.`);
            }
        }

        // Oversized item: keep the structure but mark the bulky text for manual retrieval.
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map(withManualRetrievalNote),
            Links: Links.map(withManualRetrievalNote),
        });
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});

module.exports = { router };
Developer
Maintained by Community
Categories