Letters To Credit Unions And Other Guidance

Deprecated

Developed by Yash Agarwal
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 2
Monthly users: 2
Last modified: a year ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

crawlee.json

{
    "purgeOnStart": false
}

package.json

{
    "name": "consumerfinance",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumerfinance rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "node-fetch": "^3.3.2",
        "pdf-parse": "^1.1.1",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Moazzam Malek"
}

start.bat

start.sh

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "Letters-to-Credit-Unions-and-Other-Guidance",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
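
For illustration only, an input conforming to this schema (using the NCUA listing page that src/main.js currently hardcodes) would look like:

{
    "startUrls": [
        { "url": "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=" }
    ]
}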

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");

Actor.main(async () => {
    const { router } = require("./routes.js");
    const startUrls = [
        "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
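
Note that the start URL above is hardcoded, so the `startUrls` field defined in .actor/input_schema.json is never actually read. A minimal sketch of wiring the actor input into the same crawler setup (an illustrative assumption, not part of the original code) could look like this:

const { Actor } = require("apify");
const { PlaywrightCrawler } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    // Fall back to the NCUA listing page when the actor receives no input.
    const input = (await Actor.getInput()) ?? {};
    const startUrls = input.startUrls ?? [
        { url: "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=" },
    ];

    const crawler = new PlaywrightCrawler({
        maxConcurrency: 3,
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    // crawler.run() accepts plain { url } objects as produced by the
    // "requestListSources" input editor.
    await crawler.run(startUrls);
});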

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");
// const fetch = require("node-fetch");
// const { default: fetch } = await import("node-fetch");
const fs = require("fs");
const { promisify } = require("util");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Follow the pagination ("next") links on the listing page.
    await enqueueLinks({
        selector: "#edit-next-container a",
    });

    // Enqueue the individual guidance pages. Direct PDF links are parsed
    // right away by fetchPDF() instead of being enqueued as requests.
    await enqueueLinks({
        selector: "tbody td a",
        label: "detail",
        transformRequestFunction(request) {
            if (request.url.endsWith(".pdf")) {
                log.info(`PDF ${request.url}`);
                fetchPDF(request.url);
                return false;
            } else {
                return request;
            }
        },
    });

    async function fetchPDF(pdfLink) {
        const { default: fetch } = await import("node-fetch");
        try {
            // const mkdirAsync = promisify(fs.mkdir);
            // const directory = "storage/PDFs";
            // await mkdirAsync(directory, { recursive: true });
            // const writeFileAsync = promisify(fs.writeFile);
            const response = await fetch(pdfLink);
            const buffer = await response.arrayBuffer();
            // pdf-parse expects a Node Buffer, so convert the ArrayBuffer first.
            const pdfText = await pdfParse(Buffer.from(buffer));
            // const serialNumber = pdfLink.substring(
            //     pdfLink.lastIndexOf("/") + 1,
            //     pdfLink.lastIndexOf(".")
            // );
            // const filename = `storage/PDFs/${serialNumber}.json`;
            // const jsonData = JSON.stringify(
            //     { link: pdfLink, text: pdfText.text },
            //     null,
            //     2
            // );
            // await writeFileAsync(filename, jsonData);
            // console.log(`JSON file "${filename}" created successfully.`);
            await Dataset.pushData({
                url: pdfLink,
                pdftext: pdfText.text,
            });
        } catch (error) {
            // (Previously the error was also written to storage/PDFs/<serial>.json.)
            await Dataset.pushData({
                link: pdfLink,
                error: `Error fetching or parsing PDF from ${pdfLink} and ${error}`,
            });
        }
    }
});

router.addHandler("detail", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        const url = request.loadedUrl;
        log.info(`${title}`, { url: request.loadedUrl });

        // Scrape the structured fields of the guidance page and collect its
        // outgoing links, separating PDFs from regular pages.
        const result = await page.evaluate(() => {
            const result = {
                Docket:
                    document.querySelector("span[class*=docket]")?.innerText || "N/A",
                Date:
                    document.querySelector("span[class*=date]")?.innerText || "N/A",
                Category:
                    document.querySelector(
                        "a[href*=letters-credit-unions-other-guidance]"
                    )?.innerText || "N/A",
                Title:
                    document.querySelector(".pseudo-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".row.no-gutters .body")?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(".row.no-gutters .body a");
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                const numericValue = Number(obj.linkText);

                // Skip numeric footnote links, mailto links and empty hrefs.
                if (
                    isNaN(numericValue) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        result.PDFs.push(obj);
                    } else {
                        result.Links.push(obj);
                    }
                }
            }

            return result;
        });

        // Fetch every linked page, extract its paragraph text and collect the
        // links (PDF and non-PDF) found inside it.
        const Links = (
            await Promise.allSettled(
                result.Links.map(
                    (link) =>
                        new Promise(async (res, rej) => {
                            try {
                                let innerText;
                                let innerLinks;
                                let PDFLinks;
                                if (!link.link.includes(".pdf")) {
                                    const FederalRegisterResponse =
                                        await page.request.fetch(link.link);
                                    const $ = load(await FederalRegisterResponse.text());
                                    const contentDiv = $(".layout-content");
                                    innerLinks = contentDiv
                                        .find("a")
                                        .map((i, el) => $(el).attr("href"))
                                        .get();
                                    innerLinks = innerLinks.map((innerLink) => {
                                        if (!innerLink.startsWith("http")) {
                                            return "https://ncua.gov" + innerLink;
                                        }
                                        return innerLink;
                                    });
                                    innerLinks = Array.from(new Set(innerLinks));
                                    PDFLinks = innerLinks.filter((link) =>
                                        link.endsWith(".pdf")
                                    );
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.endsWith(".pdf")
                                    );
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.endsWith("@ncua.gov")
                                    );
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.includes("#ftn")
                                    );
                                    innerText = $("p").text();
                                }
                                res({
                                    ...link,
                                    innerText,
                                    innerLinks,
                                    PDFLinks,
                                });
                            } catch (e) {
                                // console.log(e);
                                res({
                                    ...link,
                                    // error: e.message || e.code || true,
                                    error: "404 page not found",
                                });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // Download and parse every PDF discovered inside the linked pages.
        const InnerPDFs = (
            await Promise.allSettled(
                Links.map(
                    (pdf) =>
                        new Promise(async (res, rej) => {
                            try {
                                const pdfDataArray = [];

                                // Loop through all the PDF links in the `PDFLinks` array
                                // (it is undefined when the linked page could not be fetched).
                                for (const pdfLink of pdf.PDFLinks ?? []) {
                                    try {
                                        // Fetch the PDF content from the current link
                                        const link = pdfLink;
                                        const pdfResponse = await page.request.fetch(pdfLink);

                                        // Parse the fetched PDF using pdf-parse
                                        const pdfText = await pdfParse(await pdfResponse.body());

                                        // Store the parsed information for this PDF
                                        pdfDataArray.push({
                                            link,
                                            text: pdfText.text,
                                        });
                                    } catch (innerError) {
                                        pdfDataArray.push({
                                            link: pdfLink,
                                            error:
                                                innerError.message ||
                                                innerError.code ||
                                                true,
                                        });
                                    }
                                }

                                res({
                                    ...pdf,
                                    pdfDataArray,
                                });
                            } catch (e) {
                                // console.log(e);
                                res({
                                    ...pdf,
                                    error: e.message || e.code || true,
                                });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // Download and parse the PDFs linked directly from the guidance page.
        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(
                    (pdf) =>
                        new Promise(async (res, rej) => {
                            try {
                                const pdfResponse = await page.request.fetch(pdf.link);

                                // Parse the PDF using pdf-parse
                                const pdfText = await pdfParse(await pdfResponse.body());

                                res({
                                    ...pdf,
                                    text: pdfText.text,
                                });
                            } catch (e) {
                                // console.log(e);
                                res({ ...pdf, error: e.message || e.code || true });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // If a previous attempt failed because the data item was too large,
        // push a trimmed record and mark the texts for manual processing.
        if (
            request.errorMessages.some((msg) =>
                msg.includes("Data item is too large")
            )
        ) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                Links,
                PDFs,
                InnerPDFs,
            });
        }
    } catch (error) {
        log.error(`An unexpected error occurred: ${error.message || error.code}`);
    }
});

module.exports = { router };
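
The core PDF step above (fetch the file, run it through pdf-parse, push the extracted text) can also be exercised outside the crawler. Below is a minimal standalone sketch using the same dependencies already listed in package.json (node-fetch and pdf-parse); the helper name and the URL are hypothetical placeholders, not part of the actor's source:

const pdfParse = require("pdf-parse");

// Illustrative helper, not part of the actor's source.
async function extractPdfText(pdfLink) {
    const { default: fetch } = await import("node-fetch"); // node-fetch v3 is ESM-only
    const response = await fetch(pdfLink);
    if (!response.ok) throw new Error(`HTTP ${response.status} for ${pdfLink}`);
    // pdf-parse expects a Node Buffer, so convert the fetched ArrayBuffer.
    const parsed = await pdfParse(Buffer.from(await response.arrayBuffer()));
    return { url: pdfLink, pdftext: parsed.text };
}

// Hypothetical usage with a placeholder URL.
extractPdfText("https://ncua.gov/files/example-letter.pdf")
    .then((record) => console.log(record.pdftext.slice(0, 200)))
    .catch((err) => console.error(err));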