Letters To Credit Unions And Other Guidance
DeprecatedView all Actors![Letters To Credit Unions And Other Guidance](https://apify.com/img/store/actor_picture.svg)
This Actor is deprecated
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative ActorsLetters To Credit Unions And Other Guidance
nondescript_cord/letters-to-credit-unions-and-other-guidance
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv
crawlee.json
1{
2 "purgeOnStart": false
3}
package.json
1{
2 "name": "consumerfinance",
3 "version": "0.0.1",
4 "type": "commonjs",
5 "description": "consumerfinance rules extractor",
6 "dependencies": {
8 "apify": "^3.1.10",
9 "crawlee": "^3.5.4",
10 "node-fetch": "^3.3.2",
11 "pdf-parse": "^1.1.1",
12 "playwright": "*"
13 },
14 "devDependencies": {
15 "@apify/eslint-config": "^0.4.0",
16 "@types/pdf-parse": "^1.1.4",
17 "eslint": "^8.50.0"
18 },
19 "scripts": {
20 "start": "node src/main.js",
21 "lint": "eslint ./src --ext .js,.jsx",
22 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
23 "test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
24 "postinstall": "npx crawlee install-playwright-browsers"
25 },
26 "author": "Moazzam Malek"
27}
start.bat
Downloadstart.sh
Download.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "Letters-to-Credit-Unions-and-Other-Guidance",
4 "title": "Project Playwright Crawler JavaScript",
5 "description": "Crawlee and Playwright project in JavaScript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-playwright-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/input_schema.json
1{
2 "title": "PlaywrightCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with.",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://apify.com"
14 }
15 ]
16 }
17 }
18}
src/main.js
const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
// Route handlers live in routes.js; require at module top per convention.
const { router } = require("./routes.js");

// Entry point: crawls the NCUA "Letters to Credit Unions and Other Guidance"
// index, follows pagination and detail pages (handlers in routes.js), then
// exports the collected default dataset as CSV and JSON under key "OUTPUT".
Actor.main(async () => {
    const startUrls = [
        "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            // NOTE(review): `javaScriptEnabled` is a Playwright *browser
            // context* option, not a browserType.launch() option, so it is
            // most likely ignored here — confirm against Crawlee's
            // PlaywrightLauncher docs before relying on JS being disabled.
            launchOptions: { javaScriptEnabled: false },
        },

        // Target pages are slow and PDF-heavy; allow generous retries/timeouts.
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);
    // Export the default dataset in both formats to the key-value store.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
src/routes.js
1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5// const fetch = require("node-fetch");
6// const { default: fetch } = await import("node-fetch");
7const fs = require("fs");
8const { promisify } = require("util");
9
// Handles listing pages: logs the page, enqueues the "next page" control and
// every detail link in the results table. Links pointing directly at PDFs are
// not enqueued as browser requests; they are fetched and parsed inline.
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Pagination: follow the "next" control.
    await enqueueLinks({
        selector: "#edit-next-container a",
    });

    // Track in-flight PDF downloads so the handler doesn't return (and the
    // crawler doesn't shut down) before they complete.
    const pdfJobs = [];

    await enqueueLinks({
        selector: "tbody td a",
        label: "detail",
        transformRequestFunction(request) {
            if (request.url.endsWith(".pdf")) {
                // PDFs can't be handled by the Playwright route handlers —
                // download and parse them directly instead of enqueueing.
                log.info(`PDF ${request.url}`);
                pdfJobs.push(fetchPDF(request.url));
                return false;
            }
            return request;
        },
    });

    // fetchPDF never rejects, but allSettled keeps this robust regardless.
    await Promise.allSettled(pdfJobs);

    /**
     * Downloads a PDF, extracts its text with pdf-parse, and pushes either
     * the parsed text or an error record to the default dataset.
     * Never throws — failures are recorded as dataset items.
     * @param {string} pdfLink - Absolute URL of the PDF.
     */
    async function fetchPDF(pdfLink) {
        // node-fetch v3 is ESM-only, hence the dynamic import from CommonJS.
        const { default: fetch } = await import("node-fetch");
        try {
            const response = await fetch(pdfLink);
            // pdf-parse expects a Node Buffer, not a raw ArrayBuffer.
            const buffer = Buffer.from(await response.arrayBuffer());
            const pdfText = await pdfParse(buffer);
            await Dataset.pushData({
                url: pdfLink,
                pdftext: pdfText.text,
            });
        } catch (error) {
            await Dataset.pushData({
                link: pdfLink,
                error: `Error fetching or parsing PDF from ${pdfLink} and ${error}`,
            });
        }
    }
});
85
// Handles detail pages: scrapes metadata in the browser context, then (via
// HTTP requests from the page context) follows non-PDF links one level deep,
// parses every discovered PDF with pdf-parse, and pushes one dataset item per
// detail page.
router.addHandler("detail", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        const url = request.loadedUrl;
        log.info(`${title}`, { url: request.loadedUrl });
        // Runs inside the page: collect metadata fields (defaulting to "N/A"
        // when a selector matches nothing) and partition body links into
        // PDFs vs. ordinary links.
        const result = await page.evaluate(() => {
            const result = {
                Docket:
                    document.querySelector("span[class*=docket]")?.innerText ||
                    "N/A",
                Date:
                    document.querySelector("span[class*=date]")?.innerText ||
                    "N/A",
                Category:
                    document.querySelector(
                        "a[href*=letters-credit-unions-other-guidance]"
                    )?.innerText || "N/A",
                Title:
                    document.querySelector(".pseudo-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".row.no-gutters .body")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".row.no-gutters .body a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Links whose visible text is purely numeric (e.g. footnote
                // markers) are skipped, as are mailto: and empty hrefs.
                const numericValue = Number(obj.linkText);

                if (
                    isNaN(numericValue) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if(obj.link.endsWith('.pdf')){
                        result.PDFs.push(obj);
                    }
                    else result.Links.push(obj);
                }
            }

            return result;
        });

        // Fetch each non-PDF link with the page's request context, parse the
        // HTML with cheerio, and collect its inner links / PDF links / text.
        // NOTE(review): `innerLinks`, `PDFLinks` and `innerText` below are
        // assigned without declaration — they become implicit globals shared
        // across iterations. When `link.link` contains ".pdf" the `if` body is
        // skipped and the res(...) call reuses stale values from a previous
        // iteration (or `undefined` on the first). Verify this is intended.
        const Links = (
            await Promise.allSettled(
                result.Links.map(
                    (link) =>
                        new Promise(async (res, rej) => {
                            try {
                                // let innerTitle;
                                if (!link.link.includes(".pdf")) {
                                    const FederalRegisterResponse =
                                        await page.request.fetch(link.link);
                                    const $ = load(
                                        await FederalRegisterResponse.text()
                                    );
                                    const contentDiv = $(".layout-content");
                                    innerLinks = contentDiv
                                        .find("a")
                                        .map((i, el) => $(el).attr("href"))
                                        .get();
                                    // Make relative hrefs absolute.
                                    innerLinks = innerLinks.map((innerLink) => {
                                        if (!innerLink.startsWith("http")) {
                                            return (
                                                "https://ncua.gov" + innerLink
                                            );
                                        }
                                        return innerLink;
                                    });
                                    // Deduplicate before splitting out PDFs.
                                    innerLinks = Array.from(
                                        new Set(innerLinks)
                                    );
                                    PDFLinks = innerLinks.filter((link) =>
                                        link.endsWith(".pdf")
                                    );
                                    // Drop PDFs, e-mail links and footnote
                                    // anchors from the remaining link list.
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.endsWith(".pdf")
                                    );
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.endsWith("@ncua.gov")
                                    );
                                    innerLinks = innerLinks.filter(
                                        (link) => !link.includes("#ftn")
                                    );
                                    innerText = $("p").text();
                                }
                                res({
                                    ...link,
                                    innerText,
                                    innerLinks,
                                    PDFLinks,
                                });
                            } catch (e) {
                                // console.log(e);
                                // Promise is resolved (not rejected) so one
                                // bad link never fails the whole batch.
                                res({
                                    ...link,
                                    // error: e.message || e.code || true,
                                    error: "404 page not found",
                                });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // For every followed link, download and parse each PDF found on it;
        // per-PDF failures are recorded inline instead of aborting the item.
        const InnerPDFs = (
            await Promise.allSettled(
                Links.map(
                    (pdf) =>
                        new Promise(async (res, rej) => {
                            try {
                                const pdfDataArray = [];

                                // Loop through all the PDF links in the `PDFLinks` array
                                for (const pdfLink of pdf.PDFLinks) {
                                    try {
                                        // Fetch the PDF content from the current link
                                        const link = pdfLink;
                                        const pdfResponse =
                                            await page.request.fetch(pdfLink);

                                        // Parse the fetched PDF using pdf-parse
                                        const pdfText = await pdfParse(
                                            (
                                                await pdfResponse.body()
                                            ).buffer
                                        );

                                        // Store the parsed information for this PDF
                                        pdfDataArray.push({
                                            link,
                                            text: pdfText.text,
                                        });
                                    } catch (innerError) {
                                        pdfDataArray.push({
                                            link: pdfLink,
                                            error:
                                                innerError.message ||
                                                innerError.code ||
                                                true,
                                        });
                                    }
                                }

                                res({
                                    ...pdf,
                                    pdfDataArray,
                                });
                            } catch (e) {
                                // console.log(e);
                                res({
                                    ...pdf,
                                    error: e.message || e.code || true,
                                });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // Download and parse the PDFs that were linked directly on the
        // detail page itself.
        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(
                    (pdf) =>
                        new Promise(async (res, rej) => {
                            try {
                                const pdfResponse = await page.request.fetch(
                                    pdf.link
                                );

                                // Parse the PDF using pdf-parse
                                const pdfText = await pdfParse(
                                    (
                                        await pdfResponse.body()
                                    ).buffer
                                );

                                res({
                                    ...pdf,
                                    text: pdfText.text,
                                });
                            } catch (e) {
                                // console.log(e);
                                res({ ...pdf, error: e.message || e.code || true });
                            }
                        })
                )
            )
        ).map((p) => p.value);

        // If the request has large data errors, mark the data for manual processing
        // NOTE(review): Array.prototype.includes is an exact-match test; if
        // the recorded error messages contain more text than this literal,
        // this branch never triggers — confirm the exact message format.
        if (request.errorMessages.includes("Data item is too large")) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                Links,
                PDFs,
                InnerPDFs
            });
        }
    } catch (error) {
        // Last-resort guard: log and drop the item rather than retrying.
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
module.exports = { router };
Developer
Maintained by Community
Categories