R R I

Developed by Yash Agarwal
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 2
Monthly users: 2
Runs succeeded: >99%
Last modified: a year ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

crawlee.json

{
    "purgeOnStart": false
}
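
Setting "purgeOnStart" to false tells Crawlee to keep the request queue, dataset, and key-value store from previous runs instead of wiping them on startup, which lets an interrupted crawl resume. A minimal sketch of the equivalent runtime setting, using Crawlee's global Configuration object (illustrative, not part of the project's code):

const { Configuration } = require("crawlee");

// Equivalent of crawlee.json's { "purgeOnStart": false }: keep the
// existing request queue, dataset, and key-value store across runs.
Configuration.getGlobalConfig().set("purgeOnStart", false);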

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, rebuilds will be fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

package.json

{
    "name": "consumerfinance",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumerfinance rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "node-fetch": "^3.3.2",
        "pdf-parse": "^1.1.1",
        "playwright": "^1.43.1"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@playwright/test": "^1.43.1",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Moazzam Malek"
}

start.bat

start.sh

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "R-R-I",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, rebuilds will be fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
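
The schema prefills https://apify.com, but src/main.js ignores the Actor input and hardcodes the NCUA start URL. A minimal sketch of honoring the schema instead, using the standard Actor.getInput() call (the fallback URL comes from src/main.js; the rest of the crawler setup is unchanged):

const { Actor } = require("apify");

Actor.main(async () => {
    // Prefer startUrls from the Actor input defined by this schema;
    // fall back to the URL hardcoded in src/main.js.
    const input = (await Actor.getInput()) ?? {};
    const startUrls = input.startUrls?.length
        ? input.startUrls
        : ["https://ncua.gov/regulation-supervision/rules-regulations"];
    // ...build the PlaywrightCrawler as in src/main.js, then:
    // await crawler.run(startUrls);
});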

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        // Generous timeouts: the detail handlers download and parse PDFs.
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Export the whole default dataset into the default key-value store.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
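
The two export calls at the end serialize the whole default dataset into the default key-value store under the key "OUTPUT". A minimal sketch of reading an export back after a run (illustrative only; not part of the project):

const { Actor } = require("apify");
const { KeyValueStore } = require("crawlee");

Actor.main(async () => {
    // Fetch the record that src/main.js exported under the key "OUTPUT".
    const output = await KeyValueStore.getValue("OUTPUT");
    console.log(output);
});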

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
        label: "detail5",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
        label: "detail6",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
        label: "detail7",
    });
});

// The three detail pages share the same markup, so a single handler
// serves all three labels.
const detailHandler = async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Scrape the page title, the main text, and every non-numeric,
        // non-mailto link, split into PDF links and regular links.
        const result = await page.evaluate(() => {
            const data = {
                Category: "Rules and Regulations",
                Title: document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Skip purely numeric link texts (pagination) and mailto links.
                if (
                    Number.isNaN(Number(obj.linkText)) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        data.PDFs.push(obj);
                    } else data.Links.push(obj);
                }
            }

            return data;
        });

        // Download and parse each linked PDF. Failures are recorded on the
        // item (as an `error` field) instead of failing the whole request.
        const PDFs = await Promise.all(
            result.PDFs.map(async (pdf) => {
                try {
                    const pdfResponse = await page.request.fetch(pdf.link);
                    // Parse the PDF buffer using pdf-parse.
                    const pdfText = await pdfParse(await pdfResponse.body());
                    return { ...pdf, text: pdfText.text };
                } catch (e) {
                    return { ...pdf, error: e.message || e.code || true };
                }
            })
        );

        // If a previous attempt failed because the dataset item was too
        // large, push a stripped-down record without the extracted text.
        const tooLarge = request.errorMessages.some((msg) =>
            msg.includes("Data item is too large")
        );
        if (tooLarge) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
};

router.addHandler("detail5", detailHandler);
router.addHandler("detail6", detailHandler);
router.addHandler("detail7", detailHandler);

module.exports = { router };