R R I
nondescript_cord/r-r-i
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
crawlee.json
{
    "purgeOnStart": false
}
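With purgeOnStart set to false, Crawlee keeps the request queue, dataset and key-value store from previous local runs instead of wiping them on startup, so an interrupted crawl can pick up where it left off.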
Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
package.json
{
    "name": "consumerfinance",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumerfinance rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "node-fetch": "^3.3.2",
        "pdf-parse": "^1.1.1",
        "playwright": "^1.43.1"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@playwright/test": "^1.43.1",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Moazzam Malek"
}
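Note the postinstall hook: npx crawlee install-playwright-browsers downloads the browser binaries Playwright needs, so after a plain npm install the project can be started directly with npm start.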
start.bat
start.sh
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "R-R-I",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
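The dockerfile and input paths here resolve relative to the .actor/ directory, so the platform build uses .actor/Dockerfile below (Node 18 base image); the root-level Dockerfile (Node 20) appears to be an unused leftover from the template.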
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/input_schema.json
{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
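The schema declares a startUrls input, but src/main.js below hardcodes the NCUA start page and never reads the actor input. A minimal sketch of honoring the schema (assuming the requestListSources shape of { url: ... } objects, with the hardcoded URL as a fallback) could look like:

const { Actor } = require("apify");

Actor.main(async () => {
    // Read the input defined by .actor/input_schema.json.
    const input = (await Actor.getInput()) ?? {};

    // requestListSources entries are objects like { url: "https://..." }.
    const startUrls = (input.startUrls ?? []).map((source) => source.url);
    if (startUrls.length === 0) {
        startUrls.push("https://ncua.gov/regulation-supervision/rules-regulations");
    }

    // ...then pass startUrls to crawler.run(startUrls) as in src/main.js below.
});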
src/main.js
const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            // Crawlee launches a persistent browser context by default, so
            // context options such as javaScriptEnabled can ride along here;
            // the target pages are static, so disabling JS speeds things up.
            launchOptions: { javaScriptEnabled: false },
        },

        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Export the collected dataset to the default key-value store.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
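Dataset.exportToCSV("OUTPUT") and Dataset.exportToJSON("OUTPUT") serialize the entire default dataset into the default key-value store under the OUTPUT key (locally, under storage/key_value_stores/default/). A minimal sketch of reading the JSON export back from another script, assuming the default local storage:

const { KeyValueStore } = require("crawlee");

(async () => {
    const store = await KeyValueStore.open();
    // Reads the export record created by src/main.js.
    const records = await store.getValue("OUTPUT");
    console.log(`Exported ${Array.isArray(records) ? records.length : 0} records`);
})();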
src/routes.js
const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
        label: "detail5",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
        label: "detail6",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
        label: "detail7",
    });
});

// The three detail pages share the same structure, so one handler serves
// the "detail5", "detail6" and "detail7" labels.
const detailHandler = async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Collect the page title, the main body text and every link in the
        // body, splitting PDF links from regular ones.
        const result = await page.evaluate(() => {
            const data = {
                Category: "Rules and Regulations",
                Title: document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Skip footnote-style numeric links, mailto links and empty hrefs.
                if (
                    Number.isNaN(Number(obj.linkText)) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        data.PDFs.push(obj);
                    } else {
                        data.Links.push(obj);
                    }
                }
            }

            return data;
        });

        // Download and parse every PDF. Each item catches its own errors, so
        // one broken PDF doesn't fail the whole request.
        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(async (pdf) => {
                    try {
                        const pdfResponse = await page.request.fetch(pdf.link);
                        // pdf-parse accepts the response body Buffer directly.
                        const pdfText = await pdfParse(await pdfResponse.body());
                        return { ...pdf, text: pdfText.text };
                    } catch (e) {
                        return { ...pdf, error: e.message || e.code || true };
                    }
                })
            )
        ).map((p) => p.value);

        // If a previous attempt failed because the dataset item exceeded the
        // size limit, push a stripped-down record instead.
        const tooLarge = request.errorMessages.some((msg) =>
            msg.includes("Data item is too large")
        );
        if (tooLarge) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(`An unexpected error occurred: ${error.message || error.code}`);
    }
};

router.addHandler("detail5", detailHandler);
router.addHandler("detail6", detailHandler);
router.addHandler("detail7", detailHandler);

module.exports = { router };
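For reference, pdf-parse takes a Buffer and resolves with an object whose text field holds the extracted plain text, alongside metadata such as numpages and info. A standalone sketch (sample.pdf is a hypothetical file used only for illustration):

const fs = require("fs");
const pdfParse = require("pdf-parse");

// Hypothetical local file, for illustration only.
const buffer = fs.readFileSync("./sample.pdf");

pdfParse(buffer).then((data) => {
    // data.text: extracted text; data.numpages: page count.
    console.log(data.numpages, data.text.slice(0, 200));
});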
Developer
Maintained by Community

Actor Metrics
2 monthly users
1 star
>99% runs succeeded
Created in May 2024
Modified 8 months ago