R R I
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 2
Monthly users: 2
Runs succeeded: >99%
Last modified: a year ago
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
crawlee.json
{ "purgeOnStart": false}
Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
package.json
{ "name": "consumerfinance", "version": "0.0.1", "type": "commonjs", "description": "consumerfinance rules extractor", "dependencies": { "22": "^0.0.0", "apify": "^3.1.10", "crawlee": "^3.5.4", "fs": "^0.0.1-security", "node-fetch": "^3.3.2", "path": "^0.12.7", "pdf-parse": "^1.1.1", "playwright": "^1.43.1" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "@types/pdf-parse": "^1.1.4", "eslint": "^8.50.0", "@playwright/test": "^1.43.1" }, "scripts": { "start": "node src/main.js", "lint": "eslint ./src --ext .js,.jsx", "lint:fix": "eslint ./src --ext .js,.jsx --fix", "test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1", "postinstall": "npx crawlee install-playwright-browsers" }, "author": "Moazzam Malek"}
start.bat
start.sh
.actor/actor.json
{ "actorSpecification": 1, "name": "R-R-I", "title": "Project Playwright Crawler JavaScript", "description": "Crawlee and Playwright project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-playwright-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/input_schema.json
{ "title": "PlaywrightCrawler Template", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with.", "editor": "requestListSources", "prefill": [ { "url": "https://apify.com" } ] } }}
src/main.js
const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");

Actor.main(async () => {
    const { router } = require("./routes.js");
    const startUrls = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },

        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
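Note that .actor/input_schema.json defines a Start URLs input, while src/main.js hard-codes its single start URL and never reads the actor input. Below is a minimal sketch, assuming the default schema above (a startUrls array whose entries carry a url property), of wiring the input into the crawler instead; it is an illustration, not a file from the repository:

const { Actor } = require("apify");
const { PlaywrightCrawler } = require("crawlee");

Actor.main(async () => {
    const { router } = require("./routes.js");

    // Read the actor input; fall back to the NCUA rules page when run without one.
    const input = (await Actor.getInput()) ?? {};
    const startUrls = (input.startUrls ?? []).map((entry) => entry.url ?? entry);
    if (startUrls.length === 0) {
        startUrls.push("https://ncua.gov/regulation-supervision/rules-regulations");
    }

    const crawler = new PlaywrightCrawler({ requestHandler: router });
    await crawler.run(startUrls);
});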
src/routes.js
const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");

const router = createPlaywrightRouter();
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
        label: "detail5",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
        label: "detail6",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
        label: "detail7",
    });
});
router.addHandler("detail5", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Collect the page text plus every non-mailto, non-numeric link,
        // split into regular links and direct PDF links.
        const result = await page.evaluate(() => {
            const result = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                const numericValue = Number(obj.linkText);

                if (
                    isNaN(numericValue) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        result.PDFs.push(obj);
                    } else result.Links.push(obj);
                }
            }

            return result;
        });

        // Download and parse each linked PDF; failures are recorded per item
        // instead of failing the whole request.
        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(async (pdf) => {
                    try {
                        const pdfResponse = await page.request.fetch(pdf.link);
                        // Parse the PDF using pdf-parse
                        const pdfText = await pdfParse(await pdfResponse.body());
                        return { ...pdf, text: pdfText.text };
                    } catch (e) {
                        return { ...pdf, error: e.message || e.code || true };
                    }
                })
            )
        ).map((p) => p.value);

        // If an earlier attempt failed because the dataset item was too large,
        // push a trimmed record that asks for manual retrieval.
        if (
            request.errorMessages.some((msg) =>
                msg.includes("Data item is too large")
            )
        ) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
router.addHandler("detail6", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(() => {
            const result = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                const numericValue = Number(obj.linkText);

                if (
                    isNaN(numericValue) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        result.PDFs.push(obj);
                    } else result.Links.push(obj);
                }
            }

            return result;
        });

        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(async (pdf) => {
                    try {
                        const pdfResponse = await page.request.fetch(pdf.link);
                        // Parse the PDF using pdf-parse
                        const pdfText = await pdfParse(await pdfResponse.body());
                        return { ...pdf, text: pdfText.text };
                    } catch (e) {
                        return { ...pdf, error: e.message || e.code || true };
                    }
                })
            )
        ).map((p) => p.value);

        // If the request has large data errors, mark the data for manual processing
        if (
            request.errorMessages.some((msg) =>
                msg.includes("Data item is too large")
            )
        ) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                // Links,
                PDFs,
                // InnerPDFs
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
router.addHandler("detail7", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(() => {
            const result = {
                Category: "Rules and Regulations",
                Title:
                    document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                const numericValue = Number(obj.linkText);

                if (
                    isNaN(numericValue) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        result.PDFs.push(obj);
                    } else result.Links.push(obj);
                }
            }

            return result;
        });

        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(async (pdf) => {
                    try {
                        const pdfResponse = await page.request.fetch(pdf.link);
                        // Parse the PDF using pdf-parse
                        const pdfText = await pdfParse(await pdfResponse.body());
                        return { ...pdf, text: pdfText.text };
                    } catch (e) {
                        return { ...pdf, error: e.message || e.code || true };
                    }
                })
            )
        ).map((p) => p.value);

        if (
            request.errorMessages.some((msg) =>
                msg.includes("Data item is too large")
            )
        ) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                // Links,
                PDFs,
                // InnerPDFs
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
module.exports = { router };
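The three detail handlers in src/routes.js share the same scraping logic and differ only in their label. A sketch of how a single shared handler could be registered for all three labels; handleDetailPage is a hypothetical name standing in for the body the handlers currently duplicate:

const { createPlaywrightRouter } = require("crawlee");

const router = createPlaywrightRouter();

// Stand-in for the logic duplicated across detail5, detail6 and detail7:
// page.evaluate() to collect links, PDF download + pdf-parse, Dataset.pushData().
const handleDetailPage = async ({ request, page, log }) => {
    log.info(await page.title(), { url: request.loadedUrl });
};

for (const label of ["detail5", "detail6", "detail7"]) {
    router.addHandler(label, handleDetailPage);
}

module.exports = { router };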
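The "Data item is too large" check in the handlers reacts to a dataset push that already failed on a previous retry. A minimal sketch of trimming the item before the first push instead; the 9 MB budget is an assumption about the platform's dataset item limit, and trimLargeItem is a hypothetical helper:

// Replace link and PDF text with a placeholder when the serialized item exceeds the budget.
const MAX_ITEM_BYTES = 9 * 1024 * 1024; // assumed dataset item limit

function trimLargeItem(item) {
    if (Buffer.byteLength(JSON.stringify(item), "utf8") <= MAX_ITEM_BYTES) {
        return item;
    }
    const placeholder = "Please retrieve manually due to size limitations";
    return {
        ...item,
        PDFs: (item.PDFs || []).map((pdf) => ({ ...pdf, text: placeholder })),
        Links: (item.Links || []).map((link) => ({ ...link, text: placeholder })),
    };
}

// Usage inside a handler, in place of inspecting request.errorMessages:
// await Dataset.pushData(trimLargeItem({ url: request.url, ...result, PDFs }));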