R R I
Pricing
Pay per usage
Go to Apify Store
R R I
0.0 (0)
Pricing
Pay per usage
1
3
1
Last modified
a year ago
Pricing
Pay per usage
0.0 (0)
Pricing
Pay per usage
1
3
1
Last modified
a year ago
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules
# git folder.gitroot = true
[*]indent_style = spaceindent_size = 4charset = utf-8trim_trailing_whitespace = trueinsert_final_newline = trueend_of_line = lf{    "extends": "@apify",    "root": true}# This file tells Git which files shouldn't be added to source control
.DS_Store.ideadistnode_modulesapify_storagestorage
# Added by Apify CLI.venv{	"purgeOnStart": false}# Specify the base Docker image. You can read more about# the available images at https://crawlee.dev/docs/guides/docker-images# You can also use any other image from Docker Hub.FROM apify/actor-node-playwright-chrome:20
# Copy just package.json and package-lock.json# to speed up the build using Docker layer cache.COPY  package*.json ./
# Install NPM packages, skip optional and development dependencies to# keep the image small. Avoid logging too much and print the dependency# tree for debuggingRUN npm --quiet set progress=false \    && npm install --omit=dev --omit=optional \    && echo "Installed NPM packages:" \    && (npm list --omit=dev --all || true) \    && echo "Node.js version:" \    && node --version \    && echo "NPM version:" \    && npm --version
# Next, copy the remaining files and directories with the source code.# Since we do this after NPM install, quick build will be really fast# for most source file changes.COPY  . ./
# Run the image. If you know you won't need headful browsers,# you can remove the XVFB start script for a micro perf gain.CMD ./start_xvfb_and_run_cmd.sh && npm start --silent{	"name": "consumerfinance",	"version": "0.0.1",	"type": "commonjs",	"description": "consumerfinance rules extractor",	"dependencies": {		"22": "^0.0.0",		"apify": "^3.1.10",		"crawlee": "^3.5.4",		"fs": "^0.0.1-security",		"node-fetch": "^3.3.2",		"path": "^0.12.7",		"pdf-parse": "^1.1.1",		"playwright": "^1.43.1"	},	"devDependencies": {		"@apify/eslint-config": "^0.4.0",		"@types/pdf-parse": "^1.1.4",		"eslint": "^8.50.0",		"@playwright/test": "^1.43.1"	},	"scripts": {		"start": "node src/main.js",		"lint": "eslint ./src --ext .js,.jsx",		"lint:fix": "eslint ./src --ext .js,.jsx --fix",		"test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1",		"postinstall": "npx crawlee install-playwright-browsers"	},	"author": "Moazzam Malek"}{	"actorSpecification": 1,	"name": "R-R-I",	"title": "Project Playwright Crawler JavaScript",	"description": "Crawlee and Playwright project in JavaScript.",	"version": "0.0",	"meta": {		"templateId": "js-crawlee-playwright-chrome"	},	"input": "./input_schema.json",	"dockerfile": "./Dockerfile"}# Specify the base Docker image. You can read more about# the available images at https://crawlee.dev/docs/guides/docker-images# You can also use any other image from Docker Hub.FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json# to speed up the build using Docker layer cache.COPY  package*.json ./
# Install NPM packages, skip optional and development dependencies to# keep the image small. Avoid logging too much and print the dependency# tree for debuggingRUN npm --quiet set progress=false \    && npm install --omit=dev --omit=optional \    && echo "Installed NPM packages:" \    && (npm list --omit=dev --all || true) \    && echo "Node.js version:" \    && node --version \    && echo "NPM version:" \    && npm --version \    && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.# Since we do this after NPM install, quick build will be really fast# for most source file changes.COPY  . ./
# Run the image. If you know you won't need headful browsers,# you can remove the XVFB start script for a micro perf gain.CMD ./start_xvfb_and_run_cmd.sh && npm start --silent{    "title": "PlaywrightCrawler Template",    "type": "object",    "schemaVersion": 1,    "properties": {        "startUrls": {            "title": "Start URLs",            "type": "array",            "description": "URLs to start with.",            "editor": "requestListSources",            "prefill": [                {                    "url": "https://apify.com"                }            ]        }    }}1const { Actor } = require("apify");2const { PlaywrightCrawler, Dataset } = require("crawlee");3
/**
 * Actor entry point.
 *
 * Builds a PlaywrightCrawler around the router exported by ./routes.js,
 * crawls the NCUA rules-and-regulations landing page, and finally exports
 * the dataset twice (CSV and JSON) under the key "OUTPUT".
 */
Actor.main(async () => {
    // Required inside the callback, so routes.js is evaluated when this function runs.
    const { router: requestHandler } = require("./routes.js");

    const entryPoints = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // Proxy support is currently switched off. To enable it, create the
    // configuration and pass it to the crawler options:
    //   const proxyConfiguration = await Actor.createProxyConfiguration();
    const crawlerOptions = {
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    };

    const crawler = new PlaywrightCrawler(crawlerOptions);
    await crawler.run(entryPoints);

    // Export order is CSV first, then JSON.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});

// The statements below appear to be the head of the separate routes module
// (the file required above as "./routes.js") — TODO confirm file boundary.
const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
const pdfParse = require("pdf-parse");
const router = createPlaywrightRouter();
const { load } = require("cheerio");
const fs = require("fs");

/**
 * Default route: logs the page title, then enqueues the links that lead to
 * the three detail sections, tagging each group with the label of the
 * handler that should process it.
 *
 * @param {object} context - Crawling context supplied by the router
 *   (`request`, `page`, `enqueueLinks` and `log` are used).
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Each entry pairs a CSS selector on the current page with the router
    // label assigned to the links it matches.
    const detailTargets = [
        {
            selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
            label: "detail5",
        },
        {
            selector:
                ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
            label: "detail6",
        },
        {
            selector:
                ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
            label: "detail7",
        },
    ];

    // Enqueue one group at a time, in the listed order.
    for (const target of detailTargets) {
        await enqueueLinks(target);
    }
});

/** Substring that identifies the "dataset item exceeds the size limit" error. */
const DATA_ITEM_TOO_LARGE = "Data item is too large";

/** Placeholder stored in place of extracted text when a record must be shrunk. */
const MANUAL_RETRIEVAL_NOTE = "Please retrieve manually due to size limitations";

/**
 * Runs inside the browser page (passed to `page.evaluate`), so it must not
 * reference anything from the surrounding Node.js module scope.
 *
 * Collects the page title, the main text block and every link inside that
 * block. Links whose text is purely numeric, `mailto` links and links without
 * an href are skipped; the rest are split into PDFs (href ends with ".pdf")
 * and ordinary links.
 *
 * @returns {{Category: string, Title: string, MainParagraphText: string,
 *   Links: Array<{linkText: string, link: string}>,
 *   PDFs: Array<{linkText: string, link: string}>}}
 */
function scrapeDetailPage() {
    const record = {
        Category: "Rules and Regulations",
        Title: document.querySelector(".page-title")?.innerText || "N/A",
        MainParagraphText:
            document.querySelector(".field-type-text_with_summary")?.innerText ||
            "N/A",
        Links: [],
        PDFs: [],
    };

    const anchors = document.querySelectorAll(".field-type-text_with_summary a");
    for (const anchor of anchors) {
        const entry = {
            linkText: anchor.innerText || "N/A",
            link: anchor.href || "",
        };

        const hasNumericText = !Number.isNaN(Number(entry.linkText));
        if (hasNumericText || entry.link === "" || entry.link.includes("mailto")) {
            continue;
        }

        if (entry.link.endsWith(".pdf")) {
            record.PDFs.push(entry);
        } else {
            record.Links.push(entry);
        }
    }

    return record;
}

/**
 * Downloads one PDF through the page's request context and extracts its text.
 * Never rejects: a failure is reported on the returned object instead, so one
 * broken PDF does not discard the rest of the record.
 *
 * @param {object} page - Playwright page of the current request.
 * @param {{linkText: string, link: string}} pdf - Link entry pointing at a PDF.
 * @returns {Promise<object>} The entry plus either `text` or `error`.
 */
async function extractPdfText(page, pdf) {
    try {
        const response = await page.request.fetch(pdf.link);
        // Pass the Buffer itself rather than its `.buffer`: the backing
        // ArrayBuffer can be a shared allocation larger than the PDF bytes.
        const parsed = await pdfParse(await response.body());
        return { ...pdf, text: parsed.text };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * Returns a copy of the record in which every PDF text is replaced by a
 * "retrieve manually" note and every link carries the same note.
 *
 * @param {object} record - Full record as it would be pushed to the dataset.
 * @returns {object} Reduced record with the same key order.
 */
function shrinkRecord(record) {
    const withNote = (item) => ({ ...item, text: MANUAL_RETRIEVAL_NOTE });
    return {
        ...record,
        PDFs: record.PDFs.map(withNote),
        Links: record.Links.map(withNote),
    };
}

/**
 * Stores the record in the dataset. If the request already failed with a
 * "too large" error, or the push fails with that error now, the reduced
 * variant of the record is stored instead.
 *
 * @param {object} request - Current crawler request (reads `errorMessages`).
 * @param {object} record - Full record to store.
 * @returns {Promise<void>}
 * @throws Any push error that is not the size-limit error.
 */
async function pushDetailRecord(request, record) {
    // Substring match: `errorMessages` entries are full error texts, so an
    // exact array-element comparison would never hit.
    const failedOnSizeBefore = (request.errorMessages ?? []).some((message) =>
        String(message).includes(DATA_ITEM_TOO_LARGE)
    );
    if (failedOnSizeBefore) {
        await Dataset.pushData(shrinkRecord(record));
        return;
    }

    try {
        await Dataset.pushData(record);
    } catch (err) {
        if (!String(err?.message).includes(DATA_ITEM_TOO_LARGE)) {
            throw err;
        }
        // The handler logs and swallows errors, so no retry would ever see
        // this failure; fall back to the reduced record right away.
        await Dataset.pushData(shrinkRecord(record));
    }
}

/**
 * Shared handler for the "detail5", "detail6" and "detail7" routes: scrapes
 * the page, extracts the text of every linked PDF and stores one dataset
 * record keyed by `url`. Errors are logged rather than rethrown, so a failing
 * page is not retried.
 *
 * @param {object} context - Crawling context (`request`, `page`, `log`).
 * @returns {Promise<void>}
 */
async function handleDetailPage({ request, page, log }) {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(scrapeDetailPage);

        // extractPdfText never rejects, so Promise.all cannot short-circuit.
        const PDFs = await Promise.all(
            result.PDFs.map((pdf) => extractPdfText(page, pdf))
        );

        await pushDetailRecord(request, { url: request.url, ...result, PDFs });
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
}

router.addHandler("detail5", handleDetailPage);
router.addHandler("detail6", handleDetailPage);
router.addHandler("detail7", handleDetailPage);

338module.exports = { router };