Final Consumerfinance avatar
Final Consumerfinance
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Final Consumerfinance

Final Consumerfinance

dhaval079/final-consumerfinance

.actor/Dockerfile

# Base image for the Actor. Other available images are listed at
# https://crawlee.dev/docs/guides/docker-images, and any image from
# Docker Hub can be used instead.
FROM apify/actor-node-playwright-chrome:18

# Copy only the package manifests first so that the dependency layer
# below can be reused from the Docker cache when only sources change.
COPY --chown=myuser package*.json ./

# Install NPM packages without development and optional dependencies to
# keep the image small. Progress output is disabled to reduce log noise,
# and the dependency tree plus tool versions are printed for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the rest of the project. Because this happens after the NPM
# install, rebuilds after a source-only change skip the install step.
COPY --chown=myuser . ./


# Start the Actor. The XVFB start script is only needed for headful
# browsers and can be removed for a small performance gain otherwise.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "final-consumerfinance",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");

/**
 * Actor entry point.
 *
 * Crawls the consumerfinance.gov rule listings with a PlaywrightCrawler
 * (request handling is delegated to the router from ./routes.js) and, once
 * the crawl has finished, exports the default dataset as CSV and as JSON.
 */
Actor.main(async () => {
    const { router } = require("./routes.js");

    // Listing pages the crawl starts from.
    const startUrls = [
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/",
    ];

    // No proxy configuration is passed, so the crawler runs without one.
    const crawler = new PlaywrightCrawler({
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Each export is stored as a key-value store record under the given key.
    // Writing both formats to "OUTPUT" made the JSON export replace the CSV
    // one, so the CSV now gets its own key; "OUTPUT" keeps holding the JSON
    // export, which is what was left there before.
    await Dataset.exportToCSV("OUTPUT_CSV");
    await Dataset.exportToJSON("OUTPUT");
});

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");

const router = createPlaywrightRouter();

// Substring looked for in earlier error messages of a request to detect
// that a previous attempt failed because the dataset item was too large.
const TOO_LARGE_MARKER = "Data item is too large";

// Placeholder stored instead of the extracted text when the full item
// could not be saved.
const MANUAL_TEXT_PLACEHOLDER = "Please get Manually";

/**
 * Turns a caught error into the value stored in an item's `error` field.
 *
 * @param {any} e - The caught error.
 * @returns {string|true} The error message, else its code, else `true`.
 */
function describeError(e) {
    return e.message || e.code || true;
}

/**
 * Runs inside the browser page (passed to `page.evaluate`), so it must not
 * reference anything from the surrounding Node.js scope.
 *
 * Reads the page's category, title, first paragraph and full text, and
 * collects the first link of every direct child of `.m-full-width-text`
 * that follows an H4/H5 heading. Links whose URL contains ".pdf" go to
 * `PDFs`, all others to `Links`; each entry is tagged with the heading's
 * `id` (or its text when it has no id).
 *
 * @returns {object} The extracted fields plus the `PDFs` and `Links` arrays.
 */
function scrapeDetailPage() {
    const result = {
        Category: document.querySelector("a[class*=m-breadcrumbs]").innerText,
        Title: document.querySelector("h1").innerText,
        MainParagraphText: document.querySelector(
            ".m-full-width-text p:first-of-type"
        ).innerText,
        Text: document.querySelector(".m-full-width-text").innerText,
        PDFs: [],
        Links: [],
    };

    let category;
    for (const el of Array.from(
        document.querySelectorAll(".m-full-width-text >*")
    )) {
        if (el.tagName === "H5" || el.tagName === "H4") {
            category = el.getAttribute("id") || el.innerText;
        }
        // Elements that appear before the first heading are ignored.
        if (!category) continue;

        const link = el.querySelector("a");
        if (!link) continue;

        const entry = {
            linkText: link.innerText,
            link: link.href,
            category,
        };
        if (link.href.includes(".pdf")) result.PDFs.push(entry);
        else result.Links.push(entry);
    }

    return result;
}

/**
 * Downloads one PDF through the page's request context and parses it.
 * Never rejects: a failure is reported in the returned item's `error` field.
 *
 * @param {object} page - Playwright page whose `request` context is used.
 * @param {object} pdf - Entry produced by `scrapeDetailPage` (`link`, ...).
 * @returns {Promise<object>} The entry extended with `text`, `info`,
 *     `metadata` and `error: null`, or with only `error` on failure.
 */
async function fetchPdfDetails(page, pdf) {
    try {
        const pdfResponse = await page.request.fetch(pdf.link);
        // body() resolves to a Buffer; hand it to pdf-parse as is. Its
        // `.buffer` is the backing ArrayBuffer, which is not guaranteed to
        // start at, or be limited to, this response's bytes.
        const parsed = await pdfParse(await pdfResponse.body());
        return {
            ...pdf,
            text: parsed.text,
            info: parsed.info,
            metadata: parsed.metadata?._metadata || parsed.metadata,
            error: null,
        };
    } catch (e) {
        return { ...pdf, error: describeError(e) };
    }
}

/**
 * Fetches the full text behind a link, but only for links whose text
 * contains both "Read it" and "the Federal Register"; for every other link
 * `text` stays undefined. Never rejects: a failure is reported in `error`.
 *
 * @param {object} page - Playwright page whose `request` context is used.
 * @param {object} link - Entry produced by `scrapeDetailPage`.
 * @returns {Promise<object>} The entry extended with `text`, or with
 *     `error` on failure.
 */
async function fetchLinkText(page, link) {
    try {
        let text;
        const isFederalRegisterLink =
            link.linkText?.includes("Read it") &&
            link.linkText?.includes("the Federal Register");
        if (isFederalRegisterLink) {
            const response = await page.request.fetch(link.link);
            const $ = load(await response.text());
            text = $("#fulltext_content_area").text();
        }
        return { ...link, text };
    } catch (e) {
        return { ...link, error: describeError(e) };
    }
}

// Listing pages: follow pagination links and enqueue every article link
// for the "detail" handler.
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    await enqueueLinks({
        selector: ".a-btn[href*=page]",
    });
    await enqueueLinks({
        selector: "article h3 a",
        label: "detail",
    });
});

// Detail pages: scrape the page, enrich its PDFs and links with fetched
// text, and store one dataset item per page.
router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    const result = await page.evaluate(scrapeDetailPage);

    // The helpers never reject, so Promise.all cannot fail fast here.
    const PDFs = await Promise.all(
        result.PDFs.map((pdf) => fetchPdfDetails(page, pdf))
    );
    const Links = await Promise.all(
        result.Links.map((link) => fetchLinkText(page, link))
    );

    // errorMessages is an array of full error texts, so look for the marker
    // inside each entry rather than for an entry equal to the marker.
    const failedAsTooLarge = request.errorMessages.some((message) =>
        String(message).includes(TOO_LARGE_MARKER)
    );
    if (failedAsTooLarge) {
        const withoutText = (item) => ({
            ...item,
            text: MANUAL_TEXT_PLACEHOLDER,
        });
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map(withoutText),
            Links: Links.map(withoutText),
        });
        // Stop here: pushing the full item again would only repeat the
        // failure and leave a duplicate trimmed item behind on each retry.
        return;
    }

    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});

module.exports = { router };

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

crawlee.json

1{
2    "purgeOnStart":false
3}

package.json

1{
2	"name": "consumer",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumer rules extractor",
6	"dependencies": {
7		"22": "^0.0.0",
8		"apify": "^3.1.10",
9		"crawlee": "^3.5.4",
10		"pdf-parse": "^1.1.1",
11		"playwright": "*"
12	},
13	"devDependencies": {
14		"@apify/eslint-config": "^0.4.0",
15		"@types/pdf-parse": "^1.1.4",
16		"eslint": "^8.50.0"
17	},
18	"scripts": {
19		"start": "node src/main.js",
20		"lint": "eslint ./src --ext .js,.jsx",
21		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
22		"test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1",
23		"postinstall": "npx crawlee install-playwright-browsers"
24	},
25	"author": "Dhaval Rupapara"
26}

start.bat

Download

start.sh

Download
Developer
Maintained by Community