Legal Opinions

Deprecated

Developed by Dhaval Rupapara

Maintained by Community

Rating: 0.0 (0 reviews)

Pricing: Pay per usage

Monthly users: 2

Last modified: a year ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
	"actorSpecification": 1,
	"name": "Legal Opinions",
	"title": "Project Playwright Crawler JavaScript",
	"description": "Crawlee and Playwright project in JavaScript.",
	"version": "0.0",
	"meta": {
		"templateId": "js-crawlee-playwright-chrome"
	},
	"input": "./input_schema.json",
	"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
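The schema prefills a single start URL, but note that src/main.js below ignores the Actor input entirely and hardcodes its own start URLs. For reference, a minimal sketch of what honoring the schema would look like instead (hypothetical, not part of this Actor's code):

const { Actor } = require("apify");

Actor.main(async () => {
    // The input is validated against .actor/input_schema.json before the run starts.
    const { startUrls = [] } = (await Actor.getInput()) ?? {};
    // requestListSources entries are objects such as { url: "https://apify.com" }.
    const urls = startUrls.map((source) => source.url);
    console.log("Start URLs from input:", urls);
});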

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Export the default dataset to the key-value store as CSV and JSON.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
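The proxy lines above are commented out, so all requests go to consumerfinance.gov directly. If the site started rate-limiting, they could be re-enabled. A minimal sketch, with the proxy group name being an assumption (substitute whichever group your Apify plan includes):

const { Actor } = require("apify");
const { PlaywrightCrawler } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    // Assumption: the account has access to the RESIDENTIAL proxy group.
    const proxyConfiguration = await Actor.createProxyConfiguration({
        groups: ["RESIDENTIAL"],
    });
    const crawler = new PlaywrightCrawler({
        proxyConfiguration,
        requestHandler: router,
    });
    await crawler.run([
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
    ]);
});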

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    // Follow the pagination buttons on the listing pages.
    await enqueueLinks({
        selector: ".a-btn[href*=page]",
    });
    // Open each rule's detail page with the "detail" handler.
    await enqueueLinks({
        selector: "article h3 a",
        label: "detail",
    });
});

router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Scrape the page in the browser context: basic fields plus every link
    // under the main text, split into PDFs and ordinary links.
    const result = await page.evaluate(() => {
        const data = {
            Category: document.querySelector("a[class*=m-breadcrumbs]")
                .innerText,
            Title: document.querySelector("h1").innerText,
            MainParagraphText: document.querySelector(
                ".m-full-width-text p:first-of-type"
            ).innerText,
            Text: document.querySelector(".m-full-width-text").innerText,
            PDFs: [],
            Links: [],
        };
        // H4/H5 headings define the category for the links that follow them.
        let category;
        for (const el of Array.from(
            document.querySelectorAll(".m-full-width-text > *")
        )) {
            if (el.tagName === "H5" || el.tagName === "H4")
                category = el.getAttribute("id") || el.innerText;
            if (!category) continue;
            const link = el.querySelector("a");
            if (link) {
                const obj = {
                    linkText: link.innerText,
                    link: link.href,
                    category,
                };
                if (link.href.includes(".pdf")) data.PDFs.push(obj);
                else data.Links.push(obj);
            }
        }
        return data;
    });

    // Download and parse every linked PDF. Errors are captured per item
    // instead of failing the whole request.
    const PDFs = await Promise.all(
        result.PDFs.map(async (pdf) => {
            try {
                const pdfResponse = await page.request.fetch(pdf.link);
                // pdf-parse accepts the Node.js Buffer returned by body().
                const pdfText = await pdfParse(await pdfResponse.body());
                return {
                    ...pdf,
                    text: pdfText.text,
                    info: pdfText.info,
                    metadata:
                        pdfText.metadata?._metadata || pdfText.metadata,
                    error: null,
                };
            } catch (e) {
                return { ...pdf, error: e.message || e.code || true };
            }
        })
    );

    // For "Read it in the Federal Register" links, fetch the page and
    // extract the full rule text with Cheerio.
    const Links = await Promise.all(
        result.Links.map(async (link) => {
            try {
                let text;
                if (
                    link.linkText?.includes("Read it") &&
                    link.linkText?.includes("the Federal Register")
                ) {
                    const federalRegisterResponse = await page.request.fetch(
                        link.link
                    );
                    const $ = load(await federalRegisterResponse.text());
                    text = $("#fulltext_content_area").text();
                }
                return { ...link, text };
            } catch (e) {
                return { ...link, error: e.message || e.code || true };
            }
        })
    );

    // If a previous attempt failed because the dataset item exceeded the
    // size limit, store a slimmed-down record without the extracted texts
    // and stop, so the oversized record is not pushed again.
    if (
        request.errorMessages.some((message) =>
            message.includes("Data item is too large")
        )
    ) {
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map((item) => ({
                ...item,
                text: "Please get Manually",
            })),
            Links: Links.map((item) => ({
                ...item,
                text: "Please get Manually",
            })),
        });
        return;
    }
    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});

module.exports = { router };
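Once a run finishes, the files written by Dataset.exportToCSV and Dataset.exportToJSON in src/main.js sit in the run's default key-value store under the OUTPUT key (since both exports target the same key, the JSON export most likely overwrites the CSV one). A minimal sketch of fetching the record with apify-client; the store ID placeholder and token handling are assumptions:

const { ApifyClient } = require("apify-client");

(async () => {
    // Assumptions: APIFY_TOKEN is set in the environment and "<store-id>"
    // stands in for the run's default key-value store ID.
    const client = new ApifyClient({ token: process.env.APIFY_TOKEN });
    const record = await client.keyValueStore("<store-id>").getRecord("OUTPUT");
    console.log(record?.value);
})();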

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

package.json

{
	"name": "consumer",
	"version": "0.0.1",
	"type": "commonjs",
	"description": "consumer rules extractor",
	"dependencies": {
		"apify": "^3.1.10",
		"crawlee": "^3.5.4",
		"pdf-parse": "^1.1.1",
		"playwright": "*"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.4.0",
		"@types/pdf-parse": "^1.1.4",
		"eslint": "^8.50.0"
	},
	"scripts": {
		"start": "node src/main.js",
		"lint": "eslint ./src --ext .js,.jsx",
		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
		"postinstall": "npx crawlee install-playwright-browsers"
	},
	"author": "Dhaval Rupapara"
}

start.bat

start.sh

Pricing

Pricing model: Pay per usage

This Actor is free to use; you pay only for the Apify platform resources its runs consume.