Legal Opinions (Deprecated)
Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 1
Last modified: a year ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "legal-opinions",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
src/main.js
const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
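Note that main.js hardcodes its start URLs and never reads the startUrls field defined in .actor/input_schema.json. A minimal sketch of wiring the two together, assuming the default requestListSources shape of [{ url: "..." }] (the fallback logic is illustrative, not part of the project):

// Sketch (inside Actor.main): read startUrls from the Actor input
// instead of hardcoding them.
const input = (await Actor.getInput()) ?? {};
const startUrls = (input.startUrls ?? []).map((source) => source.url);
if (startUrls.length === 0) {
    // Fall back to the CFPB listing pages the project targets.
    startUrls.push(
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/"
    );
}

This keeps the Actor usable from the Apify console while preserving the current behavior when no input is supplied.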
src/routes.js
const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    // Follow the pagination buttons on the listing pages.
    await enqueueLinks({
        selector: ".a-btn[href*=page]",
    });
    // Enqueue every rule detail page.
    await enqueueLinks({
        selector: "article h3 a",
        label: "detail",
    });
});

router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    const result = await page.evaluate(() => {
        const data = {
            Category: document.querySelector("a[class*=m-breadcrumbs]")
                ?.innerText,
            Title: document.querySelector("h1")?.innerText,
            MainParagraphText: document.querySelector(
                ".m-full-width-text p:first-of-type"
            )?.innerText,
            Text: document.querySelector(".m-full-width-text")?.innerText,
            PDFs: [],
            Links: [],
        };
        // Walk the body; H4/H5 headings set the category for the links
        // that follow them.
        let category;
        for (const el of Array.from(
            document.querySelectorAll(".m-full-width-text > *")
        )) {
            if (el.tagName === "H5" || el.tagName === "H4") {
                category = el.getAttribute("id") || el.innerText;
            }
            if (!category) continue;
            const link = el.querySelector("a");
            if (link) {
                const isPDF = link.href.includes(".pdf");
                const obj = {
                    linkText: link.innerText,
                    link: link.href,
                    category,
                };
                if (isPDF) data.PDFs.push(obj);
                else data.Links.push(obj);
            }
        }
        return data;
    });

    // Download and parse every linked PDF; failures are recorded on the
    // item instead of thrown.
    const PDFs = (
        await Promise.allSettled(
            result.PDFs.map(async (pdf) => {
                try {
                    const pdfResponse = await page.request.fetch(pdf.link);
                    // Parse the PDF using pdf-parse; body() resolves to a Buffer.
                    const pdfText = await pdfParse(await pdfResponse.body());
                    return {
                        ...pdf,
                        text: pdfText.text,
                        info: pdfText.info,
                        metadata:
                            pdfText.metadata?._metadata || pdfText.metadata,
                        error: null,
                    };
                } catch (e) {
                    return { ...pdf, error: e.message || e.code || true };
                }
            })
        )
    ).map((p) => p.value);

    // For "Read it in the Federal Register" links, fetch the page and
    // extract the full rule text with cheerio.
    const Links = (
        await Promise.allSettled(
            result.Links.map(async (link) => {
                try {
                    let text;
                    if (
                        link.linkText?.includes("Read it") &&
                        link.linkText?.includes("the Federal Register")
                    ) {
                        const federalRegisterResponse =
                            await page.request.fetch(link.link);
                        const $ = load(await federalRegisterResponse.text());
                        text = $("#fulltext_content_area").text();
                    }
                    return { ...link, text };
                } catch (e) {
                    return { ...link, error: e.message || e.code || true };
                }
            })
        )
    ).map((p) => p.value);

    // If a previous push of this request failed because the record was
    // too large, store the item without the extracted text and stop here.
    if (
        request.errorMessages.some((msg) =>
            msg.includes("Data item is too large")
        )
    ) {
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map((item) => ({
                ...item,
                text: "Please get manually",
            })),
            Links: Links.map((item) => ({
                ...item,
                text: "Please get manually",
            })),
        });
        return;
    }
    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});

module.exports = { router };
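The "Data item is too large" check above only takes effect after a push has already failed and the request was retried. An alternative sketch that measures the serialized record before pushing, assuming the Apify platform's limit of roughly 9 MB per dataset item (MAX_ITEM_BYTES, trimItem, and pushWithinLimit are illustrative helpers, not part of the project):

const { Dataset } = require("crawlee");

// Assumed platform limit of roughly 9 MB per dataset item.
const MAX_ITEM_BYTES = 9 * 1024 * 1024;

// Replace the bulky extracted text with a marker, keeping links and metadata.
const trimItem = (item) => ({
    ...item,
    PDFs: item.PDFs.map((pdf) => ({ ...pdf, text: "Please get manually" })),
    Links: item.Links.map((link) => ({ ...link, text: "Please get manually" })),
});

// Push the full record if it fits, otherwise the trimmed version.
const pushWithinLimit = async (item) => {
    const size = Buffer.byteLength(JSON.stringify(item), "utf8");
    await Dataset.pushData(size > MAX_ITEM_BYTES ? trimItem(item) : item);
};

The detail handler could then call pushWithinLimit({ url: request.url, ...result, PDFs, Links }) instead of branching on request.errorMessages, avoiding one failed push and retry per oversized page.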
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
package.json
{
    "name": "consumer",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumer rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "pdf-parse": "^1.1.1",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Dhaval Rupapara"
}
start.bat
start.sh

Pricing
Pricing model: Pay per usage
This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.