Legal Opinions

Deprecated

Developed by Dhaval Rupapara

Maintained by Community

Rating: 0.0 (0)

Pricing: Pay per usage

Total users: 2

Monthly users: 2

Last modified: a year ago
.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, rebuilds will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "Legal Opinions",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                { "url": "https://apify.com" }
            ]
        }
    }
}
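
For reference, with this schema the requestListSources editor hands the Actor an input object of the following shape (the URL is the prefill above):

{
    "startUrls": [
        { "url": "https://apify.com" }
    ]
}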

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            // Run the browser with in-page JavaScript disabled.
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        // Generous timeouts: detail pages also download and parse PDFs.
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Export the default dataset into the OUTPUT record of the key-value store.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
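
Note that src/main.js hardcodes its start URLs, so the startUrls field from .actor/input_schema.json is ignored at run time. A minimal sketch of wiring the input through instead, using Actor.getInput() from the apify package (the fallback URLs are the two CFPB pages hardcoded above; requestListSources entries are assumed to be plain { url } objects):

const { Actor } = require("apify");

Actor.main(async () => {
    // Fall back to the hardcoded listing pages when no input is provided,
    // e.g. during a local run.
    const input = (await Actor.getInput()) ?? {};
    const sources = input.startUrls ?? [
        { url: "https://www.consumerfinance.gov/rules-policy/final-rules/" },
        { url: "https://www.consumerfinance.gov/rules-policy/rules-under-development/" },
    ];
    const startUrls = sources.map((src) =>
        typeof src === "string" ? src : src.url
    );
    // ...construct and run the PlaywrightCrawler exactly as above...
});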

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");

const router = createPlaywrightRouter();

// List pages: follow the pagination buttons and enqueue the detail pages.
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    await enqueueLinks({
        selector: ".a-btn[href*=page]",
    });
    await enqueueLinks({
        selector: "article h3 a",
        label: "detail",
    });
});

router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Scrape the page in the browser context: basic fields plus every link,
    // grouped under the nearest preceding H4/H5 heading and split into PDF
    // and non-PDF links.
    const result = await page.evaluate(() => {
        let result = {
            Category: document.querySelector("a[class*=m-breadcrumbs]")
                .innerText,
            Title: document.querySelector("h1").innerText,
            MainParagraphText: document.querySelector(
                ".m-full-width-text p:first-of-type"
            ).innerText,
            Text: document.querySelector(".m-full-width-text").innerText,
            PDFs: [],
            Links: [],
        };
        let category;
        for (const el of Array.from(
            document.querySelectorAll(".m-full-width-text >*")
        )) {
            if (el.tagName == "H5" || el.tagName == "H4")
                category = el?.getAttribute("id") || el?.innerText;
            if (!category) continue;
            const link = el.querySelector("a");
            if (link) {
                const isPDF = link.href.includes(".pdf");
                const obj = {
                    linkText: link.innerText,
                    link: link.href,
                    category,
                };
                if (isPDF) result.PDFs.push(obj);
                else result.Links.push(obj);
            }
        }
        return result;
    });

    // Download and parse every linked PDF. Each download resolves (never
    // rejects), so a single broken PDF cannot fail the whole request.
    const PDFs = (
        await Promise.allSettled(
            result.PDFs.map(async (pdf) => {
                try {
                    const pdfResponse = await page.request.fetch(pdf.link);
                    // Parse the PDF using pdf-parse. It expects a Node
                    // Buffer, which is what APIResponse.body() resolves to.
                    const pdfText = await pdfParse(await pdfResponse.body());
                    return {
                        ...pdf,
                        text: pdfText.text,
                        info: pdfText.info,
                        metadata:
                            pdfText.metadata?._metadata || pdfText.metadata,
                        error: null,
                    };
                } catch (e) {
                    return { ...pdf, error: e.message || e.code || true };
                }
            })
        )
    ).map((p) => p.value);

    // For "Read it in the Federal Register" links, fetch the linked page
    // and extract the full rule text with cheerio.
    const Links = (
        await Promise.allSettled(
            result.Links.map(async (link) => {
                try {
                    let text;
                    if (
                        link.linkText?.includes("Read it") &&
                        link.linkText?.includes("the Federal Register")
                    ) {
                        const federalRegisterResponse =
                            await page.request.fetch(link.link);
                        const $ = load(await federalRegisterResponse.text());
                        text = $("#fulltext_content_area").text();
                    }
                    return { ...link, text };
                } catch (e) {
                    return { ...link, error: e.message || e.code || true };
                }
            })
        )
    ).map((p) => p.value);

    // If a previous attempt failed because the dataset item exceeded the
    // size limit, push a trimmed item without the extracted texts instead.
    if (
        request.errorMessages.some((msg) =>
            msg.includes("Data item is too large")
        )
    ) {
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map((item) => ({
                ...item,
                text: "Please get Manually",
            })),
            Links: Links.map((item) => ({
                ...item,
                text: "Please get Manually",
            })),
        });
        return;
    }
    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});

module.exports = { router };
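
The detail handler above depends on the shape of the pdf-parse result (text, info, metadata). A minimal standalone sketch for inspecting that shape, assuming a hypothetical local file sample.pdf:

const fs = require("fs");
const pdfParse = require("pdf-parse");

(async () => {
    // "sample.pdf" is a hypothetical placeholder, not a file from this Actor.
    const buffer = fs.readFileSync("sample.pdf");
    const parsed = await pdfParse(buffer);
    console.log(parsed.numpages);           // page count
    console.log(parsed.info);               // PDF info dictionary (Title, Author, ...)
    console.log(parsed.text.slice(0, 200)); // start of the extracted plain text
})();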

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

package.json

{
    "name": "consumer",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumer rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "pdf-parse": "^1.1.1",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: no tests yet!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Dhaval Rupapara"
}

start.bat

start.sh