NCUAs Rules Regulation
Try for free
No credit card required
Go to Store
NCUAs Rules Regulation
nondescript_cord/ncuas-rules-regulation
Try for free
No credit card required
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4dist
5node_modules
6apify_storage
7crawlee_storage
8storage
9
10# Added by Apify CLI
11.venv
Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Because this happens after the NPM install layer, rebuilds are fast
24# for most source file changes.
25COPY . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
package.json
1{
2 "name": "my-crawler",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of a Crawlee project.",
6 "dependencies": {
7 "apify": "^3.2.0",
8 "crawlee": "^3.0.0",
9 "playwright": "*"
10 },
11 "scripts": {
12 "start": "node src/main.js",
13 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
14 "postinstall": "npx crawlee install-playwright-browsers"
15 },
16 "author": "It's not you it's me",
17 "license": "ISC"
18}
start.bat
Download
start.sh
Download
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "NCUAs-Rules-Regulation",
4 "version": "0.0",
5 "buildTag": "latest",
6 "environmentVariables": {}
7}
src/main.js
import { Actor } from "apify";
import { PlaywrightCrawler, Dataset } from "crawlee";

/** Entry page handed to the crawler. */
const START_URLS = ["https://www.ecfr.gov/current/title-12/chapter-VII"];

/**
 * Tells whether a link target is a PDF document.
 *
 * Only the path component is inspected, case-insensitively, so URLs such as
 * `https://host/doc.PDF` or `https://host/doc.pdf?download=1#page=2` are
 * recognised as PDFs.
 *
 * @param {string} href - Link target, normally an absolute URL.
 * @returns {boolean} `true` when the path ends in `.pdf`.
 */
function isPdfLink(href) {
    let path;
    try {
        path = new URL(href).pathname;
    } catch {
        // Not parseable as an absolute URL: strip query/fragment by hand.
        path = href.split(/[?#]/, 1)[0];
    }
    return path.toLowerCase().endsWith(".pdf");
}

/**
 * Splits raw anchors into regular links and PDF links.
 *
 * An anchor is dropped when its text is purely numeric (as judged by
 * `Number(linkText)`), when it has no target, or when it is a `mailto:` link.
 *
 * @param {Array<{linkText: string, link: string}>} anchors - Anchors in page order.
 * @returns {{Links: Array<{linkText: string, link: string}>, PDFs: Array<{linkText: string, link: string}>}}
 *   The surviving anchors, grouped by kind, each group in page order.
 */
function classifyLinks(anchors) {
    const Links = [];
    const PDFs = [];

    for (const { linkText, link } of anchors) {
        const hasNumericText = !Number.isNaN(Number(linkText));
        // Match the scheme only, so ordinary URLs that merely contain the
        // substring "mailto" are no longer discarded.
        const isMailto = link.toLowerCase().startsWith("mailto:");
        if (hasNumericText || isMailto || link === "") {
            continue;
        }

        if (isPdfLink(link)) {
            PDFs.push({ linkText, link });
        } else {
            Links.push({ linkText, link });
        }
    }

    return { Links, PDFs };
}

Actor.main(async () => {
    const crawler = new PlaywrightCrawler({
        maxRequestsPerCrawl: 100, // Limit the number of requests to 100
        /**
         * Scrapes the heading, the table-of-contents text and its links from
         * the loaded page and stores them as one dataset item.
         */
        async requestHandler({ request, page, log }) {
            log.info(`Processing ${request.url}`);

            const title = await page.title();
            log.info(`${title}`, { url: request.loadedUrl });

            // This callback runs inside the browser page, so it cannot call the
            // helpers above; it only extracts plain serialisable data.
            const scraped = await page.evaluate(() => ({
                Title: document.querySelector(".logo h2")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".table-of-contents.root")
                        ?.innerText || "N/A",
                anchors: Array.from(
                    document.querySelectorAll(".table-of-contents.root a"),
                    (el) => ({
                        linkText: el.innerText || "N/A",
                        link: el.href || "",
                    })
                ),
            }));

            const { Links, PDFs } = classifyLinks(scraped.anchors);

            // Save the data to the dataset
            await Dataset.pushData({
                url: request.url,
                Category: "Rules and Regulations",
                Title: scraped.Title,
                MainParagraphText: scraped.MainParagraphText,
                Links,
                PDFs,
            });
        },
    });

    // Run the crawler with the initial URL
    await crawler.run(START_URLS);
});
src/routes.js
import { createPlaywrightRouter } from 'crawlee';

/** Router instance on which the handlers below are registered. */
export const router = createPlaywrightRouter();

/**
 * Default handler: enqueues links matching the glob below and labels them
 * 'detail' so they are routed to the detail handler.
 */
const enqueueDetailPages = async (context) => {
    const { enqueueLinks, log } = context;
    log.info(`enqueueing new URLs`);

    const enqueueOptions = {
        globs: ['https://crawlee.dev/**'],
        label: 'detail',
    };
    await enqueueLinks(enqueueOptions);
};

/**
 * 'detail' handler: logs the page title and pushes the loaded URL together
 * with the title as one record.
 */
const saveDetailPage = async (context) => {
    const { request, page, log, pushData } = context;

    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    const record = { url: request.loadedUrl, title };
    await pushData(record);
};

router.addDefaultHandler(enqueueDetailPages);
router.addHandler('detail', saveDetailPage);
Developer
Maintained by Community
Actor Metrics
2 monthly users
-
1 star
>99% runs succeeded
Created in May 2024
Modified 8 months ago
Categories