NCUAs Rules Regulation avatar
NCUAs Rules Regulation
Try for free

No credit card required

View all Actors
NCUAs Rules Regulation

NCUAs Rules Regulation

nondescript_cord/ncuas-rules-regulation
Try for free

No credit card required

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4dist
5node_modules
6apify_storage
7crawlee_storage
8storage
9
10# Added by Apify CLI
11.venv

Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Because this happens after the NPM install, rebuilds will be really fast
24# for most source file changes.
25COPY --chown=myuser . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

package.json

1{
2    "name": "my-crawler",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of a Crawlee project.",
6    "dependencies": {
7        "apify": "^3.2.0",
8        "crawlee": "^3.0.0",
9        "playwright": "*"
10    },
11    "scripts": {
12        "start": "node src/main.js",
13        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
14        "postinstall": "npx crawlee install-playwright-browsers"
15    },
16    "author": "It's not you it's me",
17    "license": "ISC"
18}

start.bat

Download

start.sh

Download

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "NCUAs-Rules-Regulation",
4	"version": "0.0",
5	"buildTag": "latest",
6	"environmentVariables": {}
7}

src/main.js

1import { Actor } from "apify";
2import { PlaywrightCrawler, Dataset } from "crawlee";
3
/**
 * Actor entry point.
 *
 * Opens the eCFR Title 12, Chapter VII page in a Playwright-driven browser,
 * extracts the page heading, the table-of-contents text and the links found
 * inside the table of contents, and stores one record in the default dataset.
 *
 * Record shape pushed to the dataset:
 *   { url, Category, Title, MainParagraphText, Links: [...], PDFs: [...] }
 * where every entry of `Links` / `PDFs` is `{ linkText, link }`.
 */
Actor.main(async () => {
  const crawler = new PlaywrightCrawler({
    // Upper bound on the number of requests processed in a single run.
    maxRequestsPerCrawl: 100,

    /**
     * Handles a single loaded page: scrapes it and saves the result.
     *
     * @param {object} context - Crawling context supplied by the crawler.
     * @param {object} context.request - The request being processed.
     * @param {object} context.page - Playwright page for the loaded URL.
     * @param {object} context.log - Logger scoped to this crawler.
     */
    async requestHandler({ request, page, log }) {
      log.info(`Processing ${request.url}`);

      const title = await page.title();
      log.info(`${title}`, { url: request.loadedUrl });

      // The callback below is executed inside the browser page, so it must be
      // self-contained: helpers are declared inside it rather than captured
      // from the surrounding Node.js scope.
      const result = await page.evaluate(() => {
        const TOC_SELECTOR = ".table-of-contents.root";

        // Decide on the URL path only, case-insensitively, so that links such
        // as ".../file.PDF" or ".../file.pdf?download=1" are still recognised.
        const isPdfLink = (href) => {
          try {
            return new URL(href).pathname.toLowerCase().endsWith(".pdf");
          } catch {
            // Not a parseable absolute URL; fall back to a plain suffix test.
            return href.toLowerCase().endsWith(".pdf");
          }
        };

        // Match the URL scheme instead of searching for "mailto" anywhere in
        // the address, which would also discard unrelated links.
        const isMailtoLink = (href) => href.toLowerCase().startsWith("mailto:");

        const scraped = {
          Category: "Rules and Regulations",
          Title: document.querySelector(".logo h2")?.innerText || "N/A",
          MainParagraphText:
            document.querySelector(TOC_SELECTOR)?.innerText || "N/A",
          Links: [],
          PDFs: [],
        };

        for (const anchor of document.querySelectorAll(`${TOC_SELECTOR} a`)) {
          const entry = {
            linkText: anchor.innerText || "N/A",
            link: anchor.href || "",
          };

          // Skip anchors without a target, e-mail links, and anchors whose
          // label parses as a number.
          const hasNumericLabel = !Number.isNaN(Number(entry.linkText));
          if (entry.link === "" || isMailtoLink(entry.link) || hasNumericLabel) {
            continue;
          }

          if (isPdfLink(entry.link)) {
            scraped.PDFs.push(entry);
          } else {
            scraped.Links.push(entry);
          }
        }

        return scraped;
      });

      // Save the data to the dataset
      await Dataset.pushData({
        url: request.url,
        ...result,
      });
    },
  });

  // Run the crawler with the initial URL
  await crawler.run(["https://www.ecfr.gov/current/title-12/chapter-VII"]);
});

src/routes.js

1import { createPlaywrightRouter } from 'crawlee';
2
// Router instance; the handlers registered below are attached to it.
export const router = createPlaywrightRouter();

/**
 * Registered as the router's default handler: enqueues every link on the
 * current page that matches the glob and tags it with the 'detail' label.
 *
 * @param {object} context - Crawling context supplied by the router.
 * @param {Function} context.enqueueLinks - Adds matching page links to the queue.
 * @param {object} context.log - Logger for this request.
 */
async function enqueueDetailPages({ enqueueLinks, log }) {
    log.info(`enqueueing new URLs`);

    const enqueueOptions = {
        globs: ['https://crawlee.dev/**'],
        label: 'detail',
    };
    await enqueueLinks(enqueueOptions);
}

/**
 * Registered for requests labelled 'detail': logs the page title and stores
 * the loaded URL together with that title.
 *
 * @param {object} context - Crawling context supplied by the router.
 * @param {object} context.request - The request being processed.
 * @param {object} context.page - Playwright page for the loaded URL.
 * @param {object} context.log - Logger for this request.
 * @param {Function} context.pushData - Stores a record in the dataset.
 */
async function storePageTitle({ request, page, log, pushData }) {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    const record = {
        url: request.loadedUrl,
        title,
    };
    await pushData(record);
}

router.addDefaultHandler(enqueueDetailPages);
router.addHandler('detail', storePageTitle);
Developer
Maintained by Community
Actor metrics
  • 2 monthly users
  • 0 stars
  • 100.0% runs succeeded
  • Created in May 2024
  • Modified about 2 months ago
Categories