My Actor
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
My Actor
flamboyant_marimba/my-actor
Testing
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick builds will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "project-cheerio-crawler-javascript",
4 "title": "Project Cheerio Crawler Javascript",
5 "description": "Crawlee and Cheerio project in javascript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-cheerio"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "CheerioCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with.",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://apify.com"
14 }
15 ]
16 }
17 }
18}
src/main.js
1/**
2 * This template is a production-ready boilerplate for developing with `CheerioCrawler`.
3 * Use this to bootstrap your projects using the most up-to-date code.
4 * If you're looking for examples or want to learn more, see README.
5 */
6
7// For more information, see https://docs.apify.com/sdk/js
8import { Actor } from 'apify';
9// For more information, see https://crawlee.dev
10import { CheerioCrawler } from 'crawlee';
11import { router } from './routes.js';
12
// Initialize the Apify SDK
await Actor.init();

// SECURITY: the API token must never be hard-coded in source. The previous
// version embedded a token directly in the start URL; that token should be
// treated as compromised and revoked. Read it from the environment instead.
const apifyToken = process.env.APIFY_TOKEN;
if (!apifyToken) {
    throw new Error('The APIFY_TOKEN environment variable is not set.');
}

// Build the same "list runs" API URL as before, letting the URL API take care
// of encoding the token query parameter.
const runsUrl = new URL('https://api.apify.com/v2/acts/flamboyant_marimba~my-actor/runs');
runsUrl.searchParams.set('token', apifyToken);

const startUrls = [runsUrl.href];

const proxyConfiguration = await Actor.createProxyConfiguration();

// Every request is dispatched through the router imported from ./routes.js.
const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

await crawler.run(startUrls);

// Exit successfully
await Actor.exit();
src/routes.js
1import { Dataset, createCheerioRouter } from 'crawlee';
2
/** Router passed to the crawler as its request handler; routes requests by label. */
export const router = createCheerioRouter();

// Requests without a matching label land here: enqueue every link that
// matches the apify.com glob and tag it with the 'detail' label.
router.addDefaultHandler(async (context) => {
    const { enqueueLinks, log } = context;
    log.info(`enqueueing new URLs`);

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
});

// 'detail' requests: log the page title and store it together with the URL.
router.addHandler('detail', async (context) => {
    const { request, $, log } = context;

    const title = $('title').text();
    log.info(`${title}`, { url: request.loadedUrl });

    const record = {
        url: request.loadedUrl,
        title,
    };
    await Dataset.pushData(record);
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-cheerio-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is a boilerplate of an Apify actor.",
6 "engines": {
7 "node": ">=16.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.0.0",
11 "crawlee": "^3.0.0"
12 },
13 "devDependencies": {
14 "@apify/eslint-config": "^0.3.1",
15 "eslint": "^8.36.0"
16 },
17 "scripts": {
18 "start": "node src/main.js",
19 "lint": "eslint ./src --ext .js,.jsx",
20 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
21 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
22 },
23 "author": "It's not you it's me",
24 "license": "ISC"
25}
Developer
Maintained by Community
Categories