# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# The `18` tag lines up with the `"node": ">=18.0.0"` engines requirement
# declared in package.json; keep the two in sync when upgrading Node.js.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json (the glob also matches
# when only package.json exists) to speed up the build using Docker
# layer cache: the dependency layers below are rebuilt only when these
# manifest files change.
# No WORKDIR is set in this file, so `./` resolves against the working
# directory inherited from the base image.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging (`npm list` is allowed to fail so that a noisy tree
# never breaks the build).
# The npm cache is deleted in the same layer that created it, so it never
# ends up in the image. `rm -rf` is used instead of `rm -r` so the build
# does not abort when the cache directory was never created.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -rf ~/.npm

# Next, copy the remaining files and directories with the source code.
# Because this happens after the NPM install step, that layer stays
# cached and rebuilds are fast for most source file changes.
# Paths listed in .dockerignore (e.g. node_modules, storage, .git) are
# excluded from the build context and therefore not copied.
COPY . ./


# Run the image. The exec (JSON-array) form starts npm directly instead
# of wrapping it in `/bin/sh -c`, so stop signals sent to the container
# are delivered to npm rather than to an intermediate shell.
CMD ["npm", "start", "--silent"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-25",
    "title": "Project Cheerio Crawler Javascript",
    "description": "Crawlee and Cheerio project in javascript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "csvUrls": {
            "title": "Upload or link a CSV or text file",
            "type": "array",
            "description": "Upload or link a CSV with data",
            "editor": "requestListSources"
        },
        "separator": {
            "title": "Column separator",
            "type": "string",
            "default": ",",
            "description": "Usually `,` or `;`",
            "editor": "textfield"
        }
    }
}

src/main.js

import { Actor, log } from 'apify';

import { gotScraping } from 'got-scraping';
import neatCsv from 'neat-csv';

// Downloads every CSV file referenced in the actor input, parses it with
// the configured column separator and pushes the rows to the default dataset.
//
// NOTE(review): `got-scraping` is imported here but is not listed in the
// package.json dependencies; presumably it resolves transitively — consider
// declaring it explicitly.

// Initialize the Apify SDK
await Actor.init();

// The input can be missing entirely, so fall back to an empty object
// before destructuring instead of crashing with a TypeError.
const { csvUrls, separator = ',' } = (await Actor.getInput()) ?? {};

// Fail with a readable message when no list of sources was provided.
if (!Array.isArray(csvUrls)) {
    await Actor.fail('Input field "csvUrls" is missing or is not an array.');
}

// Each source may carry the link under `url` or `requestsFromUrl`;
// entries with neither are dropped.
const urls = csvUrls.map((req) => req?.url || req?.requestsFromUrl).filter(Boolean);

await Actor.setStatusMessage(`Received ${urls.length} CSV URLs. Starting download.`);

// Files are processed one at a time, in input order.
for (const url of urls) {
    const { body } = await gotScraping(url);
    let data;
    try {
        data = await neatCsv(body.toString(), { separator });
    } catch (e) {
        await Actor.fail(`Could not convert file to CSV with error: ${e}`);
    }
    await Actor.setStatusMessage(`Received ${data.length} rows from ${url}. Starting to push to the dataset, this might take a while.`);
    await Actor.pushData(data);
}

await Actor.exit(`CSV successfully converted to a dataset with ID: ${Actor.getEnv().defaultDatasetId}`);

.dockerignore

# Paths listed here are left out of the Docker build context,
# so `COPY . ./` in the Dockerfile never copies them into the image.

# IDE configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files (dependencies are installed inside the image by `npm install`)
node_modules

# git folder
.git

.editorconfig

# Top-most EditorConfig file: editors stop looking in parent directories.
root = true

# Defaults applied to every file in the project:
# 4-space indentation, UTF-8, LF line endings, no trailing whitespace,
# and a newline at the end of each file.
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

# OS and IDE files
.DS_Store
.idea
# build output and installed dependencies
dist
node_modules
# local storage folders
apify_storage
storage

package.json

{
    "name": "crawlee-cheerio-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "neat-csv": "^7.0.0"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

Image Text Extractor

m3web/image-text-extractor

Extract text from images using OCR (Optical Character Recognition) via direct URLs or uploaded JSON/CSV files. Works with multiple languages and automatically enriches your structured file with the text found inside images.

M3Web

Barcode Search Scraper

powerai/barcode-search-scraper

Scrapes product information from barcode database on barcodelookup.com, including brand, manufacturer, category, and other details.

PowerAI

5.0

Scraped Data Cleaner & Converter (No-Code CSV/JSON Tool) Rental

m3web/scraped-data-cleaner-rental

Clean and organize scraped .json or .csv data — no coding required. Remove duplicates, empty rows, unwanted columns, and sort by any field. Cleaned results are pushed to your Apify dataset. Perfect for marketers, researchers, and no-code workflows.

M3Web

5.0

Receive webhooks

riceman/receive-webhooks

Receives webhook data and saves it to Apify dataset storage. Ideal for Clay users on the Starter plan who still want or need to build tables that can receive webhooks.

RICEMAN

Forward dataset as POST data

anchor/forward-dataset-webhook

This Actor forwards the results of another Actor to an endpoint, so you do not have to fetch the results manually. It downloads the dataset and attaches it to the body of a POST request that you specify, acting as a new webhook. Simplify your Actor process!

Anchor

5.0

Power Webhook Integration

pocesar/run-webhook-digest

Lets you provide multiple HTTP endpoints that receive a more complete JSON payload from the run, call those endpoints through a proxy, make conditional webhook calls with a few lines of JavaScript, and link/chain one Actor to another.

Paulo Cesar

Html Renderer

jakubbalada/html-renderer

Generate image for your HTML using a headless browser

Jakub Balada

HTTP API & Webhook Gateway

amernas/http-api-webhook-gateway

A versatile actor that functions as both a powerful HTTP client and a secure webhook receiver. If your application can trigger an Apify actor but can't make direct HTTP calls, use this tool to interact with any external API. It also provides a stable endpoint to receive incoming webhooks.

Traffic Architect

YellowPages.ca Business Data Scraper

delicious_zebu/yellowpages-ca-business-data-scraper

Effortlessly extract comprehensive Canadian business data from YellowPages.ca with flexible search options, rich detail extraction, and customizable pagination for your market research and lead generation needs.

ВAH

5.0

Convert Image to PDF and PDF to Image

akash9078/image-pdf-converter

Convert images (JPG, PNG, BMP, and more) into high-quality PDFs, or extract images from PDF files in seconds. Image–PDF Converter Pro delivers fast, reliable, and professional results for all your document and image conversion needs.