OMR Review Scraper avatar

OMR Review Scraper

Try for free

No credit card required

Go to Store
OMR Review Scraper

OMR Review Scraper

scrapers123/omr-review-scraper
Try for free

No credit card required

Scrapes software reviews from OMR

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install NPM packages, skip optional and development dependencies to
14# keep the image small. Avoid logging too much and print the dependency
15# tree for debugging
16RUN npm --quiet set progress=false \
17    && npm install --omit=dev --omit=optional \
18    && echo "Installed NPM packages:" \
19    && (npm list --omit=dev --all || true) \
20    && echo "Node.js version:" \
21    && node --version \
22    && echo "NPM version:" \
23    && npm --version \
24    && rm -r ~/.npm
25
26# Next, copy the remaining files and directories with the source code.
27# Since we do this after NPM install, quick build will be really fast
28# for most source file changes.
29COPY . ./
30
31
32# Run the image.
33CMD npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "OMR Review Scraper",
5    "description": "Scrapes software reviews from OMR pages with pagination support.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2  "$schema": "http://json-schema.org/draft-07/schema#",
3  "schemaVersion": 1,
4  "type": "object",
5  "title": "Input Schema for Review Scraper",
6  "description": "Schema for the input to the Actor that scrapes reviews.",
7  "properties": {
8    "url": {
9      "type": "string",
10      "title": "URL to scrape",
11      "description": "The URL of the webpage containing reviews to scrape.",
12      "editor": "textfield",
13      "default": "https://omr.com/en/reviews/product/microsoft-teams/all"
14    },
15    "maxReviews": {
16      "type": "integer",
17      "title": "Maximum Reviews",
18      "description": "The maximum number of reviews to scrape (e.g., 100).",
19      "editor": "number",
20      "default": 100
21    }
22  },
23  "required": ["url"],
24  "additionalProperties": false
25}

src/main.js

import { Actor } from 'apify';
import axios from 'axios';
import * as cheerio from 'cheerio';

await Actor.init();

// Actor.getInput() resolves to null when the actor is started with no input
// at all — guard before destructuring.
const input = (await Actor.getInput()) ?? {};
const { url, maxReviews = 100 } = input;

if (!url) {
    throw new Error('Missing required input field "url".');
}

let currentPage = 1;
const reviews = [];

// Proxy rotation setup using actor-provided proxy URLs (comma-separated list
// in APIFY_PROXY_URLS). Blank entries from stray commas are dropped.
const proxyUrls = process.env.APIFY_PROXY_URLS?.split(',').map((u) => u.trim()).filter(Boolean) ?? [];
let proxyIndex = 0;

while (reviews.length < maxReviews) {
    // Strip any trailing slash from the input URL so we never build ".../​/1".
    const currentPageUrl = `${url.replace(/\/+$/, '')}/${currentPage}`;
    console.log(`Scraping page: ${currentPageUrl}`);

    try {
        // Rotate proxies round-robin using axios's built-in `proxy` option.
        // The original called `new (require('https-proxy-agent'))(proxyUrl)`,
        // which throws in an ES module ("require is not defined") and relied
        // on a package that is not declared in package.json.
        const axiosConfig = {};
        if (proxyUrls.length > 0) {
            const proxy = new URL(proxyUrls[proxyIndex % proxyUrls.length]);
            proxyIndex++;
            axiosConfig.proxy = {
                protocol: proxy.protocol.replace(':', ''),
                host: proxy.hostname,
                port: Number(proxy.port),
                ...(proxy.username
                    ? {
                          auth: {
                              username: decodeURIComponent(proxy.username),
                              password: decodeURIComponent(proxy.password),
                          },
                      }
                    : {}),
            };
        }

        // Fetch the HTML content of the current page.
        const response = await axios.get(currentPageUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        // Extract reviews from the current page.
        const pageReviews = [];
        $('[data-testid="text-review-quotes"]').each((i, element) => {
            // Returning false stops cheerio's .each() early once the cap is hit.
            if (reviews.length + pageReviews.length >= maxReviews) return false;

            const positive = $(element).find('[data-testid="text-review-quotes-positive"] [data-testid="review-quote-answer"]').text().trim() || '';
            const negative = $(element).find('[data-testid="text-review-negative"] [data-testid="review-quote-answer"]').text().trim() || '';
            const problems = $(element).find('[data-testid="text-review-problems"] [data-testid="review-quote-answer"]').text().trim() || '';
            const dateText = $(element).closest('[data-testid="review-card"]').find('[data-testid="review-author-date"]').text().trim() || '';

            // Parse the date text into a Date object. NOTE(review): Date.parse
            // on non-ISO strings is engine-dependent — confirm the exact date
            // format OMR renders. Unparseable dates are stored as null (not an
            // Invalid Date) so the final sort comparator stays well-defined.
            const timestamp = dateText ? Date.parse(dateText.replace('Older than', '').trim()) : NaN;
            const parsedDate = Number.isNaN(timestamp) ? null : new Date(timestamp);

            pageReviews.push({
                Positive: positive,
                Negative: negative,
                Problems: problems,
                Date: parsedDate,
                OriginalDate: dateText,
            });
        });

        reviews.push(...pageReviews);

        if (reviews.length >= maxReviews) break;

        if (pageReviews.length === 0) {
            console.log('No more reviews found on this page. Ending pagination.');
            break;
        }

        currentPage++;
    } catch (error) {
        console.error(`Failed to fetch page: ${currentPageUrl}`, error);
        break;
    }
}

// Sort reviews by date in descending order; undated reviews sort last.
reviews.sort((a, b) => (b.Date?.getTime() ?? 0) - (a.Date?.getTime() ?? 0));

// Save the extracted reviews to the default dataset.
await Actor.pushData(reviews);

console.log(`${reviews.length} reviews extracted.`);

await Actor.exit();

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json

package.json

1{
2    "name": "js-scrape-single-page",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "Apify actor that scrapes software reviews from OMR.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.2.6",
11        "axios": "^1.5.0",
12        "cheerio": "^1.0.0-rc.12"
13    },
14    "scripts": {
15        "start": "node ./src/main.js",
16        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
17    },
18    "author": "It's not you it's me",
19    "license": "ISC"
20}
Developer
Maintained by Community

Actor Metrics

  • 4 monthly users

  • 2 stars

  • >99% runs succeeded

  • Created in Dec 2024

  • Modified a month ago