OMR Review Scraper avatar

OMR Review Scraper

Try for free

No credit card required

Go to Store
OMR Review Scraper

OMR Review Scraper

scrapers123/omr-review-scraper
Try for free

No credit card required

Scrapes software reviews from OMR

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install NPM packages, skip optional and development dependencies to
14# keep the image small. Avoid logging too much and print the dependency
15# tree for debugging
16RUN npm --quiet set progress=false \
17    && npm install --omit=dev --omit=optional \
18    && echo "Installed NPM packages:" \
19    && (npm list --omit=dev --all || true) \
20    && echo "Node.js version:" \
21    && node --version \
22    && echo "NPM version:" \
23    && npm --version \
24    && rm -r ~/.npm
25
26# Next, copy the remaining files and directories with the source code.
27# Since we do this after NPM install, quick build will be really fast
28# for most source file changes.
29COPY . ./
30
31
32# Run the image.
33CMD npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "OMR Review Scraper",
5    "description": "Scrapes software reviews from OMR pages with pagination support.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2  "$schema": "http://json-schema.org/draft-07/schema#",
3  "schemaVersion": 1,
4  "type": "object",
5  "title": "Input Schema for Review Scraper",
6  "description": "Schema for the input to the Actor that scrapes reviews.",
7  "properties": {
8    "url": {
9      "type": "string",
10      "title": "URL to scrape",
11      "description": "The URL of the webpage containing reviews to scrape.",
12      "editor": "textfield",
13      "default": "https://omr.com/en/reviews/product/microsoft-teams/all"
14    },
15    "maxReviews": {
16      "type": "integer",
17      "title": "Maximum Reviews",
18      "description": "The maximum number of reviews to scrape (e.g., 100).",
19      "editor": "number",
20      "default": 100
21    }
22  },
23  "required": ["url"],
24  "additionalProperties": false
25}

src/main.js

import { Actor } from 'apify';
import axios from 'axios';
import * as cheerio from 'cheerio';

await Actor.init();

// Actor.getInput() resolves to null when the actor is started with no input
// at all — guard before destructuring.
const input = (await Actor.getInput()) ?? {};
const { url, maxReviews = 100 } = input;

if (!url) {
    throw new Error('Missing required input field "url".');
}

let currentPage = 1;
const reviews = [];

// Proxy rotation setup using actor-provided proxy URLs (comma-separated list
// in APIFY_PROXY_URLS). Blank entries from stray commas are dropped.
const proxyUrls = process.env.APIFY_PROXY_URLS?.split(',').map((u) => u.trim()).filter(Boolean) ?? [];
let proxyIndex = 0;

while (reviews.length < maxReviews) {
    // Strip any trailing slash from the input URL so we never build ".../​/1".
    const currentPageUrl = `${url.replace(/\/+$/, '')}/${currentPage}`;
    console.log(`Scraping page: ${currentPageUrl}`);

    try {
        // Rotate proxies round-robin using axios's built-in `proxy` option.
        // The original called `new (require('https-proxy-agent'))(proxyUrl)`,
        // which throws in an ES module ("require is not defined") and relied
        // on a package that is not declared in package.json.
        const axiosConfig = {};
        if (proxyUrls.length > 0) {
            const proxy = new URL(proxyUrls[proxyIndex % proxyUrls.length]);
            proxyIndex++;
            axiosConfig.proxy = {
                protocol: proxy.protocol.replace(':', ''),
                host: proxy.hostname,
                port: Number(proxy.port),
                ...(proxy.username
                    ? {
                          auth: {
                              username: decodeURIComponent(proxy.username),
                              password: decodeURIComponent(proxy.password),
                          },
                      }
                    : {}),
            };
        }

        // Fetch the HTML content of the current page.
        const response = await axios.get(currentPageUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        // Extract reviews from the current page.
        const pageReviews = [];
        $('[data-testid="text-review-quotes"]').each((i, element) => {
            // Returning false stops cheerio's .each() early once the cap is hit.
            if (reviews.length + pageReviews.length >= maxReviews) return false;

            const positive = $(element).find('[data-testid="text-review-quotes-positive"] [data-testid="review-quote-answer"]').text().trim() || '';
            const negative = $(element).find('[data-testid="text-review-negative"] [data-testid="review-quote-answer"]').text().trim() || '';
            const problems = $(element).find('[data-testid="text-review-problems"] [data-testid="review-quote-answer"]').text().trim() || '';
            const dateText = $(element).closest('[data-testid="review-card"]').find('[data-testid="review-author-date"]').text().trim() || '';

            // Parse the date text into a Date object. NOTE(review): Date.parse
            // on non-ISO strings is engine-dependent — confirm the exact date
            // format OMR renders. Unparseable dates are stored as null (not an
            // Invalid Date) so the final sort comparator stays well-defined.
            const timestamp = dateText ? Date.parse(dateText.replace('Older than', '').trim()) : NaN;
            const parsedDate = Number.isNaN(timestamp) ? null : new Date(timestamp);

            pageReviews.push({
                Positive: positive,
                Negative: negative,
                Problems: problems,
                Date: parsedDate,
                OriginalDate: dateText,
            });
        });

        reviews.push(...pageReviews);

        if (reviews.length >= maxReviews) break;

        if (pageReviews.length === 0) {
            console.log('No more reviews found on this page. Ending pagination.');
            break;
        }

        currentPage++;
    } catch (error) {
        console.error(`Failed to fetch page: ${currentPageUrl}`, error);
        break;
    }
}

// Sort reviews by date in descending order; undated reviews sort last.
reviews.sort((a, b) => (b.Date?.getTime() ?? 0) - (a.Date?.getTime() ?? 0));

// Save the extracted reviews to the default dataset.
await Actor.pushData(reviews);

console.log(`${reviews.length} reviews extracted.`);

await Actor.exit();

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json

package.json

1{
2    "name": "js-scrape-single-page",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "Apify actor that scrapes software reviews from OMR.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.2.6",
11        "axios": "^1.5.0",
12        "cheerio": "^1.0.0-rc.12"
13    },
14    "scripts": {
15        "start": "node ./src/main.js",
16        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
17    },
18    "author": "It's not you it's me",
19    "license": "ISC"
20}
Developer
Maintained by Community

Actor Metrics

  • 4 monthly users

  • 2 stars

  • >99% runs succeeded

  • Created in Dec 2024

  • Modified a month ago