OMR Review Scraper

Developed by ManM · Maintained by Community
Scrapes software reviews from OMR

Rating: 5.0 (1)
Pricing: Pay per usage
Total users: 9
Monthly users: 1
Runs succeeded: >99%
Last modified: 4 months ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "OMR Review Scraper",
    "description": "Scrapes software reviews from OMR.",
    "version": "0.0",
    "meta": {
        "templateId": "js-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "schemaVersion": 1,
    "type": "object",
    "title": "Input Schema for Review Scraper",
    "description": "Schema for the input to the Actor that scrapes reviews.",
    "properties": {
        "url": {
            "type": "string",
            "title": "URL to scrape",
            "description": "The URL of the webpage containing reviews to scrape.",
            "editor": "textfield",
            "default": "https://omr.com/en/reviews/product/microsoft-teams/all"
        },
        "maxReviews": {
            "type": "integer",
            "title": "Maximum Reviews",
            "description": "The maximum number of reviews to scrape (e.g., 100).",
            "editor": "number",
            "default": 100
        }
    },
    "required": ["url"],
    "additionalProperties": false
}
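
For reference, a minimal input conforming to this schema could look like the following sketch; url is the only required field, and maxReviews falls back to its default of 100 when omitted. The values shown are illustrative, not prescriptive:

{
    "url": "https://omr.com/en/reviews/product/microsoft-teams/all",
    "maxReviews": 50
}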

src/main.js

import { Actor } from 'apify';
import axios from 'axios';
import * as cheerio from 'cheerio';
import { HttpsProxyAgent } from 'https-proxy-agent';

await Actor.init();

const input = await Actor.getInput();
const { url, maxReviews = 100 } = input;

let currentPage = 1;
const reviews = [];

// Proxy rotation setup using a comma-separated list of proxy URLs
// provided via the APIFY_PROXY_URLS environment variable (optional).
const proxyUrls = process.env.APIFY_PROXY_URLS?.split(',') || [];
let proxyIndex = 0;

while (reviews.length < maxReviews) {
    const currentPageUrl = `${url}/${currentPage}`;
    console.log(`Scraping page: ${currentPageUrl}`);

    try {
        // Rotate through the proxy URLs if any are available.
        const proxyUrl = proxyUrls.length > 0 ? proxyUrls[proxyIndex % proxyUrls.length] : null;
        proxyIndex++;

        const axiosConfig = proxyUrl
            ? {
                // Disable axios' built-in proxy handling and tunnel
                // through the HTTPS proxy agent instead.
                proxy: false,
                httpsAgent: new HttpsProxyAgent(proxyUrl),
            }
            : {};

        // Fetch the HTML content of the current page.
        const response = await axios.get(currentPageUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        // Extract reviews from the current page.
        const pageReviews = [];
        $('[data-testid="text-review-quotes"]').each((i, element) => {
            if (reviews.length + pageReviews.length >= maxReviews) return false;

            const positive = $(element).find('[data-testid="text-review-quotes-positive"] [data-testid="review-quote-answer"]').text().trim() || '';
            const negative = $(element).find('[data-testid="text-review-negative"] [data-testid="review-quote-answer"]').text().trim() || '';
            const problems = $(element).find('[data-testid="text-review-problems"] [data-testid="review-quote-answer"]').text().trim() || '';
            const dateText = $(element).closest('[data-testid="review-card"]').find('[data-testid="review-author-date"]').text().trim() || '';

            // Parse the date text into a JavaScript Date; fall back to null
            // when the text (e.g., "Older than 12 months") cannot be parsed.
            const parsedTimestamp = dateText ? Date.parse(dateText.replace('Older than', '').trim()) : NaN;
            const parsedDate = Number.isNaN(parsedTimestamp) ? null : new Date(parsedTimestamp);

            pageReviews.push({
                Positive: positive,
                Negative: negative,
                Problems: problems,
                Date: parsedDate,
                OriginalDate: dateText,
            });
        });

        reviews.push(...pageReviews);

        if (reviews.length >= maxReviews) break;

        if (pageReviews.length === 0) {
            console.log('No more reviews found on this page. Ending pagination.');
            break;
        }

        currentPage++;
    } catch (error) {
        console.error(`Failed to fetch page: ${currentPageUrl}`, error);
        break;
    }
}

// Sort reviews by date in descending order; reviews without a parsed date sort last.
reviews.sort((a, b) => (b.Date || 0) - (a.Date || 0));

// Save the extracted reviews to the default dataset.
await Actor.pushData(reviews);

console.log(`${reviews.length} reviews extracted.`);

await Actor.exit();
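
Each dataset item mirrors the object pushed above. Assuming a review card with all three quote sections present, a single stored item could look roughly like this sketch; the field values below are hypothetical placeholders, not real OMR data. Note that the Date object is serialized to an ISO 8601 string when the item is written to the dataset:

{
    "Positive": "Easy to set up and integrates well with our other tools.",
    "Negative": "Search in older conversations can be slow.",
    "Problems": "Occasional call quality issues.",
    "Date": "2024-01-15T00:00:00.000Z",
    "OriginalDate": "January 15, 2024"
}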

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

package.json

{
    "name": "js-scrape-single-page",
    "version": "0.0.1",
    "type": "module",
    "description": "Apify Actor that scrapes software reviews from OMR.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "axios": "^1.5.0",
        "cheerio": "^1.0.0-rc.12",
        "https-proxy-agent": "^7.0.0"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}