
OMR Review Scraper
Scrapes software reviews from OMR
Rating: 5.0 (1)
Pricing: Pay per usage
Total users: 9
Monthly users: 1
Runs succeeded: >99%
Last modified: 4 months ago
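A minimal sketch of starting this Actor from your own code with the apify-client package. The Actor ID and the token placeholder below are assumptions for illustration, not values taken from this listing:

import { ApifyClient } from 'apify-client';

// Placeholders -- substitute your own API token and the Actor's real ID.
const client = new ApifyClient({ token: 'MY_APIFY_TOKEN' });

// Start the Actor and wait for the run to finish.
const run = await client.actor('<username>/omr-review-scraper').call({
    url: 'https://omr.com/en/reviews/product/microsoft-teams/all',
    maxReviews: 50,
});

// Fetch the scraped reviews from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Fetched ${items.length} reviews`);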
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Scrape single page in JavaScript", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "js-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "$schema": "http://json-schema.org/draft-07/schema#", "schemaVersion": 1, "type": "object", "title": "Input Schema for Review Scraper", "description": "Schema for the input to the Actor that scrapes reviews.", "properties": { "url": { "type": "string", "title": "URL to scrape", "description": "The URL of the webpage containing reviews to scrape.", "editor": "textfield", "default": "https://omr.com/en/reviews/product/microsoft-teams/all" }, "maxReviews": { "type": "integer", "title": "Maximum Reviews", "description": "The maximum number of reviews to scrape (e.g., 100).", "editor": "number", "default": 100 } }, "required": ["url"], "additionalProperties": false}
src/main.js
import { Actor } from 'apify';
import axios from 'axios';
import * as cheerio from 'cheerio';
import { HttpsProxyAgent } from 'https-proxy-agent';

await Actor.init();

const input = await Actor.getInput();
const { url, maxReviews = 100 } = input;

let currentPage = 1;
const reviews = [];

// Proxy rotation setup. APIFY_PROXY_URLS is a custom environment variable
// expected to hold a comma-separated list of proxy URLs.
const proxyUrls = process.env.APIFY_PROXY_URLS?.split(',') || [];
let proxyIndex = 0;

while (reviews.length < maxReviews) {
    // Pagination works by appending the page number to the base URL
    const currentPageUrl = `${url}/${currentPage}`;
    console.log(`Scraping page: ${currentPageUrl}`);

    try {
        // Rotate through the configured proxy URLs, if any
        const proxyUrl = proxyUrls.length > 0 ? proxyUrls[proxyIndex % proxyUrls.length] : null;
        proxyIndex++;

        const axiosConfig = proxyUrl
            ? {
                // Disable axios' own proxy handling and tunnel through the agent instead
                proxy: false,
                httpsAgent: new HttpsProxyAgent(proxyUrl),
            }
            : {};

        // Fetch the HTML content of the current page
        const response = await axios.get(currentPageUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        // Extract reviews from the current page
        const pageReviews = [];
        $('[data-testid="text-review-quotes"]').each((i, element) => {
            // Returning false stops Cheerio's .each() early
            if (reviews.length + pageReviews.length >= maxReviews) return false;

            const positive = $(element).find('[data-testid="text-review-quotes-positive"] [data-testid="review-quote-answer"]').text().trim() || '';
            const negative = $(element).find('[data-testid="text-review-negative"] [data-testid="review-quote-answer"]').text().trim() || '';
            const problems = $(element).find('[data-testid="text-review-problems"] [data-testid="review-quote-answer"]').text().trim() || '';
            const dateText = $(element).closest('[data-testid="review-card"]').find('[data-testid="review-author-date"]').text().trim() || '';

            // Parse the date text into a JavaScript Date object;
            // fall back to null when the text is not a parseable date
            const timestamp = dateText ? Date.parse(dateText.replace('Older than', '').trim()) : NaN;
            const parsedDate = Number.isNaN(timestamp) ? null : new Date(timestamp);

            pageReviews.push({
                Positive: positive,
                Negative: negative,
                Problems: problems,
                Date: parsedDate,
                OriginalDate: dateText,
            });
        });

        reviews.push(...pageReviews);

        if (reviews.length >= maxReviews) break;

        if (pageReviews.length === 0) {
            console.log('No more reviews found on this page. Ending pagination.');
            break;
        }

        currentPage++;
    } catch (error) {
        console.error(`Failed to fetch page: ${currentPageUrl}`, error);
        break;
    }
}

// Sort reviews by date in descending order (reviews without a date sort last)
reviews.sort((a, b) => (b.Date || 0) - (a.Date || 0));

// Save the extracted reviews to the default dataset
await Actor.pushData(reviews);

console.log(`${reviews.length} reviews extracted.`);

await Actor.exit();
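Each object passed to Actor.pushData above becomes one item in the run's default dataset. A sketch of one item, with field names taken from the code but review text and dates invented for illustration:

{
    "Positive": "Easy to roll out across teams.",
    "Negative": "Search in older conversations is slow.",
    "Problems": "Occasional sync delays with the calendar.",
    "Date": "2024-03-01T00:00:00.000Z",
    "OriginalDate": "March 1, 2024"
}

Date objects are serialized to ISO 8601 strings when items are stored as JSON in the dataset.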
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
# This file tells Git which files shouldn't be added to source control.
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
package.json
{ "name": "js-scrape-single-page", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.2.6", "axios": "^1.5.0", "cheerio": "^1.0.0-rc.12" }, "scripts": { "start": "node ./src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}