reviews.io
Under maintenance
Pricing
Pay per usage
Go to Store
reviews.io
Under maintenance
This crawler is designed to fetch and compile all customer reviews from the Reviews.io website. It systematically extracts key information, including reviewer names, ratings, review content, submission dates, and relevant metadata.
0.0 (0)
Pricing
Pay per usage
0
Total users
1
Monthly users
1
Runs succeeded
>99%
Last modified
a day ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
# BUG FIX: without --from=builder this COPY reads from the build context,
# not the builder stage, so the compiled dist/ produced by `npm run build`
# above would never reach the final image.
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
    chown -R apify:apify ./
USER apify

# Run the image.
CMD npm run start:prod --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Reviews.io reviews scraper", "description": "Crawl a Reviews.io store page and extract all customer reviews (author, rating, text, date).", "version": "0.0", "meta": { "templateId": "ts-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of the website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com" } }, "required": ["url"]}
src/main.ts
1// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).2import axios from "axios";3// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).4import * as cheerio from "cheerio";5// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).6import { Actor } from "apify";7
8// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().9await Actor.init();10
11interface Input {12 url: string;13}14
15interface Review {16 reviewerName: string;17 reviewText: string;18 rating: number;19 postedDate?: string;20 pageNumber: number;21}22
23interface PageInfo {24 nextPageUrl: string | null;25 pageNumber: number;26}27
28function getStoreId(url: string): string {29 const matches = url.match(/store\/([^\/]+)/);30 return matches ? matches[1] : '';31}32
33async function extractReviewsFromPage(pageUrl: string, pageNumber: number): Promise<{ reviews: Review[]; nextPage: string | null }> {34 console.log(`Fetching reviews from: ${pageUrl}`);35
36 try {37 const response = await axios.get(pageUrl, {38 headers: {39 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',40 Accept: 'text/html',41 },42 });43
44 const $ = cheerio.load(response.data);45 const reviews: Review[] = [];46
47 $('.Review').each((_i, element) => {48 const $element = $(element);49 const reviewerName = $element.find('.Review__author').text().trim() || 'Anonymous';50 const reviewText = $element.find('.Review__body').text().trim()51 .replace(/[""]/g, '')52 .replace(/[\u201C\u201D]/g, '')53 .trim();54 const rating = $element.find('.Rating__stars .stars__icon--100').length;55 const postedDate = $element.find('.Review__dateSource').text().trim();56
57 if (reviewText) {58 reviews.push({59 reviewerName,60 reviewText,61 rating,62 postedDate: postedDate || undefined,63 pageNumber,64 });65 }66 });67
68 // Find the next sequential page URL69 let nextPageUrl: string | null = null;70 const currentPageElement = $('.pagination li.active');71 if (currentPageElement.length) {72 const nextPageElement = currentPageElement.next('li').find('a');73 if (nextPageElement.length) {74 nextPageUrl = nextPageElement.attr('href') || null;75 }76 }77
78 console.log(`Page ${pageNumber}: Found ${reviews.length} reviews. Next page URL: ${nextPageUrl}`);79 return { reviews, nextPage: nextPageUrl };80 } catch (error) {81 if (axios.isAxiosError(error)) {82 console.error(`Error fetching page ${pageNumber}:`, error.message);83 if (error.response) {84 console.error(`Status: ${error.response.status}`);85 }86 }87 throw error;88 }89}90
91async function getTotalPages(url: string): Promise<number> {92 try {93 const response = await axios.get(url);94 const $ = cheerio.load(response.data);95 const totalReviewsText = $('.js-reviewsio-review-count strong').text().trim();96 const totalReviews = parseInt(totalReviewsText, 10);97 const reviewsPerPage = 20;98 const totalPages = Math.ceil(totalReviews / reviewsPerPage);99 console.log(`Total reviews: ${totalReviews}, Total pages: ${totalPages}`);100 return totalPages;101 } catch (error) {102 console.error('Error getting total pages:', error);103 throw error;104 }105}106
107async function getAllReviews(url: string): Promise<Review[]> {108 const allReviews: Review[] = [];109 let currentPage = 1;110 let currentUrl = url;111 const seenUrls = new Set<string>();112
113 try {114 const totalPages = await getTotalPages(url);115 console.log(`Detected ${totalPages} total pages`);116
117 while (currentUrl && currentPage <= totalPages) {118 // Prevent infinite loops by checking if we've seen this URL before119 if (seenUrls.has(currentUrl)) {120 console.log(`Already visited ${currentUrl}, stopping pagination`);121 break;122 }123 seenUrls.add(currentUrl);124
125 try {126 const { reviews, nextPage } = await extractReviewsFromPage(currentUrl, currentPage);127 128 if (reviews.length === 0) {129 console.log(`No more reviews found after page ${currentPage - 1}`);130 break;131 }132
133 allReviews.push(...reviews);134 console.log(`Total reviews collected so far: ${allReviews.length}`);135 136 // Update URL for next iteration137 currentUrl = nextPage || '';138 currentPage++;139
140 // Add a delay between requests141 await new Promise((resolve) => setTimeout(resolve, 2000));142 } catch (error) {143 console.error(`Failed to fetch page ${currentPage}, stopping pagination`);144 break;145 }146 }147 } catch (error) {148 console.error('Error fetching reviews:', error);149 throw error;150 }151
152 return allReviews;153}154
155// Main execution156try {157 const input = await Actor.getInput<Input>();158 if (!input) throw new Error('Input is missing!');159 const { url } = input;160
161 console.log('Starting review extraction...');162 const allReviews = await getAllReviews(url);163 164 // Log review distribution across pages165 const reviewsByPage = allReviews.reduce((acc, review) => {166 acc[review.pageNumber] = (acc[review.pageNumber] || 0) + 1;167 return acc;168 }, {} as Record<number, number>);169 170 console.log('Reviews per page:', reviewsByPage);171 172 // Save all reviews to Dataset173 await Actor.pushData(allReviews);174 175 console.log(`Successfully extracted ${allReviews.length} total reviews`);176} catch (error) {177 console.error('Error during scraping:', error);178 throw error;179} finally {180 // Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().181 await Actor.exit();182}
.dockerignore
# configurations.idea.vscode
# crawlee and apify storage foldersapify_storagecrawlee_storagestorage
# installed filesnode_modules
# git folder.git
# dist folderdist
.editorconfig
root = true
[*]indent_style = spaceindent_size = 4charset = utf-8trim_trailing_whitespace = trueinsert_final_newline = trueend_of_line = lf
.eslintrc
{ "root": true, "env": { "browser": true, "es2020": true, "node": true }, "extends": [ "@apify/eslint-config-ts" ], "parserOptions": { "project": "./tsconfig.json", "ecmaVersion": 2020 }, "ignorePatterns": [ "node_modules", "dist", "**/*.d.ts" ]}
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea.vscode.zedstorageapify_storagecrawlee_storagenode_modulesdisttsconfig.tsbuildinfostorage/*!storage/key_value_storesstorage/key_value_stores/*!storage/key_value_stores/defaultstorage/key_value_stores/default/*!storage/key_value_stores/default/INPUT.json
package.json
{ "name": "ts-start", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.2.6", "axios": "^1.5.0", "cheerio": "^1.0.0-rc.12", "crawlee": "^3.6.4", "puppeteer": "^22.8.2" }, "devDependencies": { "@apify/eslint-config-ts": "^0.3.0", "@apify/tsconfig": "^0.1.0", "@typescript-eslint/eslint-plugin": "^7.18.0", "@typescript-eslint/parser": "^7.18.0", "eslint": "^8.50.0", "tsx": "^4.6.2", "typescript": "^5.3.3" }, "scripts": { "start": "npm run start:dev", "start:prod": "node dist/main.js", "start:dev": "tsx src/main.ts", "build": "tsc", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}
tsconfig.json
{ "extends": "@apify/tsconfig", "compilerOptions": { "module": "NodeNext", "moduleResolution": "NodeNext", "target": "ES2022", "outDir": "dist", "noUnusedLocals": false, "skipLibCheck": true, "lib": ["DOM"] }, "include": [ "./src/**/*" ]}