
Booking Reviews Scraper
Deprecated
Pricing
Pay per usage
Go to Store

Booking Reviews Scraper
Deprecated
In the input tab, provide the Booking.com listing URL you'd like reviews for.
0.0 (0)
Pricing
Pay per usage
1
Total users
4
Monthly users
1
Last modified
a year ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Remove shell access
# NOTE(review): no instruction follows this comment in the source listing;
# confirm whether a build step is missing here.

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Booking Reviews Scraper input", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Booking.com listing URL", "type": "string", "description": "URL of the Booking.com listing whose reviews should be scraped.", "editor": "textfield", "prefill": "https://apify.com" } }}
main_folder/main.js
import puppeteer from 'puppeteer';
import { Actor } from 'apify';

/** Origin that the relative pagination href is resolved against. */
const BOOKING_ORIGIN = 'https://www.booking.com';

/** Desktop Chrome user agent sent with every page. */
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

/** Chromium flags used for the single shared browser instance. */
const LAUNCH_OPTIONS = { args: ['--no-sandbox', '--disable-setuid-sandbox'] };

/** Listings with at most this many reviews are read straight from the opened review panel. */
const SINGLE_PAGE_REVIEW_LIMIT = 10;

/** Amount the `offset` query value grows by between two paginated requests. */
const OFFSET_STEP = 25;

// CSS equivalents of the XPath expressions this scraper was written with.
// CSS is used because package.json does not pin the Puppeteer version and
// recent Puppeteer releases no longer provide page.$x() / page.waitForXPath().
// XPath `div[2]` (second <div> child) maps to `div:nth-of-type(2)`, and
// `[@class="review_list"]` (exact attribute value) maps to `[class="review_list"]`.
const SELECTORS = {
    reviewCount: '#js--hp-gallery-scorecard > a > div > div > div > div:nth-of-type(2) > div:nth-of-type(2)',
    reviewsButton: '#js--hp-gallery-scorecard > a',
    reviewContainer: '#review_list_page_container',
    reviewBlock: '.review_list_new_item_block',
    reviewItems: 'ul[class="review_list"] > li',
    nextPageLink: '#review_list_page_container > div:nth-of-type(4) > div > div:nth-of-type(1) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > a',
};

/**
 * Builds the URL of one paginated review page.
 *
 * The `&&offset=` separator is kept exactly as the scraper has always sent it.
 *
 * @param {string} bookingUrl - Review-list URL with its trailing offset part already removed.
 * @param {number} offset - Index of the first review to request.
 * @returns {string} URL for the requested page of reviews.
 */
function createNewLink(bookingUrl, offset) {
    return `${bookingUrl}&&offset=${offset}`;
}

/**
 * Extracts the review count from the scorecard text by joining every digit group,
 * so a grouped number such as "1,234" becomes 1234.
 *
 * @param {string|null} text - Raw text content of the scorecard element.
 * @returns {number|null} Parsed count, or null when the text contains no digits.
 */
function parseReviewCount(text) {
    const digitGroups = (text ?? '').match(/\d+/g);
    return digitGroups ? Number.parseInt(digitGroups.join(''), 10) : null;
}

/**
 * Opens a new tab configured with the scraper's user agent.
 *
 * @param {import('puppeteer').Browser} browser - Shared browser instance.
 * @returns {Promise<import('puppeteer').Page>} The configured page.
 */
async function openPage(browser) {
    const page = await browser.newPage();
    await page.setUserAgent(USER_AGENT);
    return page;
}

/**
 * Reads the trimmed text of the first descendant matching `selector`.
 *
 * @param {import('puppeteer').ElementHandle} scope - Element to search within.
 * @param {string} selector - CSS selector of the descendant.
 * @returns {Promise<string|null>} Trimmed text, or null when nothing matches.
 */
async function readOptionalText(scope, selector) {
    const handle = await scope.$(selector);
    if (!handle) {
        return null;
    }
    return handle.evaluate((element) => element.textContent.trim());
}

/**
 * Converts one review list item into a plain record.
 *
 * Mandatory fields are read with `$eval`, which rejects when the element is
 * missing; the room description and the liked/disliked texts are optional and
 * default to an empty string.
 *
 * @param {import('puppeteer').ElementHandle} listItem - `<li>` element of a single review.
 * @returns {Promise<object>} Record in the shape pushed to the dataset.
 */
async function extractReview(listItem) {
    const trimmedText = (element) => element.textContent.trim();

    const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
    const userName = await listItem.$eval('.bui-avatar-block__title', trimmedText);
    const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', trimmedText);
    const roomInfo = (await readOptionalText(listItem, '.c-review-block__room-link .bui-list__body')) ?? '';
    const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', trimmedText);
    const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', trimmedText);
    const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', trimmedText);
    const reviewedMatch = fullReviewDate.match(/Reviewed:\s+(.+)/);
    const reviewDate = reviewedMatch ? reviewedMatch[1] : null;
    const reviewTitle = await listItem.$eval('.c-review-block__title', trimmedText);
    const rating = await listItem.$eval('.bui-review-score__badge', trimmedText);

    let positiveComment = '';
    let negativeComment = '';
    for (const row of await listItem.$$('.c-review__row')) {
        // Each row carries a screen-reader label saying whether it is the liked or disliked part.
        const prefix = await row.$('.c-review__prefix');
        const label = prefix ? await readOptionalText(prefix, 'span.bui-u-sr-only') : null;
        if (label !== 'Liked' && label !== 'Disliked') {
            continue;
        }
        const body = await readOptionalText(row, '.c-review__body');
        if (body === null) {
            continue;
        }
        if (label === 'Liked') {
            positiveComment = body;
        } else {
            negativeComment = body;
        }
    }

    return {
        id,
        userName,
        userLocation,
        roomInfo,
        stayDate,
        stayLength,
        reviewDate,
        reviewTitle,
        rating,
        reviewTextParts: {
            Liked: positiveComment,
            Disliked: negativeComment,
        },
    };
}

/**
 * Extracts every review currently rendered on `page` and stores the results
 * in the default dataset. A review that cannot be parsed is logged and skipped
 * so it does not discard the rest of the page.
 *
 * @param {import('puppeteer').Page} page - Page whose review list is (being) rendered.
 * @returns {Promise<number>} Number of review list items found on the page.
 */
async function scrapeLoadedReviews(page) {
    await page.waitForSelector(SELECTORS.reviewBlock, { visible: true });
    const listItems = await page.$$(SELECTORS.reviewItems);

    const reviews = [];
    for (const listItem of listItems) {
        try {
            reviews.push(await extractReview(listItem));
        } catch (error) {
            console.error('Skipping review that could not be parsed:', error.message);
        }
    }

    if (reviews.length > 0) {
        await Actor.pushData(reviews);
    }
    return listItems.length;
}

/**
 * Loads one paginated review URL in a fresh tab and scrapes it. Failures are
 * logged and reported as null so the remaining pages are still attempted.
 *
 * @param {import('puppeteer').Browser} browser - Shared browser instance.
 * @param {string} link - URL of the review page.
 * @returns {Promise<number|null>} Number of list items found, or null on failure.
 */
async function scrapeReviews(browser, link) {
    const page = await openPage(browser);
    console.log('Processing...');
    try {
        await page.goto(link, { waitUntil: 'domcontentloaded' });
        return await scrapeLoadedReviews(page);
    } catch (error) {
        console.error('Error during page navigation:', error);
        return null;
    } finally {
        await page.close();
    }
}

/**
 * Scrapes all reviews of one listing: reads the review count, opens the review
 * panel, then either reads the panel directly (small listings) or walks the
 * paginated review list.
 *
 * @param {string} url - Listing URL taken from the actor input.
 * @returns {Promise<void>}
 * @throws {Error} When the listing cannot be loaded or the review panel cannot be opened.
 */
async function run(url) {
    const browser = await puppeteer.launch(LAUNCH_OPTIONS);
    try {
        const page = await openPage(browser);
        await page.goto(url, { waitUntil: 'domcontentloaded' });

        const countElement = await page.$(SELECTORS.reviewCount);
        if (!countElement) {
            // Without a count there is nothing to paginate over; stop instead of waiting for timeouts.
            console.error('Element not found');
            return;
        }
        const reviewCount = parseReviewCount(await countElement.evaluate((element) => element.textContent));
        if (reviewCount === null) {
            console.error('No numeric part found in the element');
            return;
        }
        console.log('Review count:', reviewCount);

        const reviewsButton = await page.$(SELECTORS.reviewsButton);
        if (!reviewsButton) {
            throw new Error('Reviews button not found');
        }
        await reviewsButton.click();
        console.log('Processing...');
        await page.waitForSelector(SELECTORS.reviewContainer);

        if (reviewCount <= SINGLE_PAGE_REVIEW_LIMIT) {
            await scrapeLoadedReviews(page);
            return;
        }

        await page.waitForSelector(SELECTORS.nextPageLink);
        const nextPageHref = await page.$eval(SELECTORS.nextPageLink, (element) => element.getAttribute('href'));
        if (!nextPageHref) {
            throw new Error('Next page link has no href');
        }

        // The last 10 characters are dropped as before; presumably they are the
        // link's own offset parameter. TODO confirm against a live href.
        const bookingUrl = `${BOOKING_ORIGIN}${nextPageHref}`.slice(0, -10);
        for (let offset = 0; offset < reviewCount; offset += OFFSET_STEP) {
            await scrapeReviews(browser, createNewLink(bookingUrl, offset));
        }
    } finally {
        // Always release Chromium, whichever path was taken above.
        await browser.close();
    }
}

(async () => {
    await Actor.init();

    try {
        const input = await Actor.getInput();
        const url = input?.startUrls;
        if (typeof url !== 'string' || url.trim() === '') {
            throw new Error('Input field "startUrls" must be a non-empty URL string.');
        }

        await run(url);
        console.log('Task completed!');
    } catch (error) {
        console.error('Error:', error.message);
        await Actor.fail(`Scraping failed: ${error.message}`);
        return;
    }

    await Actor.exit();
})();
main_folder/routes.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

/**
 * Crawlee router with two handlers: a default one that enqueues links and a
 * `detail` one that stores page titles.
 * NOTE(review): main_folder/main.js as listed imports only `puppeteer` and
 * `apify`, so this router appears to be unused by the actor — confirm before relying on it.
 */
export const router = createPuppeteerRouter();

// Default handler: enqueue every link matching the glob and label it `detail`.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info('enqueueing new URLs');
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

// `detail` handler: log the page title and push { url, title } to the default dataset.
router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{ "name": "crawlee-puppeteer-javascript", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "dependencies": { "apify": "^3.1.10", "crawlee": "^3.5.4", "puppeteer": "*" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0", "javascript-obfuscator": "^2.18.1" }, "scripts": { "start": "node main_folder/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", "obfuscate": "javascript-obfuscator main_folder/main.js --output main_folder/main-obfuscated.js" }, "author": "It's not you it's me", "license": "ISC"}