Booking Reviews Scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Booking Reviews Scraper
krapsits/booking-reviews-scraper
In the input tab, provide the Booking.com listing URL you'd like reviews for.
.actor/Dockerfile
# Base image: Apify's actor-node-puppeteer-chrome image, tag 18.
# Other available images are listed at https://crawlee.dev/docs/guides/docker-images;
# any image from Docker Hub can be used instead.
FROM apify/actor-node-puppeteer-chrome:18

# NOTE(review): the original file had a "Remove shell access" comment at this
# point with no instruction under it — confirm whether a step is missing.

# Copy only package.json and package-lock.json first so the dependency layer
# below can be served from the Docker layer cache.
COPY package*.json ./

# Install NPM packages without development and optional dependencies to keep
# the image small, keep the output quiet, print the dependency tree and the
# Node.js / NPM versions for debugging, then remove the NPM cache directory.
RUN npm --quiet set progress=false && \
    npm install --omit=dev --omit=optional && \
    echo "Installed NPM packages:" && \
    (npm list --omit=dev --all || true) && \
    echo "Node.js version:" && \
    node --version && \
    echo "NPM version:" && \
    npm --version && \
    rm -r ~/.npm

# Copy the rest of the source tree. Doing this after the install step means a
# source-only change does not invalidate the dependency layer.
COPY . ./

# Start the actor. The XVFB start script is only needed for headful browsers
# and can be dropped for a small performance gain when running headless.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor",
4 "title": "Project Puppeteer Crawler JavaScript",
5 "description": "Crawlee and Puppeteer project in JavaScript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-puppeteer-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "PlaywrightCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URL",
8 "type": "string",
9 "description": "Booking.com listing URL to scrape reviews from.",
10 "editor": "textfield",
11 "prefill": "https://apify.com"
12 }
13 }
14}
main_folder/main.js
1import puppeteer from 'puppeteer';
2import { Actor } from 'apify';
3
4(async () => {
5 await Actor.init();
6
7 const input = await Actor.getInput();
8 const url = input.startUrls;
9
10
/**
 * Scrape the reviews of the Booking.com listing at `url` (read from the Actor
 * input by the enclosing scope) and push each review to the Actor dataset.
 *
 * Flow:
 *   1. Open the listing and read the review count from the gallery scorecard.
 *   2. Click the scorecard link to open the review list.
 *   3. If the count is at most 10 — or could not be read — extract the reviews
 *      already rendered in the open list.
 *   4. Otherwise derive the paginated review-list URL from the "next page"
 *      link and pass one URL per 25-review step to `scrapeReviews`.
 *
 * The browser is closed and `Actor.exit()` is called on every path, including
 * navigation or extraction failures (which are logged, not rethrown).
 *
 * CSS selectors are used instead of `page.$x` / `page.waitForXPath`: those
 * XPath helpers were removed in Puppeteer 22 and package.json does not pin a
 * Puppeteer version. Each selector mirrors the XPath it replaces step by step
 * (`div[2]` -> `div:nth-of-type(2)`).
 *
 * @returns {Promise<void>}
 */
async function run() {
    const SCORECARD_LINK = '#js--hp-gallery-scorecard > a';
    const REVIEW_COUNT = `${SCORECARD_LINK} > div > div > div > div:nth-of-type(2) > div:nth-of-type(2)`;
    const REVIEW_CONTAINER = '#review_list_page_container';
    const NEXT_PAGE_LINK = `${REVIEW_CONTAINER} > div:nth-of-type(4) > div > div:nth-of-type(1) > div`
        + ' > div:nth-of-type(2) > div > div:nth-of-type(2) > a';
    // Exact attribute match, like the XPath `@class="review_list"` it replaces.
    const REVIEW_ITEMS = 'ul[class="review_list"] > li';

    /**
     * Read one review `<li>` into a plain record.
     * Missing room info and missing liked/disliked texts default to ''.
     * Any other missing field makes `$eval` throw; the caller handles that.
     *
     * @param {import('puppeteer').ElementHandle} listItem
     * @returns {Promise<object>} the record pushed to the dataset
     */
    const extractReview = async (listItem) => {
        const textOf = (selector) => listItem.$eval(selector, (element) => element.textContent.trim());

        const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
        const userName = await textOf('.bui-avatar-block__title');
        const userLocation = await textOf('.bui-avatar-block__subtitle');

        let roomInfo = '';
        try {
            roomInfo = await textOf('.c-review-block__room-link .bui-list__body');
        } catch (error) {
            console.error('Error finding roomInfo:', error.message);
        }

        const stayDate = await textOf('.c-review-block__stay-date .bui-list__body .c-review-block__date');
        const stayLength = await textOf('.c-review-block__stay-date .bui-list__body');
        const fullReviewDate = await textOf('.c-review-block__right .c-review-block__date');
        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
        const reviewDate = match ? match[1] : null;

        const reviewTitle = await textOf('.c-review-block__title');
        const rating = await textOf('.bui-review-score__badge');
        const reviewRows = await listItem.$$('.c-review__row');

        let positiveComment = '';
        let negativeComment = '';

        for (const row of reviewRows) {
            try {
                // Rows without a screen-reader label or a body are expected; skip them quietly.
                const prefixSpan = await row.$('.c-review__prefix');
                if (!prefixSpan) continue;
                const labelHandle = await prefixSpan.$('span.bui-u-sr-only');
                if (!labelHandle) continue;
                const prefixContent = await labelHandle.evaluate((element) => element.textContent.trim());
                if (prefixContent !== 'Liked' && prefixContent !== 'Disliked') continue;

                const bodyHandle = await row.$('.c-review__body');
                if (!bodyHandle) continue;
                const bodyText = await bodyHandle.evaluate((element) => element.textContent.trim());

                if (prefixContent === 'Liked') {
                    positiveComment = bodyText;
                } else {
                    negativeComment = bodyText;
                }
            } catch (error) {
                // Best effort: one unreadable row must not drop the whole review.
                console.error('Error reading review row:', error.message);
            }
        }

        return {
            id,
            userName,
            userLocation,
            roomInfo,
            stayDate,
            stayLength,
            reviewDate,
            reviewTitle,
            rating,
            reviewTextParts: {
                Liked: positiveComment,
                Disliked: negativeComment,
            },
        };
    };

    const browser = await puppeteer.launch({
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');

        try {
            await page.goto(url, { waitUntil: 'domcontentloaded' });
        } catch (err) {
            // Nothing useful can be scraped from a page that failed to load.
            console.log('error loading url', err);
            return;
        }

        // Get review count. All digit groups are joined so "1,234" parses as 1234.
        let reviewCount = null;
        const countElement = await page.$(REVIEW_COUNT);
        if (countElement) {
            const textContent = await page.evaluate((el) => el.textContent, countElement);
            const digitGroups = textContent.match(/\d+/g);
            if (digitGroups) {
                reviewCount = parseInt(digitGroups.join(''), 10);
                console.log('Review count:', reviewCount);
            } else {
                console.error('No numeric part found in the element');
            }
        } else {
            console.error('Element not found');
        }

        // Review button: opens the review list.
        const reviewButton = await page.$(SCORECARD_LINK);
        if (reviewButton) {
            await reviewButton.click();
            console.log('Processing...');
        } else {
            console.error('Element not found');
        }

        try {
            await page.waitForSelector(REVIEW_CONTAINER);
            console.log('Processing...');
        } catch (error) {
            console.error('Error waiting for sidebar:', error.message);
        }

        if (reviewCount === null) {
            // Without a count the pagination loop has no bound, so fall back
            // to whatever the open review list already shows.
            console.error('Review count unavailable; scraping only the reviews already loaded');
        }

        if (reviewCount === null || reviewCount <= 10) {
            await page.waitForSelector('.review_list_new_item_block', { visible: true });
            const listItems = await page.$$(REVIEW_ITEMS);

            // Loop through each list item and extract data
            for (const listItem of listItems) {
                await Actor.pushData(await extractReview(listItem));
            }
            return;
        }

        // Find the next page href; it is the template for every paginated URL.
        const nextPageLink = await page.waitForSelector(NEXT_PAGE_LINK);
        if (!nextPageLink) {
            return;
        }

        const nextPageHref = await page.evaluate((element) => element.getAttribute('href'), nextPageLink);
        // NOTE(review): dropping the last 10 characters presumably strips the
        // link's own trailing offset parameter — confirm against a live href.
        const bookingUrl = `https://www.booking.com${nextPageHref}`.slice(0, -10);

        for (let offset = 0; offset < reviewCount; offset += 25) {
            await scrapeReviews(createNewLink(bookingUrl, offset));
        }
    } catch (error) {
        console.error('Error:', error.message);
    } finally {
        await browser.close();
        console.log('Task completed!');
        await Actor.exit();
    }
}
175 run();
176
177
/**
 * Build the URL of one paginated review-list page.
 *
 * @param {string} bookingUrl - review-list URL without its offset parameter
 * @param {number} offset - index of the first review on the requested page
 * @returns {string} `bookingUrl` followed by `&&offset=<offset>`
 */
function createNewLink(bookingUrl, offset) {
    const offsetParam = '&&offset=' + offset.toString();
    return bookingUrl + offsetParam;
}
185
/**
 * Load one paginated review-list page in its own browser instance, extract
 * every review item and push each to the Actor dataset.
 *
 * Missing room info and missing liked/disliked texts default to a single
 * space. Any other missing field aborts the page: the error is logged and
 * `null` is returned.
 *
 * The page is set up inside the `try` so the browser is closed even when
 * `newPage` or `setUserAgent` fails. The review list is located with a CSS
 * selector rather than `page.$x`, which was removed in Puppeteer 22.
 *
 * @param {string} link - URL of the review-list page to scrape
 * @returns {Promise<number|null>} number of review items found, or `null`
 *     when loading or extraction failed
 */
async function scrapeReviews(link) {
    const browser = await puppeteer.launch({
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
        console.log('Processing...');

        await page.goto(link, {
            waitUntil: 'domcontentloaded',
        });

        await page.waitForSelector('.review_list_new_item_block', { visible: true });

        // Exact attribute match, like the XPath `//ul[@class="review_list"]/li` it replaces.
        const listItems = await page.$$('ul[class="review_list"] > li');

        // Loop through each list item and extract data
        for (const listItem of listItems) {
            const textOf = (selector) => listItem.$eval(selector, (element) => element.textContent.trim());

            const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
            const userName = await textOf('.bui-avatar-block__title');
            const userLocation = await textOf('.bui-avatar-block__subtitle');

            let roomInfo = '';
            try {
                roomInfo = await textOf('.c-review-block__room-link .bui-list__body');
            } catch (error) {
                console.error('Error finding roomInfo:', error.message);
                roomInfo = ' ';
            }

            const stayDate = await textOf('.c-review-block__stay-date .bui-list__body .c-review-block__date');
            const stayLength = await textOf('.c-review-block__stay-date .bui-list__body');
            const fullReviewDate = await textOf('.c-review-block__right .c-review-block__date');
            const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
            const reviewDate = match ? match[1] : null;

            const reviewTitle = await textOf('.c-review-block__title');
            const rating = await textOf('.bui-review-score__badge');
            const reviewRows = await listItem.$$('.c-review__row');

            let positiveComment = ' ';
            let negativeComment = ' ';

            for (const row of reviewRows) {
                try {
                    // Rows without a screen-reader label or a body are expected; skip them quietly.
                    const prefixSpan = await row.$('.c-review__prefix');
                    if (!prefixSpan) continue;
                    const labelHandle = await prefixSpan.$('span.bui-u-sr-only');
                    if (!labelHandle) continue;
                    const prefixContent = await labelHandle.evaluate((element) => element.textContent.trim());
                    if (prefixContent !== 'Liked' && prefixContent !== 'Disliked') continue;

                    const bodyHandle = await row.$('.c-review__body');
                    if (!bodyHandle) continue;
                    const bodyText = await bodyHandle.evaluate((element) => element.textContent.trim());

                    if (prefixContent === 'Liked') {
                        positiveComment = bodyText;
                    } else {
                        negativeComment = bodyText;
                    }
                } catch (error) {
                    // Best effort: one unreadable row must not drop the whole review.
                    console.error('Error reading review row:', error.message);
                }
            }

            // Here all of the elements
            await Actor.pushData({
                id,
                userName,
                userLocation,
                roomInfo,
                stayDate,
                stayLength,
                reviewDate,
                reviewTitle,
                rating,
                reviewTextParts: {
                    Liked: positiveComment,
                    Disliked: negativeComment,
                },
            });
        }

        return listItems.length;
    } catch (error) {
        console.error('Error during page navigation:', error);
        return null;
    } finally {
        await browser.close();
    }
}
278
279})();
main_folder/routes.js
1import { Dataset, createPuppeteerRouter } from 'crawlee';
2
3export const router = createPuppeteerRouter();
4
// Default route: enqueue the page's links that match https://apify.com/*
// and label them 'detail' so the handler registered for that label picks them up.
router.addDefaultHandler(async (context) => {
    const { enqueueLinks, log } = context;
    log.info(`enqueueing new URLs`);

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
});
12
// 'detail' route: log the page title and store it with the loaded URL in the dataset.
router.addHandler('detail', async (context) => {
    const { request, page, log } = context;
    const title = await page.title();
    const url = request.loadedUrl;

    log.info(`${title}`, { url });

    const record = { url, title };
    await Dataset.pushData(record);
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-puppeteer-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of an Apify actor.",
6 "dependencies": {
7 "apify": "^3.1.10",
8 "crawlee": "^3.5.4",
9 "puppeteer": "*"
10 },
11 "devDependencies": {
12 "@apify/eslint-config": "^0.4.0",
13 "eslint": "^8.50.0",
14 "javascript-obfuscator": "^2.18.1"
15 },
16 "scripts": {
17 "start": "node main_folder/main.js",
18 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
19 "obfuscate": "javascript-obfuscator main_folder/main.js --output main_folder/main-obfuscated.js"
20 },
21 "author": "It's not you it's me",
22 "license": "ISC"
23}
Developer
Maintained by Community
Categories