
Booking Reviews Scraper
Deprecated
Pricing
Pay per usage
Go to Store

Booking Reviews Scraper
Deprecated
In the input tab, provide the Booking.com listing URL you'd like reviews for.
0.0 (0)
Pricing
Pay per usage
1
Total users
4
Monthly users
1
Last modified
a year ago
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:18
5
6# Remove shell access (NOTE: no instruction in this Dockerfile implements this step)
7
8# Copy just package.json and package-lock.json
9# to speed up the build using Docker layer cache.
10COPY package*.json ./
11
12# Install NPM packages, skip optional and development dependencies to
13# keep the image small. Avoid logging too much and print the dependency
14# tree for debugging
15RUN npm --quiet set progress=false \
16 && npm install --omit=dev --omit=optional \
17 && echo "Installed NPM packages:" \
18 && (npm list --omit=dev --all || true) \
19 && echo "Node.js version:" \
20 && node --version \
21 && echo "NPM version:" \
22 && npm --version \
23 && rm -r ~/.npm
24
25# Next, copy the remaining files and directories with the source code.
26# Since we do this after NPM install, quick build will be really fast
27# for most source file changes.
28COPY . ./
29
30# Run the image. If you know you won't need headful browsers,
31# you can remove the XVFB start script for a micro perf gain.
32CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor",
4 "title": "Project Puppeteer Crawler JavaScript",
5 "description": "Crawlee and Puppeteer project in JavaScript.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-puppeteer-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "PlaywrightCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URL",
8 "type": "string",
9 "description": "URL of the Booking.com listing whose reviews should be scraped.",
10 "editor": "textfield",
11 "prefill": "https://apify.com"
12 }
13 }
14}
main_folder/main.js
1import puppeteer from 'puppeteer';
2import { Actor } from 'apify';
3
4(async () => {
5 await Actor.init();
6
7 const input = await Actor.getInput();
8 const url = input.startUrls;
9
10
/**
 * Scrapes the reviews of the Booking.com listing at `url` (read from the
 * enclosing scope) and pushes one record per review to the Actor dataset.
 *
 * Flow:
 *  1. Open the listing page and read the review count from the score card.
 *  2. Click the score card link and wait for the review list container.
 *  3. If the count is <= 10, read the reviews from the page that is already
 *     open; otherwise derive the review-list URL from the "next page" link
 *     and hand each page to `scrapeReviews`.
 *
 * The browser is always closed before the function finishes, including when
 * an error escapes. Errors in the paginated branch are logged and do not
 * prevent `Actor.exit()`; errors elsewhere propagate to the caller.
 *
 * @returns {Promise<void>}
 */
async function run() {
    const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
    const SCORECARD_LINK_XPATH = '//*[@id="js--hp-gallery-scorecard"]/a';
    const REVIEW_COUNT_XPATH = '//*[@id="js--hp-gallery-scorecard"]/a/div/div/div/div[2]/div[2]';
    const REVIEW_CONTAINER_XPATH = '//*[@id="review_list_page_container"]';
    const NEXT_PAGE_XPATH = '//*[@id="review_list_page_container"]/div[4]/div/div[1]/div/div[2]/div/div[2]/a';

    /** Returns the trimmed text of the first `selector` match inside `handle`; throws when there is no match. */
    const readText = (handle, selector) => handle.$eval(selector, (element) => element.textContent.trim());

    /** Reads the "Liked" / "Disliked" texts from the rows of one review. */
    async function readComments(listItem) {
        let positiveComment = '';
        let negativeComment = '';

        const reviewRows = await listItem.$$('.c-review__row');
        for (const row of reviewRows) {
            try {
                // Rows without a prefix label or a body are expected; skip them quietly.
                const prefixSpan = await row.$('.c-review__prefix');
                if (!prefixSpan) continue;
                const labelSpan = await prefixSpan.$('span.bui-u-sr-only');
                if (!labelSpan) continue;

                const prefixContent = await labelSpan.evaluate((element) => element.textContent.trim());
                if (prefixContent !== 'Liked' && prefixContent !== 'Disliked') continue;

                const body = await row.$('.c-review__body');
                if (!body) continue;
                const text = await body.evaluate((element) => element.textContent.trim());

                if (prefixContent === 'Liked') {
                    positiveComment = text;
                } else {
                    negativeComment = text;
                }
            } catch (error) {
                // Best effort: one unreadable row must not abort the whole review.
                console.error('Error reading review row:', error.message);
            }
        }

        return { Liked: positiveComment, Disliked: negativeComment };
    }

    /** Builds the dataset record for one review `<li>` element. */
    async function readReview(listItem) {
        const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
        const userName = await readText(listItem, '.bui-avatar-block__title');
        const userLocation = await readText(listItem, '.bui-avatar-block__subtitle');

        // The room link is optional; a missing one is logged and left empty.
        let roomInfo = '';
        try {
            roomInfo = await readText(listItem, '.c-review-block__room-link .bui-list__body');
        } catch (error) {
            console.error('Error finding roomInfo:', error.message);
        }

        const stayDate = await readText(listItem, '.c-review-block__stay-date .bui-list__body .c-review-block__date');
        const stayLength = await readText(listItem, '.c-review-block__stay-date .bui-list__body');
        const fullReviewDate = await readText(listItem, '.c-review-block__right .c-review-block__date');
        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
        const reviewDate = match ? match[1] : null;

        const reviewTitle = await readText(listItem, '.c-review-block__title');
        const rating = await readText(listItem, '.bui-review-score__badge');
        const reviewTextParts = await readComments(listItem);

        return {
            id,
            userName,
            userLocation,
            roomInfo,
            stayDate,
            stayLength,
            reviewDate,
            reviewTitle,
            rating,
            reviewTextParts,
        };
    }

    const browser = await puppeteer.launch({
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent(USER_AGENT);

        // A failed navigation is only logged; the lookups below then report
        // the missing elements themselves.
        try {
            await page.goto(url, { waitUntil: 'domcontentloaded' });
        } catch (err) {
            console.log("error loading url", err);
        }

        // Get review count
        let reviewCount;
        const countElements = await page.$x(REVIEW_COUNT_XPATH);
        if (countElements.length > 0) {
            const textContent = await page.evaluate((el) => el.textContent, countElements[0]);
            // Join all digit groups so thousands separators do not split the number.
            const digitGroups = textContent.match(/\d+/g);
            reviewCount = digitGroups ? parseInt(digitGroups.join(''), 10) : null;
            if (digitGroups) {
                console.log('Review count:', reviewCount);
            } else {
                console.error('No numeric part found in the element');
            }
        } else {
            console.error('Element not found');
        }

        // Review button
        const buttons = await page.$x(SCORECARD_LINK_XPATH);
        if (buttons.length > 0) {
            await buttons[0].click();
            console.log('Processing...');
        } else {
            console.error('Element not found');
        }

        try {
            await page.waitForXPath(REVIEW_CONTAINER_XPATH);
            console.log('Processing...');
        } catch (error) {
            console.error('Error waiting for sidebar:', error.message);
        }

        // `null` (no digits found) compares as <= 10 and takes the single-page
        // branch; `undefined` (count element missing) does not, so that case
        // falls into the paginated branch, whose loop then never runs.
        if (reviewCount <= 10) {
            await page.waitForSelector('.review_list_new_item_block', { visible: true });
            const listItems = await page.$x('//ul[@class="review_list"]/li');

            for (const listItem of listItems) {
                // Push each record as soon as it is read so earlier reviews
                // are kept even if a later one fails.
                await Actor.pushData(await readReview(listItem));
            }
        } else {
            try {
                // Find the next page href by XPath
                await page.waitForXPath(NEXT_PAGE_XPATH);
                const [nextPageLink] = await page.$x(NEXT_PAGE_XPATH);

                if (nextPageLink) {
                    const nextPageHref = await page.evaluate((element) => element.getAttribute('href'), nextPageLink);
                    // Drop the last 10 characters of the link — presumably its
                    // trailing offset parameter; TODO confirm against a live href.
                    const bookingUrl = ('https://www.booking.com' + nextPageHref).slice(0, -10);

                    // NOTE(review): the step of 25 assumes each page returns 25
                    // reviews — verify against the review list page size.
                    for (let offset = 0; offset < reviewCount; offset += 25) {
                        await scrapeReviews(createNewLink(bookingUrl, offset));
                    }
                }
            } catch (error) {
                console.error('Error:', error.message);
            }
        }
    } finally {
        await browser.close();
    }

    console.log("Task completed!");
    await Actor.exit();
}
175 run();
176
177
/**
 * Builds the URL of one review-list page by appending an `offset` query
 * parameter to `bookingUrl`.
 *
 * `bookingUrl` is expected to already contain a query string, so the
 * parameter is joined with a single `&`.
 *
 * @param {string} bookingUrl - Review-list URL without an offset parameter.
 * @param {number} offset - Index of the first review on the requested page.
 * @returns {string} `bookingUrl` followed by `&offset=<offset>`.
 */
function createNewLink(bookingUrl, offset) {
    return `${bookingUrl}&offset=${String(offset)}`;
}
185
/**
 * Opens `link` in a fresh browser, reads every review in the
 * `ul.review_list` list and pushes one record per review to the Actor
 * dataset.
 *
 * @param {string} link - URL of a review-list page.
 * @returns {Promise<number|null>} Number of review list items found on the
 *   page, or `null` when anything after the browser launch failed (the
 *   error is logged, not rethrown). Records pushed before a failure stay
 *   in the dataset.
 */
async function scrapeReviews(link) {
    const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

    /** Returns the trimmed text of the first `selector` match inside `handle`; throws when there is no match. */
    const readText = (handle, selector) => handle.$eval(selector, (element) => element.textContent.trim());

    /** Builds the dataset record for one review `<li>` element. */
    async function readReview(listItem) {
        const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
        const userName = await readText(listItem, '.bui-avatar-block__title');
        const userLocation = await readText(listItem, '.bui-avatar-block__subtitle');

        // The room link is optional; when it is missing the field is a
        // single space, which is the placeholder this function emits.
        let roomInfo = '';
        try {
            roomInfo = await readText(listItem, '.c-review-block__room-link .bui-list__body');
        } catch (error) {
            console.error('Error finding roomInfo:', error.message);
            roomInfo = ' ';
        }

        const stayDate = await readText(listItem, '.c-review-block__stay-date .bui-list__body .c-review-block__date');
        const stayLength = await readText(listItem, '.c-review-block__stay-date .bui-list__body');
        const fullReviewDate = await readText(listItem, '.c-review-block__right .c-review-block__date');
        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
        const reviewDate = match ? match[1] : null;

        const reviewTitle = await readText(listItem, '.c-review-block__title');
        const rating = await readText(listItem, '.bui-review-score__badge');

        // Comments default to a single space when no matching row exists.
        let positiveComment = ' ';
        let negativeComment = ' ';

        const reviewRows = await listItem.$$('.c-review__row');
        for (const row of reviewRows) {
            try {
                // Rows without a prefix label or a body are expected; skip them quietly.
                const prefixSpan = await row.$('.c-review__prefix');
                if (!prefixSpan) continue;
                const labelSpan = await prefixSpan.$('span.bui-u-sr-only');
                if (!labelSpan) continue;

                const prefixContent = await labelSpan.evaluate((element) => element.textContent.trim());
                if (prefixContent !== 'Liked' && prefixContent !== 'Disliked') continue;

                const body = await row.$('.c-review__body');
                if (!body) continue;
                const text = await body.evaluate((element) => element.textContent.trim());

                if (prefixContent === 'Liked') {
                    positiveComment = text;
                } else {
                    negativeComment = text;
                }
            } catch (error) {
                // Best effort: one unreadable row must not abort the whole review.
                console.error('Error reading review row:', error.message);
            }
        }

        return {
            id,
            userName,
            userLocation,
            roomInfo,
            stayDate,
            stayLength,
            reviewDate,
            reviewTitle,
            rating,
            reviewTextParts: {
                Liked: positiveComment,
                Disliked: negativeComment,
            },
        };
    }

    const browser = await puppeteer.launch({
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent(USER_AGENT);
        console.log('Processing...');

        await page.goto(link, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('.review_list_new_item_block', { visible: true });

        const listItems = await page.$x('//ul[@class="review_list"]/li');

        for (const listItem of listItems) {
            // Push each record as soon as it is read so earlier reviews are
            // kept even if a later one fails.
            await Actor.pushData(await readReview(listItem));
        }

        return listItems.length;
    } catch (error) {
        console.error('Error during page navigation:', error);
        return null;
    } finally {
        // Runs on both the success and the failure path.
        await browser.close();
    }
}
278
279})();
main_folder/routes.js
1import { Dataset, createPuppeteerRouter } from 'crawlee';
2
3export const router = createPuppeteerRouter();
4
// Default route: logs a message, then enqueues every link matching the
// apify.com glob under the 'detail' label.
router.addDefaultHandler(async (context) => {
    const { enqueueLinks, log } = context;

    log.info(`enqueueing new URLs`);

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
});
12
// 'detail' route: logs the page title with the loaded URL, then stores
// both in the dataset.
router.addHandler('detail', async (context) => {
    const { request, page, log } = context;

    const title = await page.title();

    const logMeta = { url: request.loadedUrl };
    log.info(`${title}`, logMeta);

    const record = {
        url: request.loadedUrl,
        title,
    };
    await Dataset.pushData(record);
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-puppeteer-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of an Apify actor.",
6 "dependencies": {
7 "apify": "^3.1.10",
8 "crawlee": "^3.5.4",
9 "puppeteer": "*"
10 },
11 "devDependencies": {
12 "@apify/eslint-config": "^0.4.0",
13 "eslint": "^8.50.0",
14 "javascript-obfuscator": "^2.18.1"
15 },
16 "scripts": {
17 "start": "node main_folder/main.js",
18 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
19 "obfuscate": "javascript-obfuscator main_folder/main.js --output main_folder/main-obfuscated.js"
20 },
21 "author": "It's not you it's me",
22 "license": "ISC"
23}