Booking Reviews Scraper avatar
Booking Reviews Scraper
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Booking Reviews Scraper

Booking Reviews Scraper

krapsits/booking-reviews-scraper

In the input tab, provide the Booking.com listing URL you'd like reviews for.

.actor/Dockerfile

# Base image for the Actor. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# Any other image from Docker Hub can be used instead.
FROM apify/actor-node-puppeteer-chrome:18

# NOTE(review): this spot only held the comment "Remove shell access" with no
# instruction after it; presumably the step was removed. Confirm whether one is still needed.

# Copy just package.json and package-lock.json first (owned by myuser)
# so the dependency layer below can be reused from the Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Progress output is disabled to avoid noisy logs; the
# dependency tree and the Node.js/NPM versions are printed for debugging
# (a failing `npm list` is tolerated), and the NPM cache is removed afterwards.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since this happens after the NPM install, rebuilds are fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "Booking Reviews Scraper",
5    "description": "Scrapes guest reviews from a Booking.com listing using Puppeteer.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-puppeteer-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2  "title": "Booking Reviews Scraper input",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "startUrls": {
7      "title": "Start URL",
8      "type": "string",
9      "description": "Booking.com listing URL whose reviews should be scraped.",
10      "editor": "textfield",
11      "prefill": "https://apify.com"
12    }
13  }
14}

main_folder/main.js

1import puppeteer from 'puppeteer';
2import { Actor } from 'apify';
3
4(async () => {
5  await Actor.init();
6
7  const input = await Actor.getInput();
8  const url = input.startUrls;
9  
10
11  async function run() {
12    const browser = await puppeteer.launch({
13      args: ['--no-sandbox', '--disable-setuid-sandbox']
14    });
15
16    const page = await browser.newPage();
17
18
19
20    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
21    await page
22      .goto(url, {
23        waitUntil: "domcontentloaded",
24      })
25      .catch((err) => console.log("error loading url", err));
26
27    const element = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a/div/div/div/div[2]/div[2]');
28    let reviewCount;
29
30    // Get review count
31    if (element.length > 0) {
32      const textContent = await page.evaluate(el => el.textContent, element[0]);
33      const reviews = textContent.match(/\d+/g);
34      reviewCount = reviews ? parseInt(reviews.join(''), 10) : null;
35      if (reviews) {
36        console.log('Review count:', reviewCount);
37      } else {
38        console.error('No numeric part found in the element');
39      }
40    } else {
41      console.error('Element not found');
42    }
43
44    //Review button
45    const button = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a');
46
47  
48    if (button.length > 0) {
49      await button[0].click();
50      console.log('Processing...');
51    } else {
52      console.error('Element not found');
53    }
54
55    try {
56      const elementXPath = '//*[@id="review_list_page_container"]';
57      await page.waitForXPath(elementXPath);
58    
59      console.log('Processing...');
60    } catch (error) {
61      console.error('Error waiting for sidebar:', error.message);
62    }
63
64
65    if(reviewCount <=10){
66
67      await page.waitForSelector(".review_list_new_item_block", { visible: true });
68
69      const listItems = await page.$x('//ul[@class="review_list"]/li');
70
71      // Loop through each list item and extract data
72      for (const listItem of listItems) {
73
74        const id = await listItem.evaluate(element => element.getAttribute('data-review-url'));
75        const userName = await listItem.$eval('.bui-avatar-block__title', (element) => element.textContent.trim());
76
77        const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', (element) => element.textContent.trim());
78        let roomInfo = ''; 
79        try {
80          roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', (element) => element.textContent.trim());
81        } catch (error) {
82          console.error('Error finding roomInfo:', error.message);
83          roomInfo = ''; 
84        }
85        const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', (element) => element.textContent.trim());
86        const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', (element) => element.textContent.trim());
87        const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', (element) => element.textContent.trim());
88        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
89        const reviewDate = match ? match[1] : null;
90
91        const reviewTitle = await listItem.$eval('.c-review-block__title', (element) => element.textContent.trim());
92        const rating = await listItem.$eval('.bui-review-score__badge', (element) => element.textContent.trim());
93        const reviewRows = await listItem.$$('.c-review__row');
94        
95        let positiveComment = '';
96        let negativeComment = '';
97
98        for (const row of reviewRows) {
99          try {
100            const prefixSpan = await row.$('.c-review__prefix');
101            const prefixContent = await prefixSpan.$eval('span.bui-u-sr-only', (element) => element.textContent.trim());
102        
103            if (prefixContent === 'Liked') {
104              // Save positive comment text
105              positiveComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
106            } else if (prefixContent === 'Disliked') {
107              // Save negative comment text
108              negativeComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
109            }
110          } catch (error) {
111          }
112        }
113        
114        // Here all of the elements
115        const data = {
116          id,
117          userName,
118          userLocation,
119          roomInfo,
120          stayDate,
121          stayLength,
122          reviewDate,
123          reviewTitle,
124          rating,
125          reviewTextParts: {
126            Liked: positiveComment,
127            Disliked: negativeComment,
128          },
129        };
130
131       await Actor.pushData(data);
132        
133      }
134      // Append the extracted data to the JSON file
135      
136      console.log("Task completed!");
137      await Actor.exit();
138      
139    }else{
140
141      try {
142        // Find the next oage href by XPath
143        const elementXPath = '//*[@id="review_list_page_container"]/div[4]/div/div[1]/div/div[2]/div/div[2]/a';
144        await page.waitForXPath(elementXPath);
145        const [elementHandle] = await page.$x(elementXPath);
146        var offset = 0;
147
148        if (elementHandle) {
149          const nextPageHref = await page.evaluate(element => element.getAttribute('href'), elementHandle);
150          let bookingUrl = 'https://www.booking.com' + nextPageHref;
151          bookingUrl = bookingUrl.slice(0, -10);
152          
153          while (offset < reviewCount) {
154            
155            const newLink = createNewLink(bookingUrl, offset);
156            
157            await scrapeReviews(newLink);
158            offset += 25;
159            
160          }
161        
162        }
163
164      } catch (error) {
165        console.error('Error:', error.message);
166      } finally {
167        await browser.close();
168        console.log("Task completed!");
169        await Actor.exit();
170      } 
171
172      await browser.close();
173    }
174  }
175  run();
176
177  
178  function createNewLink(bookingUrl, offset) {
179
180  const newLink = bookingUrl + '&&offset=' + offset.toString();
181  
182
183  return newLink;
184  }
185
186  async function scrapeReviews(link) {
187    const browser = await puppeteer.launch({
188      args: ['--no-sandbox', '--disable-setuid-sandbox']
189    });
190    
191    const page = await browser.newPage();
192    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
193    console.log('Processing...');
194
195    try {
196      
197      await page.goto(link, {
198        waitUntil: "domcontentloaded",
199      });
200      
201      await page.waitForSelector(".review_list_new_item_block", { visible: true });
202
203      const listItems = await page.$x('//ul[@class="review_list"]/li');
204
205      // Loop through each list item and extract data
206
207      for (const listItem of listItems) {
208
209        const id = await listItem.evaluate(element => element.getAttribute('data-review-url'));
210        const userName = await listItem.$eval('.bui-avatar-block__title', (element) => element.textContent.trim());
211        const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', (element) => element.textContent.trim());
212
213        var roomInfo = '';
214        try {
215          roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', (element) => element.textContent.trim());
216        } catch (error) {
217          console.error('Error finding roomInfo:', error.message);
218          roomInfo = ' '; 
219        }
220        const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', (element) => element.textContent.trim());
221        const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', (element) => element.textContent.trim());
222        const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', (element) => element.textContent.trim());
223        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
224        const reviewDate = match ? match[1] : null;
225
226        const reviewTitle = await listItem.$eval('.c-review-block__title', (element) => element.textContent.trim());
227        const rating = await listItem.$eval('.bui-review-score__badge', (element) => element.textContent.trim());
228        const reviewRows = await listItem.$$('.c-review__row');
229        
230        let positiveComment = ' ';
231        let negativeComment = ' ';
232
233        for (const row of reviewRows) {
234          try {
235            const prefixSpan = await row.$('.c-review__prefix');
236            const prefixContent = await prefixSpan.$eval('span.bui-u-sr-only', (element) => element.textContent.trim());
237        
238            if (prefixContent === 'Liked') {
239              // Save positive comment text
240              positiveComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
241            } else if (prefixContent === 'Disliked') {
242              // Save negative comment text
243              negativeComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
244            }
245          } catch (error) {
246          }
247        }
248        
249        // Here all of the elements
250        const data = {
251          id,
252          userName,
253          userLocation,
254          roomInfo,
255          stayDate,
256          stayLength,
257          reviewDate,
258          reviewTitle,
259          rating,
260          reviewTextParts: {
261            Liked: positiveComment,
262            Disliked: negativeComment,
263          },
264        };
265        await Actor.pushData(data);
266      }
267          
268      return listItems.length;
269    } catch (error) {
270      console.error('Error during page navigation:', error);
271      return null;
272    } finally {
273      await browser.close();
274     
275
276    }
277  }
278  
279})();

main_folder/routes.js

1import { Dataset, createPuppeteerRouter } from 'crawlee';
2
3export const router = createPuppeteerRouter();
4
// Default route: logs a message and enqueues links matching the apify.com glob
// under the 'detail' label.
router.addDefaultHandler(async (context) => {
    const { enqueueLinks, log } = context;

    log.info(`enqueueing new URLs`);

    const enqueueOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(enqueueOptions);
});
12
// 'detail' route: logs the page title together with the loaded URL and stores
// both in the dataset.
router.addHandler('detail', async (context) => {
    const { request, page, log } = context;

    const title = await page.title();
    const logData = { url: request.loadedUrl };
    log.info(`${title}`, logData);

    const record = {
        url: request.loadedUrl,
        title,
    };
    await Dataset.pushData(record);
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-puppeteer-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.1.10",
8        "crawlee": "^3.5.4",
9        "puppeteer": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.4.0",
13        "eslint": "^8.50.0",
14        "javascript-obfuscator": "^2.18.1"
15    },
16    "scripts": {
17        "start": "node main_folder/main.js",
18        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
19        "obfuscate": "javascript-obfuscator main_folder/main.js --output main_folder/main-obfuscated.js"
20    },
21    "author": "It's not you it's me",
22    "license": "ISC"
23}
Developer
Maintained by Community