Booking Reviews Scraper avatar

Booking Reviews Scraper

Under maintenance
Go to Store
This Actor is under maintenance.

This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?

See alternative Actors
Booking Reviews Scraper

Booking Reviews Scraper

krapsits/booking-reviews-scraper

In the input tab, provide the Booking.com listing URL you'd like reviews for.

Developer
Maintained by Community

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:18
5
6# Remove shell access (NOTE: no instruction follows this comment, so this Dockerfile does not actually remove it)
7
8# Copy just package.json and package-lock.json
9# to speed up the build using Docker layer cache.
10COPY --chown=myuser package*.json ./
11
12# Install NPM packages, skip optional and development dependencies to
13# keep the image small. Avoid logging too much and print the dependency
14# tree for debugging
15RUN npm --quiet set progress=false \
16    && npm install --omit=dev --omit=optional \
17    && echo "Installed NPM packages:" \
18    && (npm list --omit=dev --all || true) \
19    && echo "Node.js version:" \
20    && node --version \
21    && echo "NPM version:" \
22    && npm --version \
23    && rm -r ~/.npm
24
25# Next, copy the remaining files and directories with the source code.
26# Since we do this after NPM install, quick build will be really fast
27# for most source file changes.
28COPY --chown=myuser . ./
29
30# Run the image. If you know you won't need headful browsers,
31# you can remove the XVFB start script for a micro perf gain.
32CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "Booking Reviews Scraper",
5    "description": "Scrapes guest reviews from a Booking.com listing URL using Puppeteer.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-puppeteer-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2  "title": "Booking Reviews Scraper input",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "startUrls": {
7      "title": "Start URL",
8      "type": "string",
9      "description": "Booking.com listing URL to scrape reviews from.",
10      "editor": "textfield",
11      "prefill": "https://apify.com"
12    }
13  }
14}

main_folder/main.js

1import puppeteer from 'puppeteer';
2import { Actor } from 'apify';
3
4(async () => {
5  await Actor.init();
6
7  const input = await Actor.getInput();
8  const url = input.startUrls;
9  
10
11  async function run() {
12    const browser = await puppeteer.launch({
13      args: ['--no-sandbox', '--disable-setuid-sandbox']
14    });
15
16    const page = await browser.newPage();
17
18
19
20    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
21    await page
22      .goto(url, {
23        waitUntil: "domcontentloaded",
24      })
25      .catch((err) => console.log("error loading url", err));
26
27    const element = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a/div/div/div/div[2]/div[2]');
28    let reviewCount;
29
30    // Get review count
31    if (element.length > 0) {
32      const textContent = await page.evaluate(el => el.textContent, element[0]);
33      const reviews = textContent.match(/\d+/g);
34      reviewCount = reviews ? parseInt(reviews.join(''), 10) : null;
35      if (reviews) {
36        console.log('Review count:', reviewCount);
37      } else {
38        console.error('No numeric part found in the element');
39      }
40    } else {
41      console.error('Element not found');
42    }
43
44    //Review button
45    const button = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a');
46
47  
48    if (button.length > 0) {
49      await button[0].click();
50      console.log('Processing...');
51    } else {
52      console.error('Element not found');
53    }
54
55    try {
56      const elementXPath = '//*[@id="review_list_page_container"]';
57      await page.waitForXPath(elementXPath);
58    
59      console.log('Processing...');
60    } catch (error) {
61      console.error('Error waiting for sidebar:', error.message);
62    }
63
64
65    if(reviewCount <=10){
66
67      await page.waitForSelector(".review_list_new_item_block", { visible: true });
68
69      const listItems = await page.$x('//ul[@class="review_list"]/li');
70
71      // Loop through each list item and extract data
72      for (const listItem of listItems) {
73
74        const id = await listItem.evaluate(element => element.getAttribute('data-review-url'));
75        const userName = await listItem.$eval('.bui-avatar-block__title', (element) => element.textContent.trim());
76
77        const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', (element) => element.textContent.trim());
78        let roomInfo = ''; 
79        try {
80          roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', (element) => element.textContent.trim());
81        } catch (error) {
82          console.error('Error finding roomInfo:', error.message);
83          roomInfo = ''; 
84        }
85        const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', (element) => element.textContent.trim());
86        const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', (element) => element.textContent.trim());
87        const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', (element) => element.textContent.trim());
88        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
89        const reviewDate = match ? match[1] : null;
90
91        const reviewTitle = await listItem.$eval('.c-review-block__title', (element) => element.textContent.trim());
92        const rating = await listItem.$eval('.bui-review-score__badge', (element) => element.textContent.trim());
93        const reviewRows = await listItem.$$('.c-review__row');
94        
95        let positiveComment = '';
96        let negativeComment = '';
97
98        for (const row of reviewRows) {
99          try {
100            const prefixSpan = await row.$('.c-review__prefix');
101            const prefixContent = await prefixSpan.$eval('span.bui-u-sr-only', (element) => element.textContent.trim());
102        
103            if (prefixContent === 'Liked') {
104              // Save positive comment text
105              positiveComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
106            } else if (prefixContent === 'Disliked') {
107              // Save negative comment text
108              negativeComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
109            }
110          } catch (error) {
111          }
112        }
113        
114        // Here all of the elements
115        const data = {
116          id,
117          userName,
118          userLocation,
119          roomInfo,
120          stayDate,
121          stayLength,
122          reviewDate,
123          reviewTitle,
124          rating,
125          reviewTextParts: {
126            Liked: positiveComment,
127            Disliked: negativeComment,
128          },
129        };
130
131       await Actor.pushData(data);
132        
133      }
134      // Append the extracted data to the JSON file
135      
136      console.log("Task completed!");
137      await Actor.exit();
138      
139    }else{
140
141      try {
142        // Find the next oage href by XPath
143        const elementXPath = '//*[@id="review_list_page_container"]/div[4]/div/div[1]/div/div[2]/div/div[2]/a';
144        await page.waitForXPath(elementXPath);
145        const [elementHandle] = await page.$x(elementXPath);
146        var offset = 0;
147
148        if (elementHandle) {
149          const nextPageHref = await page.evaluate(element => element.getAttribute('href'), elementHandle);
150          let bookingUrl = 'https://www.booking.com' + nextPageHref;
151          bookingUrl = bookingUrl.slice(0, -10);
152          
153          while (offset < reviewCount) {
154            
155            const newLink = createNewLink(bookingUrl, offset);
156            
157            await scrapeReviews(newLink);
158            offset += 25;
159            
160          }
161        
162        }
163
164      } catch (error) {
165        console.error('Error:', error.message);
166      } finally {
167        await browser.close();
168        console.log("Task completed!");
169        await Actor.exit();
170      } 
171
172      await browser.close();
173    }
174  }
175  run();
176
177  
178  function createNewLink(bookingUrl, offset) {
179
180  const newLink = bookingUrl + '&&offset=' + offset.toString();
181  
182
183  return newLink;
184  }
185
186  async function scrapeReviews(link) {
187    const browser = await puppeteer.launch({
188      args: ['--no-sandbox', '--disable-setuid-sandbox']
189    });
190    
191    const page = await browser.newPage();
192    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
193    console.log('Processing...');
194
195    try {
196      
197      await page.goto(link, {
198        waitUntil: "domcontentloaded",
199      });
200      
201      await page.waitForSelector(".review_list_new_item_block", { visible: true });
202
203      const listItems = await page.$x('//ul[@class="review_list"]/li');
204
205      // Loop through each list item and extract data
206
207      for (const listItem of listItems) {
208
209        const id = await listItem.evaluate(element => element.getAttribute('data-review-url'));
210        const userName = await listItem.$eval('.bui-avatar-block__title', (element) => element.textContent.trim());
211        const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', (element) => element.textContent.trim());
212
213        var roomInfo = '';
214        try {
215          roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', (element) => element.textContent.trim());
216        } catch (error) {
217          console.error('Error finding roomInfo:', error.message);
218          roomInfo = ' '; 
219        }
220        const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', (element) => element.textContent.trim());
221        const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', (element) => element.textContent.trim());
222        const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', (element) => element.textContent.trim());
223        const match = fullReviewDate.match(/Reviewed:\s+(.+)/);
224        const reviewDate = match ? match[1] : null;
225
226        const reviewTitle = await listItem.$eval('.c-review-block__title', (element) => element.textContent.trim());
227        const rating = await listItem.$eval('.bui-review-score__badge', (element) => element.textContent.trim());
228        const reviewRows = await listItem.$$('.c-review__row');
229        
230        let positiveComment = ' ';
231        let negativeComment = ' ';
232
233        for (const row of reviewRows) {
234          try {
235            const prefixSpan = await row.$('.c-review__prefix');
236            const prefixContent = await prefixSpan.$eval('span.bui-u-sr-only', (element) => element.textContent.trim());
237        
238            if (prefixContent === 'Liked') {
239              // Save positive comment text
240              positiveComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
241            } else if (prefixContent === 'Disliked') {
242              // Save negative comment text
243              negativeComment = await row.$eval('.c-review__body', (element) => element.textContent.trim());
244            }
245          } catch (error) {
246          }
247        }
248        
249        // Here all of the elements
250        const data = {
251          id,
252          userName,
253          userLocation,
254          roomInfo,
255          stayDate,
256          stayLength,
257          reviewDate,
258          reviewTitle,
259          rating,
260          reviewTextParts: {
261            Liked: positiveComment,
262            Disliked: negativeComment,
263          },
264        };
265        await Actor.pushData(data);
266      }
267          
268      return listItems.length;
269    } catch (error) {
270      console.error('Error during page navigation:', error);
271      return null;
272    } finally {
273      await browser.close();
274     
275
276    }
277  }
278  
279})();

main_folder/routes.js

1import { Dataset, createPuppeteerRouter } from 'crawlee';
2
// Router created with crawlee's createPuppeteerRouter(); the handlers below
// are registered on it.
export const router = createPuppeteerRouter();

// Default handler: logs a message, then enqueues links matching the
// apify.com glob under the 'detail' label.
router.addDefaultHandler(async (context) => {
    const { enqueueLinks, log } = context;

    log.info('enqueueing new URLs');

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
});
12
// Handler for requests labelled 'detail': logs the page title together with
// the loaded URL and stores both as one dataset record.
router.addHandler('detail', async (context) => {
    const { request, page, log } = context;

    const title = await page.title();
    const url = request.loadedUrl;

    log.info(`${title}`, { url });

    const record = { url, title };
    await Dataset.pushData(record);
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-puppeteer-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.1.10",
8        "crawlee": "^3.5.4",
9        "puppeteer": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.4.0",
13        "eslint": "^8.50.0",
14        "javascript-obfuscator": "^2.18.1"
15    },
16    "scripts": {
17        "start": "node main_folder/main.js",
18        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
19        "obfuscate": "javascript-obfuscator main_folder/main.js --output main_folder/main-obfuscated.js"
20    },
21    "author": "It's not you it's me",
22    "license": "ISC"
23}