1import puppeteer from 'puppeteer';
2import { Actor } from 'apify';
3
4(async () => {
// Initialise the Apify SDK before any other Actor call.
await Actor.init();

// `Actor.getInput()` resolves to null when the run was started without input,
// so every access below is guarded instead of failing with a bare TypeError.
const input = await Actor.getInput();
const startUrls = input?.startUrls;

// `startUrls` may be a plain URL string, or the list form commonly produced
// by Apify input editors (an array whose entries are strings or `{ url }`
// objects). Only the first entry is used because a single page is scraped.
// NOTE(review): confirm the list form against this Actor's input schema.
const firstStartUrl = Array.isArray(startUrls) ? startUrls[0] : startUrls;
const url = typeof firstStartUrl === 'string' ? firstStartUrl : firstStartUrl?.url;

if (typeof url !== 'string' || url.length === 0) {
  // Fail fast: without a URL the navigation below can only log an error and
  // then wait for elements that will never appear.
  throw new Error('Input field "startUrls" must contain the URL of the page to scrape.');
}
9
10
/**
 * Scrapes the reviews of the page at `url` into the Actor dataset.
 *
 * The page is opened, the review count is read from the gallery scorecard,
 * the scorecard link is clicked and the review list container is awaited.
 * With a count of ten or fewer the list items already on the page are pushed
 * directly; otherwise the list is fetched in steps of 25 through
 * `scrapeReviews`.
 *
 * The launched browser is closed on every path, including failures. On
 * success (and after a logged pagination error) "Task completed!" is printed
 * and the Actor is exited. An error on the direct path propagates to the
 * caller without exiting the Actor.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');

    try {
      await page.goto(url, { waitUntil: 'domcontentloaded' });
    } catch (err) {
      // Navigation problems are only reported; the lookups below then log
      // which elements could not be found.
      console.log('error loading url', err);
    }

    const reviewCount = await readReviewCount(page);
    await openReviewList(page);

    if (reviewCount <= 10) {
      await pushRenderedReviews(page);
    } else {
      try {
        await scrapeReviewPages(page, reviewCount);
      } catch (error) {
        // Pagination is best effort: the failure is reported and the run
        // still ends normally.
        console.error('Error:', error.message);
      }
    }
  } finally {
    // Must happen before `Actor.exit()`, which ends the process by default.
    await browser.close();
  }

  console.log('Task completed!');
  await Actor.exit();
}

/**
 * Reads the number of reviews shown in the gallery scorecard.
 *
 * All digit groups of the element text are concatenated, so "1,234" is read
 * as 1234.
 *
 * @param {object} page - Puppeteer page that has the property page loaded.
 * @returns {Promise<number|null|undefined>} The count, `null` when the
 *   element contains no digits, `undefined` when the element is missing.
 */
async function readReviewCount(page) {
  const [scoreLabel] = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a/div/div/div/div[2]/div[2]');
  if (!scoreLabel) {
    console.error('Element not found');
    // NOTE(review): `undefined <= 10` is false, so the caller takes the
    // paginated path, whose `offset < undefined` loop never runs - nothing is
    // scraped. Kept as is; confirm whether that is the intended fallback.
    return undefined;
  }

  const labelText = await page.evaluate((el) => el.textContent, scoreLabel);
  const digitGroups = labelText.match(/\d+/g);
  if (!digitGroups) {
    console.error('No numeric part found in the element');
    // `null <= 10` is true, so the caller scrapes the items on the page.
    return null;
  }

  const reviewCount = Number.parseInt(digitGroups.join(''), 10);
  console.log('Review count:', reviewCount);
  return reviewCount;
}

/**
 * Clicks the gallery scorecard link and waits for the review list container.
 * A missing link or a wait failure is logged, never thrown.
 *
 * @param {object} page - Puppeteer page that has the property page loaded.
 * @returns {Promise<void>}
 */
async function openReviewList(page) {
  const [scorecardLink] = await page.$x('//*[@id="js--hp-gallery-scorecard"]/a');
  if (scorecardLink) {
    await scorecardLink.click();
    console.log('Processing...');
  } else {
    console.error('Element not found');
  }

  try {
    await page.waitForXPath('//*[@id="review_list_page_container"]');
    console.log('Processing...');
  } catch (error) {
    console.error('Error waiting for sidebar:', error.message);
  }
}

/**
 * Pushes every review list item currently on the page to the dataset.
 *
 * @param {object} page - Puppeteer page showing the review list.
 * @returns {Promise<void>}
 */
async function pushRenderedReviews(page) {
  await page.waitForSelector('.review_list_new_item_block', { visible: true });

  const listItems = await page.$x('//ul[@class="review_list"]/li');
  for (const listItem of listItems) {
    await Actor.pushData(await readRenderedReview(listItem));
  }
}

/**
 * Fetches the review list in steps of 25 through `scrapeReviews`.
 *
 * The base URL is the href of the pagination link with its last ten
 * characters removed; `createNewLink` then appends the offset.
 *
 * @param {object} page - Puppeteer page showing the review list.
 * @param {number} reviewCount - Upper bound (exclusive) for the offsets.
 * @returns {Promise<void>}
 */
async function scrapeReviewPages(page, reviewCount) {
  const nextPageXPath = '//*[@id="review_list_page_container"]/div[4]/div/div[1]/div/div[2]/div/div[2]/a';
  await page.waitForXPath(nextPageXPath);

  const [nextPageLink] = await page.$x(nextPageXPath);
  if (!nextPageLink) {
    return;
  }

  const nextPageHref = await page.evaluate((element) => element.getAttribute('href'), nextPageLink);
  // NOTE(review): the ten trimmed characters are presumably the trailing
  // offset part of the href - confirm against a live pagination link.
  const bookingUrl = `https://www.booking.com${nextPageHref}`.slice(0, -10);

  for (let offset = 0; offset < reviewCount; offset += 25) {
    await scrapeReviews(createNewLink(bookingUrl, offset));
  }
}

/**
 * Extracts one review record from a review list item.
 *
 * Every field except the room info is required: a missing element makes the
 * corresponding `$eval` reject and the error propagates.
 *
 * @param {object} listItem - Element handle of one `<li>` of the review list.
 * @returns {Promise<object>} The record that is pushed to the dataset.
 */
async function readRenderedReview(listItem) {
  const trimmedText = (element) => element.textContent.trim();

  const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
  const userName = await listItem.$eval('.bui-avatar-block__title', trimmedText);
  const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', trimmedText);

  // The room link is optional; its absence is logged and yields ''.
  let roomInfo = '';
  try {
    roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', trimmedText);
  } catch (error) {
    console.error('Error finding roomInfo:', error.message);
  }

  const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', trimmedText);
  const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', trimmedText);
  const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', trimmedText);
  const dateMatch = fullReviewDate.match(/Reviewed:\s+(.+)/);
  const reviewDate = dateMatch ? dateMatch[1] : null;

  const reviewTitle = await listItem.$eval('.c-review-block__title', trimmedText);
  const rating = await listItem.$eval('.bui-review-score__badge', trimmedText);
  const reviewTextParts = await readRenderedReviewComments(listItem);

  return {
    id,
    userName,
    userLocation,
    roomInfo,
    stayDate,
    stayLength,
    reviewDate,
    reviewTitle,
    rating,
    reviewTextParts,
  };
}

/**
 * Collects the "Liked" and "Disliked" texts of one review list item.
 *
 * Rows without a label are skipped. Any other failure while reading a row is
 * logged and the remaining rows are still processed.
 *
 * @param {object} listItem - Element handle of one `<li>` of the review list.
 * @returns {Promise<{Liked: string, Disliked: string}>} '' for a missing part.
 */
async function readRenderedReviewComments(listItem) {
  const trimmedText = (element) => element.textContent.trim();
  const comments = { Liked: '', Disliked: '' };

  const reviewRows = await listItem.$$('.c-review__row');
  for (const row of reviewRows) {
    try {
      const prefix = await row.$('.c-review__prefix');
      const labelNode = prefix === null ? null : await prefix.$('span.bui-u-sr-only');
      if (labelNode === null) {
        continue;
      }

      const label = await labelNode.evaluate(trimmedText);
      if (label === 'Liked' || label === 'Disliked') {
        comments[label] = await row.$eval('.c-review__body', trimmedText);
      }
    } catch (error) {
      console.error('Error reading review row:', error.message);
    }
  }

  return comments;
}
// Awaited so the enclosing async function does not settle while the scrape is
// still in flight and so a rejection from `run` propagates through it instead
// of being left on a detached promise.
await run();
176
177
/**
 * Builds the URL of one page of the review list.
 *
 * @param {string} bookingUrl - Review list URL the offset is appended to.
 * @param {number} offset - Value for the `offset` query parameter.
 * @returns {string} `bookingUrl` followed by `&&offset=<offset>`.
 */
function createNewLink(bookingUrl, offset) {
  // NOTE(review): the doubled '&' adds an empty query parameter. It is kept
  // because this is the exact URL shape requested so far - confirm before
  // simplifying it.
  return `${bookingUrl}&&offset=${offset}`;
}
185
/**
 * Scrapes one page of the review list and pushes every review to the dataset.
 *
 * A separate browser is launched for the page and is always closed again,
 * including when the page cannot be created.
 *
 * @param {string} link - URL of the review list page to load.
 * @returns {Promise<number|null>} Number of review list items found on the
 *   page, or `null` when loading or extracting failed (the error is logged).
 */
async function scrapeReviews(link) {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    // Page setup failures still reject, but no longer leak the browser.
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
    console.log('Processing...');

    try {
      await page.goto(link, { waitUntil: 'domcontentloaded' });
      await page.waitForSelector('.review_list_new_item_block', { visible: true });

      const listItems = await page.$x('//ul[@class="review_list"]/li');
      for (const listItem of listItems) {
        await Actor.pushData(await extractReviewItem(listItem));
      }

      return listItems.length;
    } catch (error) {
      console.error('Error during page navigation:', error);
      return null;
    }
  } finally {
    await browser.close();
  }
}

/**
 * Extracts one review record from a review list item.
 *
 * Every field except the room info is required: a missing element makes the
 * corresponding `$eval` reject and the error propagates.
 *
 * @param {object} listItem - Element handle of one `<li>` of the review list.
 * @returns {Promise<object>} The record that is pushed to the dataset.
 */
async function extractReviewItem(listItem) {
  const textOf = (element) => element.textContent.trim();

  const id = await listItem.evaluate((element) => element.getAttribute('data-review-url'));
  const userName = await listItem.$eval('.bui-avatar-block__title', textOf);
  const userLocation = await listItem.$eval('.bui-avatar-block__subtitle', textOf);

  // The room link is optional; its absence is logged and yields a single
  // space, the placeholder this function has always emitted.
  let roomInfo = '';
  try {
    roomInfo = await listItem.$eval('.c-review-block__room-link .bui-list__body', textOf);
  } catch (error) {
    console.error('Error finding roomInfo:', error.message);
    roomInfo = ' ';
  }

  const stayDate = await listItem.$eval('.c-review-block__stay-date .bui-list__body .c-review-block__date', textOf);
  const stayLength = await listItem.$eval('.c-review-block__stay-date .bui-list__body', textOf);
  const fullReviewDate = await listItem.$eval('.c-review-block__right .c-review-block__date', textOf);
  const dateMatch = fullReviewDate.match(/Reviewed:\s+(.+)/);
  const reviewDate = dateMatch ? dateMatch[1] : null;

  const reviewTitle = await listItem.$eval('.c-review-block__title', textOf);
  const rating = await listItem.$eval('.bui-review-score__badge', textOf);
  const reviewTextParts = await extractReviewComments(listItem);

  return {
    id,
    userName,
    userLocation,
    roomInfo,
    stayDate,
    stayLength,
    reviewDate,
    reviewTitle,
    rating,
    reviewTextParts,
  };
}

/**
 * Collects the "Liked" and "Disliked" texts of one review list item.
 *
 * Rows without a label are skipped. Any other failure while reading a row is
 * logged and the remaining rows are still processed.
 *
 * @param {object} listItem - Element handle of one `<li>` of the review list.
 * @returns {Promise<{Liked: string, Disliked: string}>} A single space for a
 *   missing part.
 */
async function extractReviewComments(listItem) {
  const textOf = (element) => element.textContent.trim();
  const comments = { Liked: ' ', Disliked: ' ' };

  const reviewRows = await listItem.$$('.c-review__row');
  for (const row of reviewRows) {
    try {
      const prefix = await row.$('.c-review__prefix');
      const labelNode = prefix === null ? null : await prefix.$('span.bui-u-sr-only');
      if (labelNode === null) {
        continue;
      }

      const label = await labelNode.evaluate(textOf);
      if (label === 'Liked' || label === 'Disliked') {
        comments[label] = await row.$eval('.c-review__body', textOf);
      }
    } catch (error) {
      console.error('Error reading review row:', error.message);
    }
  }

  return comments;
}
278
279})();