1
2import axios from "axios";
3
4import * as cheerio from "cheerio";
5
6import { Actor } from "apify";
7
8
// Initialise the Apify Actor SDK up front; the input, dataset and exit calls
// made later in this file go through the same `Actor` object.
await Actor.init();
10
/** Shape of the Actor input read via `Actor.getInput<Input>()`. */
interface Input {
  /** URL of the first review page; handed to `getAllReviews` as the starting point. */
  url: string;
}
14
/** One review as scraped from a listing page by `extractReviewsFromPage`. */
interface Review {
  /** Author text of the review; `'Anonymous'` when the page shows none. */
  reviewerName: string;
  /** Review body with double quotes stripped and surrounding whitespace trimmed. */
  reviewText: string;
  /** Number of `.stars__icon--100` elements found inside the review's rating widget. */
  rating: number;
  /** Raw text of the review's date/source element; omitted when that text is empty. */
  postedDate?: string;
  /** Page counter (starting at 1) of the listing page this review was found on. */
  pageNumber: number;
}
22
/**
 * Pagination descriptor: where the next page lives and which page this is.
 * NOTE(review): not referenced by the code visible in this part of the file.
 */
interface PageInfo {
  /** URL of the following page, or `null` when there is none. */
  nextPageUrl: string | null;
  /** Number of the page this descriptor belongs to. */
  pageNumber: number;
}
27
/**
 * Extracts the path segment that follows `store/` in a URL.
 *
 * The segment ends at the next `/`, `?` or `#`, so a query string or fragment
 * appended to the URL is never returned as part of the id.
 *
 * @param url - URL (or URL-like string) to inspect.
 * @returns The store id, or an empty string when `url` has no `store/<id>` part.
 */
function getStoreId(url: string): string {
  const match = /store\/([^\/?#]+)/.exec(url);
  return match?.[1] ?? '';
}
32
/**
 * Turns the `href` of a pagination link into an absolute URL.
 *
 * @param href - Raw `href` attribute value; may be relative.
 * @param currentPageUrl - Absolute URL of the page the link was found on.
 * @returns The absolute http(s) URL with any fragment removed, or `null` when
 *   the link is unusable (unparsable, non-http scheme, or pointing back at the
 *   current page).
 */
function resolveNextPageUrl(href: string, currentPageUrl: string): string | null {
  try {
    const target = new URL(href, currentPageUrl);
    if (target.protocol !== 'http:' && target.protocol !== 'https:') return null;

    // Fragments are never sent to the server; dropping them keeps "#"-style
    // links from looking like a new page.
    target.hash = '';
    const current = new URL(currentPageUrl);
    current.hash = '';

    return target.href === current.href ? null : target.href;
  } catch {
    return null;
  }
}

/**
 * Downloads one review listing page and parses the reviews it contains.
 *
 * Reviews whose body text is empty are skipped. The next page is taken from
 * the link inside the `<li>` that follows `.pagination li.active`.
 *
 * @param pageUrl - Absolute URL of the listing page to fetch.
 * @param pageNumber - Page counter recorded on every review and used in logs.
 * @returns The parsed reviews and the absolute URL of the next page, or `null`
 *   as `nextPage` when no usable next-page link exists.
 * @throws Rethrows any error raised while fetching or parsing, after logging it.
 */
async function extractReviewsFromPage(
  pageUrl: string,
  pageNumber: number,
): Promise<{ reviews: Review[]; nextPage: string | null }> {
  const requestTimeoutMs = 30000;

  console.log(`Fetching reviews from: ${pageUrl}`);

  try {
    const response = await axios.get(pageUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        Accept: 'text/html',
      },
      // Without a timeout a stalled connection would block the run indefinitely.
      timeout: requestTimeoutMs,
    });

    const $ = cheerio.load(response.data);
    const reviews: Review[] = [];

    for (const element of $('.Review').toArray()) {
      const $review = $(element);

      const reviewerName = $review.find('.Review__author').text().trim() || 'Anonymous';
      // Strip straight and curly double quotes that wrap or appear in the body text.
      const reviewText = $review
        .find('.Review__body')
        .text()
        .replace(/["\u201C\u201D]/g, '')
        .trim();
      // Each fully filled star icon counts as one rating point.
      const rating = $review.find('.Rating__stars .stars__icon--100').length;
      const postedDate = $review.find('.Review__dateSource').text().trim();

      if (!reviewText) continue;

      reviews.push({
        reviewerName,
        reviewText,
        rating,
        postedDate: postedDate || undefined,
        pageNumber,
      });
    }

    // The href may be relative; resolve it so the caller can request it directly.
    const nextHref = $('.pagination li.active').next('li').find('a').attr('href');
    const nextPageUrl = nextHref ? resolveNextPageUrl(nextHref, pageUrl) : null;

    console.log(`Page ${pageNumber}: Found ${reviews.length} reviews. Next page URL: ${nextPageUrl}`);
    return { reviews, nextPage: nextPageUrl };
  } catch (error: unknown) {
    if (axios.isAxiosError(error)) {
      console.error(`Error fetching page ${pageNumber}:`, error.message);
      if (error.response) {
        console.error(`Status: ${error.response.status}`);
      }
    } else {
      console.error(`Error processing page ${pageNumber}:`, error);
    }
    throw error;
  }
}
90
/**
 * Reads the first integer out of a review-count label.
 *
 * Commas and non-breaking spaces are accepted as thousands separators, so
 * "1,234" yields 1234 rather than 1.
 *
 * @param label - Text taken from the review-count element.
 * @returns The parsed count, or `NaN` when the label contains no digits.
 */
function parseReviewCount(label: string): number {
  const match = /\d{1,3}(?:[,\u00A0\u202F]\d{3})+|\d+/.exec(label);
  if (!match) return Number.NaN;
  return Number.parseInt(match[0].replace(/\D/g, ''), 10);
}

/**
 * Fetches a listing page and derives the number of pages from the total review
 * count shown in `.js-reviewsio-review-count strong`, at 20 reviews per page.
 *
 * @param url - URL of the listing page that displays the total review count.
 * @returns The page count, or `NaN` when the count cannot be read from the page.
 * @throws Rethrows any error raised while fetching or parsing, after logging it.
 */
async function getTotalPages(url: string): Promise<number> {
  const reviewsPerPage = 20;
  const requestTimeoutMs = 30000;

  try {
    // Send the same browser-like headers as the per-page requests so both
    // requests are treated alike by the server.
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        Accept: 'text/html',
      },
      timeout: requestTimeoutMs,
    });

    const $ = cheerio.load(response.data);
    const totalReviewsText = $('.js-reviewsio-review-count strong').first().text().trim();
    const totalReviews = parseReviewCount(totalReviewsText);

    if (Number.isNaN(totalReviews)) {
      console.warn(`Could not read the total review count from "${totalReviewsText}"`);
    }

    const totalPages = Math.ceil(totalReviews / reviewsPerPage);
    console.log(`Total reviews: ${totalReviews}, Total pages: ${totalPages}`);
    return totalPages;
  } catch (error: unknown) {
    console.error('Error getting total pages:', error);
    throw error;
  }
}
106
/**
 * Collects reviews from the listing at `url` and every page linked after it.
 *
 * Pagination stops at the first of: the page count reported by
 * `getTotalPages` is reached, a page has no next link, a URL repeats, a page
 * yields no reviews, or a page fails to load. A failing page ends pagination
 * but the reviews gathered so far are still returned.
 *
 * @param url - URL of the first listing page.
 * @returns All reviews gathered, in page order.
 * @throws Rethrows errors from `getTotalPages`, after logging them.
 */
async function getAllReviews(url: string): Promise<Review[]> {
  const delayBetweenPagesMs = 2000;
  const allReviews: Review[] = [];
  const seenUrls = new Set<string>();
  let currentPage = 1;
  let currentUrl: string | null = url;

  try {
    const totalPages = await getTotalPages(url);
    console.log(`Detected ${totalPages} total pages`);

    // A NaN page count would make the loop condition below false straight away
    // and silently return nothing; treat it as "unknown" and rely on the
    // next-page links instead.
    let pageLimit = totalPages;
    if (Number.isNaN(totalPages)) {
      console.warn('Total page count is unknown; following next-page links until they run out');
      pageLimit = Number.POSITIVE_INFINITY;
    }

    while (currentUrl && currentPage <= pageLimit) {
      // Guard against pagination links that loop back to an earlier page.
      if (seenUrls.has(currentUrl)) {
        console.log(`Already visited ${currentUrl}, stopping pagination`);
        break;
      }
      seenUrls.add(currentUrl);

      try {
        const { reviews, nextPage } = await extractReviewsFromPage(currentUrl, currentPage);

        if (reviews.length === 0) {
          console.log(`No more reviews found after page ${currentPage - 1}`);
          break;
        }

        allReviews.push(...reviews);
        console.log(`Total reviews collected so far: ${allReviews.length}`);

        currentUrl = nextPage || null;
        currentPage++;

        // Pause between requests, but only when another page will be fetched.
        if (currentUrl && currentPage <= pageLimit) {
          await new Promise<void>((resolve) => setTimeout(resolve, delayBetweenPagesMs));
        }
      } catch (error: unknown) {
        // Best effort: keep what was collected so far instead of failing the run.
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Failed to fetch page ${currentPage}, stopping pagination: ${reason}`);
        break;
      }
    }
  } catch (error: unknown) {
    console.error('Error fetching reviews:', error);
    throw error;
  }

  return allReviews;
}
154
155
// Entry point: read the Actor input, scrape every review page, push the
// reviews to the dataset via Actor.pushData, then end the run.
try {
  const input = await Actor.getInput<Input>();
  if (!input) throw new Error('Input is missing!');

  const { url } = input;
  // The input is external JSON, so the declared type is not guaranteed at runtime.
  if (typeof url !== 'string' || url.trim() === '') {
    throw new Error('Input field "url" is missing or empty!');
  }

  console.log('Starting review extraction...');
  const allReviews = await getAllReviews(url);

  // Tally how many reviews came from each page, for the log only.
  const reviewsByPage: Record<number, number> = {};
  for (const { pageNumber } of allReviews) {
    reviewsByPage[pageNumber] = (reviewsByPage[pageNumber] ?? 0) + 1;
  }
  console.log('Reviews per page:', reviewsByPage);

  await Actor.pushData(allReviews);

  console.log(`Successfully extracted ${allReviews.length} total reviews`);
  await Actor.exit();
} catch (error: unknown) {
  console.error('Error during scraping:', error);
  // Actor.exit() reports success by default, so calling it after an error
  // would hide the failure; Actor.fail() ends the run with a failing exit code.
  const reason = error instanceof Error ? error.message : String(error);
  await Actor.fail(`Scraping failed: ${reason}`);
}