1import { Actor } from 'apify';
2import { connect as connectRealBrowser } from 'puppeteer-real-browser';
3import * as cheerio from 'cheerio';
4
5
6
7
// Central scraper settings: tuning knobs, target URL, selectors, and the
// scroll strategy used to trigger the site's lazy loading.
const SCRAPER_CONFIG = {
    // Stop after this many consecutive scroll rounds produce no new items.
    maxNoNewDataCount: 3,

    // Run with a visible browser window (helps against bot detection).
    headless: false,

    // Cloudflare Turnstile auto-solving (puppeteer-real-browser option).
    turnstile: false,

    // When true, request interception blocks images/fonts/css/etc.
    filterResources: false,

    // Default cap on collected records when the actor input omits one.
    maxItems: 50,

    // Default search page used when the actor is run without input.
    requestUrl: 'https://www.sothebys.com/en/search?query=chanel',

    // Record field used to deduplicate lots across scroll rounds.
    uniqueIdField: 'lotUrl',

    // One result card in the search grid; also the readiness selector.
    waitSelector: 'div[data-testid="results-search-item"]',

    // Scroll to the bottom of the page in small steps, pausing between
    // steps so lazily loaded content has a chance to render.
    autoScrollPage: async (page) => {
        await page.evaluate(async () => {
            const step = 500;
            const pauseMs = 100;
            const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
            const scroller = document.scrollingElement;
            while (scroller.scrollTop + window.innerHeight < scroller.scrollHeight) {
                scroller.scrollBy(0, step);
                await sleep(pauseMs);
            }
        });
    }
};
37
38
39
40
/**
 * Infinite-scroll scraper for a Sotheby's search results page.
 *
 * Flow: `run()` launches an anti-detection browser (puppeteer-real-browser),
 * optionally behind an Apify proxy, then `handlePage()` repeatedly
 * auto-scrolls and re-parses the full DOM with cheerio until either
 * `maxItems` unique lots were collected or several consecutive scroll rounds
 * yielded nothing new. Records are deduplicated by
 * `SCRAPER_CONFIG.uniqueIdField` and pushed to the Apify dataset
 * incrementally as they are found.
 */
class BaseScraper {

    /**
     * Extracts lot records from a cheerio-loaded results page.
     *
     * All fields are best-effort: a missing element yields '' rather than
     * throwing, so partial cards still produce a record.
     *
     * @param {cheerio.CheerioAPI} $ - cheerio instance loaded with page HTML.
     * @returns {Array<object>} one plain object per visible result card.
     */
    parsePageItems($) {
        const items = $(SCRAPER_CONFIG.waitSelector).map((i, el) => {
            const $item = $(el);

            // Title: prefer the dedicated title element inside the link,
            // otherwise fall back to the first text line of the anchor.
            const $titleLink = $item.find('a[class*="lotTitleLink"]').first();
            const lotTitle = $titleLink.find('[class*="title"]').text().trim() ||
                $titleLink.text().split('\n')[0].trim();
            const lotUrl = $titleLink.attr('href') || '';
            // Resolve against the site origin. Unlike plain string
            // concatenation, URL resolution also handles hrefs that are
            // already absolute (avoids 'https://...https://...' output).
            const fullLotUrl = lotUrl ? new URL(lotUrl, 'https://www.sothebys.com').href : '';

            // Medium-size card image hosted under the 'sothebys-md' CDN path.
            const $img = $item.find('img[src*="sothebys-md"]').first();
            const imageUrl = $img.attr('src') || '';

            // Cap the description to keep dataset records compact.
            const $description = $item.find('[class*="description"]').first();
            const description = $description.text().trim().substring(0, 500);

            // Price estimate, e.g. a currency range shown on the card.
            const $estimate = $item.find('[class*="estimateContainer"]').first();
            const estimateText = $estimate.text().trim();

            // Status label (e.g. sale/category badge text on the card).
            const $label = $item.find('[class*="labelContainer"]');
            const labelText = $label.text().trim();

            // Auction countdown, present only for live/timed lots.
            const $countdown = $item.find('[data-testid="lot-card-countdown"]');
            const countdownText = $countdown.text().trim();

            // Current bid information, present only on biddable lots.
            const $bidInfo = $item.find('[class*="bidInfoWrapper"]');
            const bidInfoText = $bidInfo.text().trim();

            return {
                lotTitle: lotTitle,
                lotUrl: fullLotUrl,
                imageUrl: imageUrl,
                description: description,
                estimate: estimateText,
                label: labelText,
                countdown: countdownText,
                bidInfo: bidInfoText
            };
        }).get();

        return items;
    }

    /**
     * Entry point: launches the browser, scrapes the search page, and always
     * closes the browser afterwards.
     *
     * @param {{searchUrl: string, maxItems?: number, proxyConfiguration?: object}} input
     *   Actor input; `maxItems` defaults to unlimited.
     */
    async run(input) {
        const { searchUrl, maxItems = Infinity, proxyConfiguration } = input;
        this.maxItems = maxItems;
        this.searchUrl = searchUrl;

        // Resolve a concrete proxy URL (if configured) via the Apify platform.
        const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
        const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
        const proxyOptions = this.buildProxyOptions(proxyUrl);

        const realBrowserOption = {
            args: ["--start-maximized"],
            turnstile: SCRAPER_CONFIG.turnstile,
            headless: SCRAPER_CONFIG.headless,
            customConfig: {},
            connectOption: {
                // null viewport = use the real window size.
                defaultViewport: null
            },
            // Only attach a proxy entry when we actually have one.
            ...(proxyOptions ? { proxy: proxyOptions } : {}),
            plugins: []
        };

        const { page, browser } = await connectRealBrowser(realBrowserOption);

        try {
            // Generous timeouts: 5 min for navigation, 1 h for everything else,
            // since lazy-loading pages behind anti-bot checks can be slow.
            page.setDefaultNavigationTimeout(300 * 1000);
            page.setDefaultTimeout(3600 * 1000);

            await this.handlePage(page, this.searchUrl);
        } finally {
            // Always release the browser, even if scraping threw.
            await browser?.close();
        }
    }

    /**
     * Scroll-and-harvest loop for one search results page.
     *
     * Stops when `maxItems` records were collected, or when
     * `SCRAPER_CONFIG.maxNoNewDataCount` consecutive scroll rounds produced
     * no new unique items (treated as the end of the results).
     *
     * @param {object} page - puppeteer Page.
     * @param {string} url - search results URL to open.
     */
    async handlePage(page, url) {
        console.log(`Processing ${url}...`);
        let count = 0;
        let endOfResults = false;
        let interceptedData = [];
        let lastDataLength = 0;
        let noNewDataCount = 0;

        await page.goto(url);

        // Wait for the first result card; timing out is non-fatal because a
        // query can legitimately return zero results.
        if (SCRAPER_CONFIG.waitSelector) {
            await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 60000 }).catch(() => {
                console.log(`data not found...`);
            });
        }

        // Harvest whatever rendered before any scrolling.
        const firstPageHtml = await page.content();
        await this.processResponseData(firstPageHtml, interceptedData);

        while (!endOfResults && interceptedData.length < this.maxItems) {
            count++;
            console.log(`data fetching...${count}`);
            await this.randomDelay();

            // Trigger lazy loading, then give the site time to append items.
            await SCRAPER_CONFIG.autoScrollPage(page);
            await this.randomDelay(3000, 5000);

            if (SCRAPER_CONFIG.waitSelector) {
                await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 10000 }).catch(() => {
                    console.log(`data not found...`);
                });
            }

            // Re-parse the whole DOM; dedup in processResponseData keeps
            // only items not seen in earlier rounds.
            const afterScrollHtml = await page.content();
            await this.processResponseData(afterScrollHtml, interceptedData);

            if (interceptedData.length === lastDataLength) {
                noNewDataCount++;
                if (noNewDataCount >= SCRAPER_CONFIG.maxNoNewDataCount) {
                    endOfResults = true;
                }
            } else {
                noNewDataCount = 0;
                lastDataLength = interceptedData.length;
            }
        }

        console.log(`Scraping completed. Total items: ${interceptedData.length}`);
    }

    /**
     * Installs request/response interceptors on the page.
     *
     * When `SCRAPER_CONFIG.filterResources` is true, heavy resource types
     * (images, media, fonts, stylesheets, ...) are aborted to save bandwidth.
     * The response listener is an intentional no-op hook for subclasses.
     *
     * NOTE(review): this method is not currently wired into run(); call it
     * before page.goto() to enable resource blocking.
     *
     * @param {object} page - puppeteer Page.
     * @param {Array<object>} interceptedData - shared accumulator (unused here,
     *   kept for hook symmetry).
     */
    async setupInterceptors(page, interceptedData) {
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            const resourceType = request.resourceType();
            const blockedResourceTypes = [
                'image',
                'media',
                'font',
                'texttrack',
                'stylesheet',
                'ping'
            ];

            if (SCRAPER_CONFIG.filterResources && blockedResourceTypes.includes(resourceType)) {
                request.abort();
            } else {
                request.continue();
            }
        });
        page.on('response', async (response) => {
            // Intentionally empty: override point for response-based capture.
        });
    }

    /**
     * Parses one HTML snapshot, deduplicates against already-collected items,
     * and pushes any new records to the Apify dataset.
     *
     * Mutates `interceptedData` in place (shared accumulator across rounds).
     * Stops adding once `this.maxItems` is reached.
     *
     * @param {string} htmlContent - full page HTML from page.content().
     * @param {Array<object>} interceptedData - accumulator of unique records.
     */
    async processResponseData(htmlContent, interceptedData) {
        let currentData = [];

        if (!htmlContent) {
            return;
        }

        const $ = cheerio.load(htmlContent);

        const items = this.parsePageItems($);

        if (!items || items.length === 0) {
            return;
        }

        // Dedup key set built from everything collected so far.
        const uniqueField = SCRAPER_CONFIG.uniqueIdField;
        const existingIds = new Set(interceptedData.map(item => item[uniqueField]));

        let skippedCount = 0;
        for (const listing of items) {
            if (existingIds.has(listing[uniqueField])) {
                skippedCount++;
                continue;
            }

            const data = {
                searchUrl: this.searchUrl,
                ...listing,
                scrapedAt: new Date().toISOString()
            };

            interceptedData.push(data);
            currentData.push(data);
            existingIds.add(listing[uniqueField]);

            if (interceptedData.length >= this.maxItems) {
                break;
            }
        }

        // Push only this round's new records; earlier ones are already saved.
        if (currentData.length > 0) {
            console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
            await Actor.pushData(currentData);
        }
    }

    /**
     * Sleeps a random duration between `min` and `max` milliseconds
     * (inclusive) to mimic human pacing.
     *
     * @param {number} [min=1000] - lower bound in ms.
     * @param {number} [max=3000] - upper bound in ms.
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise(resolve => setTimeout(resolve, delay));
    }

    /**
     * Placeholder for injecting session cookies before navigation.
     * Intentionally a no-op; override or fill in when a login/session
     * is required.
     *
     * @param {object} page - puppeteer Page.
     */
    async setCookies(page) {
    }

    /**
     * Converts a proxy URL string into the option object expected by
     * puppeteer-real-browser ({ host, port, username, password }).
     *
     * @param {string|undefined} proxyUrl - e.g. 'http://user:pass@host:port'.
     * @returns {object|undefined} proxy options, or undefined when the URL is
     *   missing, unparsable, or lacks a host/port.
     */
    buildProxyOptions(proxyUrl) {
        if (!proxyUrl) {
            return undefined;
        }

        try {
            const parsed = new URL(proxyUrl);
            if (!parsed.hostname || !parsed.port) {
                return undefined;
            }

            // The URL API exposes credentials percent-encoded; decode them so
            // special characters (e.g. '@' or ',' in proxy passwords) reach
            // the proxy server intact.
            return {
                host: parsed.hostname,
                port: Number(parsed.port),
                username: parsed.username ? decodeURIComponent(parsed.username) : undefined,
                password: parsed.password ? decodeURIComponent(parsed.password) : undefined,
            };
        } catch (error) {
            console.warn(`Invalid proxy URL detected: ${proxyUrl}`, error);
            return undefined;
        }
    }

}
327
328
329
330
331
// Actor.main() wraps the handler with Actor.init()/Actor.exit() itself, so a
// separate `await Actor.init()` beforehand is redundant (double init) and has
// been removed.
Actor.main(async () => {
    // Fall back to the baked-in demo search when run without input.
    let input = await Actor.getInput();
    if (!input) {
        input = {
            "searchUrl": SCRAPER_CONFIG.requestUrl,
            "maxItems": SCRAPER_CONFIG.maxItems
        };
    }
    const crawler = new BaseScraper();
    await crawler.run(input);
});