1import { Actor } from 'apify';
2import { connect as connectRealBrowser } from 'puppeteer-real-browser';
3import * as cheerio from 'cheerio';
4
5
6
7
/**
 * Static settings for the Novel Updates series-finder scraper.
 *
 * The values are read by the scraper class and by the Actor entry point in
 * this file; `autoScrollPage` is the pagination hook invoked once per loop
 * iteration while results are being collected.
 */
const SCRAPER_CONFIG = {
  // Number of consecutive pagination passes that add no new items before
  // the scraper decides it has reached the end of the results.
  maxNoNewDataCount: 3,

  // Forwarded to puppeteer-real-browser as its `headless` option.
  headless: false,

  // Forwarded to puppeteer-real-browser as its `turnstile` option.
  turnstile: true,

  // When true, the request interceptor aborts image/media/font/stylesheet/etc. requests.
  filterResources: false,

  // Item cap used when the Actor is started without any input.
  maxItems: 50,

  // Search URL used when the Actor is started without any input.
  requestUrl: 'https://www.novelupdates.com/series-finder/?sf=1&sh=live&sort=sdate&order=desc',

  // Property of a parsed item used to detect duplicates across pages.
  uniqueIdField: 'seriesId',

  // Selector of one result card; also used to detect that results have rendered.
  waitSelector: '.search_main_box_nu',

  /**
   * Moves the browser to the next page of results, if there is one.
   *
   * Does nothing when no `a.next_page` link is present. Otherwise the link is
   * clicked and the function waits for the resulting navigation and for the
   * result cards to appear. Both waits are best-effort: a timeout is ignored
   * so the caller can decide how to proceed.
   *
   * @param {object} page - Puppeteer page showing a results listing.
   * @returns {Promise<void>}
   */
  autoScrollPage: async (page) => {
    const nextButton = await page.$('a.next_page');
    if (!nextButton) {
      return;
    }

    // The navigation wait must be registered BEFORE the click. The result
    // selector already matches on the current page, so waiting for it alone
    // would resolve immediately and the caller could read the old page.
    await Promise.all([
      page
        .waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 30000 })
        .catch(() => {}),
      nextButton.click(),
    ]);

    await page
      .waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 10000 })
      .catch(() => {});
  },
};
38
39
40
41
/**
 * Scrapes a paginated series listing with a real browser session, parses each
 * page with cheerio and pushes every newly seen series to the Actor dataset.
 */
class BaseScraper {
  /**
   * Parses every result card found in the loaded document.
   *
   * @param {Function} $ - Cheerio root returned by `cheerio.load(html)`.
   * @returns {Array<object>} One plain object per card matched by
   *   `SCRAPER_CONFIG.waitSelector`, in document order.
   */
  parsePageItems($) {
    return $(SCRAPER_CONFIG.waitSelector)
      .map((index, element) => this.parseItem($, $(element)))
      .get();
  }

  /**
   * Builds the output record for a single result card.
   *
   * @param {Function} $ - Cheerio root for the document.
   * @param {object} $item - Cheerio selection wrapping one result card.
   * @returns {object} Parsed series record. Missing text fields are `''`,
   *   missing counters are `0`.
   */
  parseItem($, $item) {
    const imageSrc = $item.find('.search_img_nu img').attr('src') || '';
    // The series id is only available from cover files named `series_<digits>.jpg`;
    // any other image name leaves the id empty.
    const seriesIdMatch = imageSrc.match(/series_(\d+)\.jpg/);

    const titleLink = $item.find('.search_title a');

    // Keep the first run of digits/dots found in the rating text.
    const ratingMatch = $item.find('.search_ratings').text().trim().match(/[\d.]+/);

    const stats = this.parseStats($, $item);

    return {
      seriesId: seriesIdMatch ? seriesIdMatch[1] : '',
      title: titleLink.text().trim(),
      seriesUrl: titleLink.attr('href') || '',
      imageUrl: imageSrc,
      rating: ratingMatch ? ratingMatch[0] : '',
      language: $item.find('.orgjp, .orgcn, .orgkr').text().trim(),
      chapterCount: Number.parseInt(stats.chapterCount, 10) || 0,
      updateFrequency: stats.updateFrequency || '',
      readerCount: Number.parseInt(stats.readerCount, 10) || 0,
      reviewCount: Number.parseInt(stats.reviewCount, 10) || 0,
      lastUpdated: stats.lastUpdated || '',
      genres: this.parseGenres($, $item),
      description: this.parseDescription($item),
    };
  }

  /**
   * Reads the statistics row of a card. Each entry is identified by the
   * `title` attribute of its `<i>` icon.
   *
   * @param {Function} $ - Cheerio root for the document.
   * @param {object} $item - Cheerio selection wrapping one result card.
   * @returns {object} Raw string values keyed by `chapterCount`,
   *   `updateFrequency`, `readerCount`, `reviewCount` and `lastUpdated`;
   *   keys are absent when the matching entry is not present.
   */
  parseStats($, $item) {
    const stats = {};

    $item.find('.search_stats .ss_desk').each((index, statElement) => {
      const $stat = $(statElement);
      const label = $stat.find('i').attr('title') || '';
      // Text of the entry itself, with all child elements stripped from a clone.
      const text = $stat.clone().children().remove().end().text().trim();

      if (label.includes('Chapter')) {
        stats.chapterCount = text.replace(/[^0-9]/g, '');
      } else if (label.includes('Frequency')) {
        stats.updateFrequency = text;
      } else if (label.includes('Readers')) {
        stats.readerCount = text.replace(/[^0-9]/g, '');
      } else if (label.includes('Reviews')) {
        stats.reviewCount = text.replace(/[^0-9]/g, '');
      } else if (label.includes('Last Updated')) {
        stats.lastUpdated = text;
      }
    });

    return stats;
  }

  /**
   * Collects the genre links of a card.
   *
   * @param {Function} $ - Cheerio root for the document.
   * @param {object} $item - Cheerio selection wrapping one result card.
   * @returns {Array<{name: string, url: string, gid: string}>}
   */
  parseGenres($, $item) {
    return $item
      .find('.search_genre a')
      .map((index, genreElement) => {
        const $genre = $(genreElement);
        return {
          name: $genre.text().trim(),
          url: $genre.attr('href') || '',
          gid: $genre.attr('gid') || '',
        };
      })
      .get();
  }

  /**
   * Extracts the description text of a card.
   *
   * First tries the text nodes that are direct children of `.search_body_nu`.
   * When that yields nothing, falls back to the full text of the body, keeping
   * only lines longer than 20 characters that do not look like statistics.
   *
   * @param {object} $item - Cheerio selection wrapping one result card.
   * @returns {string} Description text, or `''` when none is found.
   */
  parseDescription($item) {
    const $body = $item.find('.search_body_nu');

    // NOTE(review): cheerio text nodes are believed to expose `data`/`nodeValue`
    // rather than `textContent`. If `textContent` is undefined this filter never
    // matches and the fallback below always runs - confirm before changing,
    // because switching the property would change the emitted descriptions.
    const directText = $body
      .contents()
      .filter((index, node) => node.nodeType === 3 && node.textContent?.trim()?.length > 0)
      .text()
      .trim();
    if (directText) {
      return directText;
    }

    const statMarkers = [' Chapters', ' Day', ' Readers', ' Reviews', ' Updated'];
    return $body
      .text()
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length > 20 && !statMarkers.some((marker) => line.includes(marker)))
      .join(' ')
      .trim();
  }

  /**
   * Starts a browser, scrapes the configured search URL and always closes the
   * browser afterwards.
   *
   * @param {object} input - Actor input.
   * @param {string} [input.searchUrl] - Listing URL to scrape; defaults to
   *   `SCRAPER_CONFIG.requestUrl` when missing or empty.
   * @param {number} [input.maxItems=Infinity] - Maximum number of items to collect.
   * @param {object} [input.proxyConfiguration] - Passed to
   *   `Actor.createProxyConfiguration`.
   * @returns {Promise<void>}
   */
  async run(input) {
    const { searchUrl, maxItems = Infinity, proxyConfiguration } = input ?? {};
    this.maxItems = maxItems;
    // Without this fallback an input lacking `searchUrl` would reach
    // `page.goto(undefined)` and fail.
    this.searchUrl = searchUrl || SCRAPER_CONFIG.requestUrl;

    const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const proxyOptions = this.buildProxyOptions(proxyUrl);

    const realBrowserOption = {
      args: ['--start-maximized'],
      turnstile: SCRAPER_CONFIG.turnstile,
      headless: SCRAPER_CONFIG.headless,
      customConfig: {},
      connectOption: {
        defaultViewport: null,
      },
      // Only add the `proxy` key when a usable proxy was resolved.
      ...(proxyOptions ? { proxy: proxyOptions } : {}),
      plugins: [],
    };

    const { page, browser } = await connectRealBrowser(realBrowserOption);

    try {
      // 5 minutes for navigations, 1 hour for every other page wait.
      page.setDefaultNavigationTimeout(300 * 1000);
      page.setDefaultTimeout(3600 * 1000);

      await this.handlePage(page, this.searchUrl);
    } finally {
      await browser?.close();
    }
  }

  /**
   * Loads the listing and keeps paginating until `this.maxItems` items were
   * collected or `SCRAPER_CONFIG.maxNoNewDataCount` consecutive passes added
   * nothing new.
   *
   * @param {object} page - Puppeteer page to drive.
   * @param {string} url - Listing URL to open first.
   * @returns {Promise<void>}
   */
  async handlePage(page, url) {
    console.log(`Processing ${url}...`);
    const interceptedData = [];
    let count = 0;
    let endOfResults = false;
    let lastDataLength = 0;
    let noNewDataCount = 0;

    await page.goto(url);

    if (SCRAPER_CONFIG.waitSelector) {
      // Best effort: a missing selector is logged and the page is parsed anyway.
      await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 60000 }).catch(() => {
        console.log(`data not found...`);
      });
    }

    const firstPageHtml = await page.content();
    await this.processResponseData(firstPageHtml, interceptedData);

    while (!endOfResults && interceptedData.length < this.maxItems) {
      count++;
      console.log(`data fetching...${count}`);
      await this.randomDelay();

      await SCRAPER_CONFIG.autoScrollPage(page);
      await this.randomDelay(3000, 5000);

      if (SCRAPER_CONFIG.waitSelector) {
        await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 10000 }).catch(() => {
          console.log(`data not found...`);
        });
      }

      const afterScrollHtml = await page.content();
      await this.processResponseData(afterScrollHtml, interceptedData);

      if (interceptedData.length === lastDataLength) {
        // Nothing new on this pass; give up after enough consecutive misses.
        noNewDataCount++;
        if (noNewDataCount >= SCRAPER_CONFIG.maxNoNewDataCount) {
          endOfResults = true;
        }
      } else {
        noNewDataCount = 0;
        lastDataLength = interceptedData.length;
      }
    }

    console.log(`Scraping completed. Total items: ${interceptedData.length}`);
  }

  /**
   * Enables request interception on the page. When
   * `SCRAPER_CONFIG.filterResources` is true, requests for images, media,
   * fonts, text tracks, stylesheets and pings are aborted; everything else
   * continues unchanged.
   *
   * @param {object} page - Puppeteer page to instrument.
   * @param {Array<object>} interceptedData - Currently unused.
   * @returns {Promise<void>}
   */
  async setupInterceptors(page, interceptedData) {
    const blockedResourceTypes = new Set([
      'image',
      'media',
      'font',
      'texttrack',
      'stylesheet',
      'ping',
    ]);

    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (SCRAPER_CONFIG.filterResources && blockedResourceTypes.has(request.resourceType())) {
        request.abort();
      } else {
        request.continue();
      }
    });
    // Placeholder: responses are observed but not processed.
    page.on('response', async (response) => {});
  }

  /**
   * Parses one page of HTML, appends items not seen before to
   * `interceptedData` (mutated in place) and pushes them to the Actor dataset.
   * Stops adding items as soon as `this.maxItems` is reached.
   *
   * @param {string} htmlContent - Full HTML of the current page.
   * @param {Array<object>} interceptedData - Items collected so far.
   * @returns {Promise<void>}
   */
  async processResponseData(htmlContent, interceptedData) {
    if (!htmlContent) {
      return;
    }

    const $ = cheerio.load(htmlContent);
    const items = this.parsePageItems($);
    if (!items || items.length === 0) {
      return;
    }

    const seenKeys = new Set(interceptedData.map((item) => this.getDedupKey(item)));
    const currentData = [];

    for (const listing of items) {
      const key = this.getDedupKey(listing);
      if (seenKeys.has(key)) {
        continue;
      }

      const data = {
        searchUrl: this.searchUrl,
        ...listing,
        scrapedAt: new Date().toISOString(),
      };

      interceptedData.push(data);
      currentData.push(data);
      seenKeys.add(key);

      if (interceptedData.length >= this.maxItems) {
        break;
      }
    }

    if (currentData.length > 0) {
      console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
      await Actor.pushData(currentData);
    }
  }

  /**
   * Returns the key used to recognise an item that was already collected.
   *
   * The configured unique field is preferred. It can be empty (the id is
   * derived from the cover file name), and keying on an empty id would make
   * every such item look like a duplicate of the first one, so the series URL
   * and then the title are used as fallbacks.
   *
   * @param {object} item - Parsed or already stored item.
   * @returns {string} De-duplication key.
   */
  getDedupKey(item) {
    const id = item[SCRAPER_CONFIG.uniqueIdField];
    if (id) {
      return `id:${id}`;
    }
    if (item.seriesUrl) {
      return `url:${item.seriesUrl}`;
    }
    return `title:${item.title || ''}`;
  }

  /**
   * Sleeps for a random whole number of milliseconds in `[min, max]`.
   *
   * @param {number} [min=1000] - Lower bound in milliseconds.
   * @param {number} [max=3000] - Upper bound in milliseconds.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }

  /**
   * Placeholder hook for installing cookies on the page; currently does nothing.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<void>}
   */
  async setCookies(page) {}

  /**
   * Converts a proxy URL into the `{ host, port, username, password }` object
   * passed as the browser `proxy` option.
   *
   * @param {string} [proxyUrl] - Proxy URL, e.g. `http://user:pass@host:8000`.
   * @returns {{host: string, port: number, username: (string|undefined),
   *   password: (string|undefined)}|undefined} Proxy options, or `undefined`
   *   when no URL is given, it cannot be parsed, or host/port cannot be
   *   determined.
   */
  buildProxyOptions(proxyUrl) {
    if (!proxyUrl) {
      return undefined;
    }

    let parsed;
    try {
      parsed = new URL(proxyUrl);
    } catch (error) {
      // Log only the message: the URL itself may contain proxy credentials.
      console.warn(`Invalid proxy URL detected: ${error.message}`);
      return undefined;
    }

    // URL reports an empty port when it equals the scheme default
    // (e.g. `http://host:80`), so restore it instead of dropping the proxy.
    const defaultPorts = { 'http:': 80, 'https:': 443 };
    const port = parsed.port ? Number(parsed.port) : defaultPorts[parsed.protocol];
    if (!parsed.hostname || !port) {
      return undefined;
    }

    return {
      host: parsed.hostname,
      port,
      username: this.decodeCredential(parsed.username) || undefined,
      password: this.decodeCredential(parsed.password) || undefined,
    };
  }

  /**
   * Percent-decodes a credential taken from a parsed URL. `URL` exposes
   * `username`/`password` in encoded form, which would be sent verbatim to
   * the proxy otherwise.
   *
   * @param {string} value - Encoded credential.
   * @returns {string} Decoded credential, or the input unchanged when it is
   *   not valid percent-encoding.
   */
  decodeCredential(value) {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  }
}
362
363
364
365
366
// Actor entry point.
// Actor.main() initializes the Actor, runs the callback and then exits, so a
// separate Actor.init() call beforehand is redundant. The returned promise is
// awaited so that it is not left floating.
await Actor.main(async () => {
  // With no input at all, fall back to the configured default search and item cap.
  const input = (await Actor.getInput()) || {
    searchUrl: SCRAPER_CONFIG.requestUrl,
    maxItems: SCRAPER_CONFIG.maxItems,
  };

  const crawler = new BaseScraper();
  await crawler.run({
    ...input,
    // A partial input without a search URL still scrapes the default listing
    // instead of failing on navigation.
    searchUrl: input.searchUrl || SCRAPER_CONFIG.requestUrl,
  });
});