1
2
3
4
5
6
7
8
9import { Actor, log } from "apify";
10import { CheerioCrawler, type CheerioAPI } from "@crawlee/cheerio";
11
12
13
/** Actor input schema (validated loosely at runtime — see the destructuring defaults below). */
interface Input {
  /** Which built-in site configuration to use, or "custom" for user-supplied startUrls. */
  source: "yahoo-news" | "rakuten-search" | "suumo-listings" | "tabelog-restaurants" | "hotpepper-gourmet" | "custom";
  /** Free-text query (product keyword, area name, …); its meaning depends on the source. */
  searchQuery?: string;
  /** Category slug; only consumed by the yahoo-news source (e.g. "domestic"). */
  category?: string;
  /** Pages to fetch in "custom" mode; ignored for built-in sources. */
  /** NOTE(review): the URL-building code also tolerates bare string entries — confirm against the input schema. */
  startUrls?: Array<{ url: string }>;
  /** Hard cap on items pushed to the dataset from list pages. */
  maxItems: number;
  /** Number of paginated list pages to enqueue per built-in source. */
  maxPages: number;
  /** When true, detail pages are crawled in a second pass for full content. */
  extractFullContent: boolean;
  /** Passed verbatim to Actor.createProxyConfiguration. */
  proxyConfiguration?: any;
}
24
/** One normalized record pushed to the dataset, regardless of source. */
interface ScrapedItem {
  /** Source key ("yahoo-news", …); detail-pass records use "<source>-detail". */
  source: string;
  title: string;
  url: string;
  category?: string;
  description?: string;
  /** Full body text; only populated by the detail pass / custom parser. */
  content?: string;
  /** Raw display string (e.g. "¥1,234") — deliberately not parsed to a number. */
  price?: string;
  /** Raw display string (e.g. "3.52") — kept as shown on the page. */
  rating?: string;
  reviewCount?: number;
  imageUrl?: string;
  /** NOTE(review): declared but never populated by any parser in this file. */
  publishedAt?: string;
  location?: string;
  /** Free-form extras; the custom parser stores an anchor-text → href map here. */
  metadata?: Record<string, string>;
  /** ISO-8601 timestamp of when this record was extracted. */
  extractedAt: string;
}
41
42
43
/** Strategy object describing how to scrape one supported site. */
interface SourceConfig {
  /** Builds the first list-page URL from the user's query and/or category. */
  buildStartUrl: (query?: string, category?: string) => string;
  /** Appends the site's pagination parameter to the start URL for a given 1-based page. */
  buildPageUrl: (baseUrl: string, page: number) => string;
  /** Extracts items from a fetched list page; `url` is the page's own URL. */
  parseListPage: ($: CheerioAPI, url: string) => ScrapedItem[];
  /** Optional detail-page parser used by the second pass when extractFullContent is set. */
  parseDetailPage?: ($: CheerioAPI, url: string) => Partial<ScrapedItem>;
  /** NOTE(review): set by yahoo-news but never read anywhere in this file. */
  detailLinkSelector?: string;
  /** NOTE(review): never set or read; charset handling is done via additionalMimeTypes instead. */
  encoding?: string;
}
52
/**
 * Built-in site configurations. Each entry knows how to build a start URL,
 * paginate it, and parse a list page into ScrapedItem records; yahoo-news
 * additionally knows how to parse an article detail page.
 *
 * NOTE(review): all CSS selectors below target third-party markup and are
 * best-effort for the site layouts current when this was written — expect
 * them to need maintenance as the sites change.
 */
const SOURCES: Record<string, SourceConfig> = {
  // Yahoo! News Japan — category listing pages.
  "yahoo-news": {
    buildStartUrl: (_query, category) => {
      // Categories are path slugs; "domestic" is the default feed.
      const cat = category || "domestic";
      return `https://news.yahoo.co.jp/categories/${cat}`;
    },
    buildPageUrl: (baseUrl, page) => `${baseUrl}?page=${page}`,
    parseListPage: ($, _url) => {
      const items: ScrapedItem[] = [];
      // Dedupe so the same story linked twice on the page is kept once.
      const seen = new Set<string>();

      $("a[href*='/articles/'], a[href*='/pickup/']").each((_, el) => {
        const $el = $(el);
        const url = $el.attr("href") || "";

        // Prefer the /articles/<id> slug as the dedupe key; fall back to
        // the whole href (e.g. for /pickup/ links).
        const articleId = url.match(/\/articles\/([a-z0-9]+)/i)?.[1] || url;
        if (seen.has(articleId)) return;
        seen.add(articleId);

        const allText = $el.text().trim();

        // Strip the trailing "M/D(曜日) HH:MM 配信 …" timestamp blob that
        // the anchor text carries after the headline.
        const title = allText.replace(/\d+\/\d+\([^)]+\)\s*\d+:\d+\s*配信.*$/s, "").trim();

        // length > 5 filters out navigation/stub anchors with no real headline.
        if (title && title.length > 5 && url) {
          items.push({
            source: "yahoo-news",
            title,
            // Hrefs may be relative; absolutize against the Yahoo host.
            url: url.startsWith("http") ? url : `https://news.yahoo.co.jp${url}`,
            extractedAt: new Date().toISOString(),
          });
        }
      });
      return items;
    },
    parseDetailPage: ($, url) => {
      // Several body-container class names observed across article layouts.
      const content = $("article .article_body, .highLightSearchTarget, [class*='articleBody']")
        .text()
        .trim();
      const category = $("meta[property='article:section']").attr("content") || undefined;
      return { content, category, url };
    },
    // NOTE(review): declared but unused — nothing in this file reads it.
    detailLinkSelector: "a[href*='news.yahoo.co.jp/articles']",
  },

  // Rakuten Ichiba product search results.
  "rakuten-search": {
    buildStartUrl: (query) => {
      const q = encodeURIComponent(query || "");
      return `https://search.rakuten.co.jp/search/mall/${q}/`;
    },
    buildPageUrl: (baseUrl, page) => `${baseUrl}?p=${page}`,
    parseListPage: ($, _url) => {
      const items: ScrapedItem[] = [];
      $(".searchresultitem, [class*='dui-card'], .item").each((_, el) => {
        const $el = $(el);
        // The product link doubles as the title element.
        const titleEl = $el.find("a[href*='item.rakuten.co.jp'], .title a, h2 a").first();
        const title = titleEl.text().trim();
        const url = titleEl.attr("href") || "";
        const price = $el.find(".price, [class*='price'], .important").first().text().trim();
        const imageUrl = $el.find("img").first().attr("src") || undefined;
        const rating = $el.find("[class*='rating'], [class*='review'] span").first().text().trim() || undefined;
        const reviewText = $el.find("[class*='reviewCount'], [class*='review-num']").first().text().trim();
        // Keep digits only (text is e.g. "1,234件"); a parse of 0/NaN becomes undefined.
        const reviewCount = reviewText ? parseInt(reviewText.replace(/[^0-9]/g, ""), 10) || undefined : undefined;

        if (title && url) {
          items.push({
            source: "rakuten-search",
            title,
            url,
            price: price || undefined,
            rating: rating || undefined,
            reviewCount,
            imageUrl,
            extractedAt: new Date().toISOString(),
          });
        }
      });
      return items;
    },
  },

  // SUUMO rental-property listings.
  "suumo-listings": {
    buildStartUrl: (query) => {
      // Free-word search over the Tokyo-area rental index; defaults to 東京都.
      const area = encodeURIComponent(query || "東京都");
      return `https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&fw=${area}`;
    },
    // Start URL already has a query string, so append with '&'.
    buildPageUrl: (baseUrl, page) => `${baseUrl}&pn=${page}`,
    parseListPage: ($, _url) => {
      const items: ScrapedItem[] = [];
      $(".cassetteitem, [class*='property_unit']").each((_, el) => {
        const $el = $(el);
        const title = $el.find(".cassetteitem_content-title, h2, .property_unit-title").first().text().trim();
        const detailLink = $el.find("a[href*='suumo.jp']").first().attr("href") || "";
        const price = $el.find(".cassetteitem_price--rent, [class*='price']").first().text().trim();
        const location = $el.find(".cassetteitem_detail-col1, [class*='address']").first().text().trim();
        // Concatenated detail columns, capped at 200 chars.
        const description = $el.find(".cassetteitem_detail-col2, [class*='detail']").text().trim().slice(0, 200);

        if (title) {
          items.push({
            source: "suumo-listings",
            title,
            url: detailLink.startsWith("http") ? detailLink : `https://suumo.jp${detailLink}`,
            price: price || undefined,
            location: location || undefined,
            description: description || undefined,
            extractedAt: new Date().toISOString(),
          });
        }
      });
      return items;
    },
  },

  // Tabelog restaurant search results.
  "tabelog-restaurants": {
    buildStartUrl: (query) => {
      const area = encodeURIComponent(query || "東京"));
      return `https://tabelog.com/rstLst/?vs=1&sa=${area}&sk=&lid=top_navi1&vac_net=&svd=&svt=&svps=&hfc=1&Cat=&LstCat=&LstCatD=&LstCatSD=&LstCosT=&LstRange=&RdoCosTp=2&LstCos=&LstRev=&LstSitu=&LstReserve=&ChkParking=`;
    },
    buildPageUrl: (baseUrl, page) => `${baseUrl}&PG=${page}`,
    parseListPage: ($, _url) => {
      const items: ScrapedItem[] = [];
      $(".list-rst, .rstlist-info, [class*='restaurant']").each((_, el) => {
        const $el = $(el);
        const titleEl = $el.find(".list-rst__rst-name a, h3 a, .rstlist-info__name a").first();
        const title = titleEl.text().trim();
        const url = titleEl.attr("href") || "";
        const rating = $el.find(".list-rst__rating-val, [class*='rating'] b, .c-rating__val").first().text().trim();
        const reviewText = $el.find(".list-rst__rvw-count, [class*='review-count']").first().text().trim();
        // Digits-only parse; 0/NaN collapses to undefined.
        const reviewCount = reviewText ? parseInt(reviewText.replace(/[^0-9]/g, ""), 10) || undefined : undefined;
        const category = $el.find(".list-rst__catg, [class*='category']").first().text().trim();
        const location = $el.find(".list-rst__area, [class*='area']").first().text().trim();
        const price = $el.find(".list-rst__budget, [class*='budget']").first().text().trim();

        if (title && url) {
          items.push({
            source: "tabelog-restaurants",
            title,
            url: url.startsWith("http") ? url : `https://tabelog.com${url}`,
            category: category || undefined,
            rating: rating || undefined,
            reviewCount,
            location: location || undefined,
            price: price || undefined,
            extractedAt: new Date().toISOString(),
          });
        }
      });
      return items;
    },
  },

  // Hot Pepper Gourmet restaurant search results.
  "hotpepper-gourmet": {
    buildStartUrl: (query) => {
      const area = encodeURIComponent(query || "東京");
      return `https://www.hotpepper.jp/CSP/psh010/doRefine/?FW=${area}`;
    },
    buildPageUrl: (baseUrl, page) => {
      // Page 1 is the bare start URL; only later pages carry &PN=.
      const pageParam = page > 1 ? `&PN=${page}` : "";
      return `${baseUrl}${pageParam}`;
    },
    parseListPage: ($, _url) => {
      const items: ScrapedItem[] = [];
      $(".shopListItem, [class*='shop-card'], .cassetteitem").each((_, el) => {
        const $el = $(el);
        const titleEl = $el.find("h3 a, .shopDetailStoreName a, .shopName a").first();
        const title = titleEl.text().trim();
        const url = titleEl.attr("href") || "";
        const category = $el.find(".shopDetailCatch, [class*='genre']").first().text().trim();
        const location = $el.find(".shopDetailAddress, [class*='address']").first().text().trim();
        const price = $el.find(".shopDetailBudget, [class*='budget']").first().text().trim();
        // 'imgfp' restricts to Hot Pepper's image CDN, skipping icons/trackers.
        const imageUrl = $el.find("img[src*='imgfp']").first().attr("src") || undefined;

        if (title && url) {
          items.push({
            source: "hotpepper-gourmet",
            title,
            url: url.startsWith("http") ? url : `https://www.hotpepper.jp${url}`,
            category: category || undefined,
            location: location || undefined,
            price: price || undefined,
            imageUrl,
            extractedAt: new Date().toISOString(),
          });
        }
      });
      return items;
    },
  },
};
242
243
244
245const customSourceConfig: SourceConfig = {
246 buildStartUrl: () => "",
247 buildPageUrl: (baseUrl) => baseUrl,
248 parseListPage: ($, url) => {
249 const items: ScrapedItem[] = [];
250 const title = $("title").text().trim();
251 const description = $("meta[name='description']").attr("content") || "";
252 const content = $("main, article, #content, .content, body")
253 .first()
254 .text()
255 .trim()
256 .slice(0, 5000);
257
258
259 const links: Record<string, string> = {};
260 $("a[href]").each((_, el) => {
261 const href = $(el).attr("href") || "";
262 const text = $(el).text().trim();
263 if (text && href && !href.startsWith("#") && !href.startsWith("javascript:")) {
264 links[text.slice(0, 100)] = href;
265 }
266 });
267
268 items.push({
269 source: "custom",
270 title,
271 url,
272 description,
273 content,
274 metadata: links,
275 extractedAt: new Date().toISOString(),
276 });
277 return items;
278 },
279};
280
281
282
// ---------------------------------------------------------------------------
// Actor bootstrap: read input, resolve the source configuration.
// ---------------------------------------------------------------------------
await Actor.init();

// NOTE(review): the `as Input` cast means nominally required fields
// (maxItems, maxPages, extractFullContent) may actually be missing at
// runtime; the destructuring defaults below are what really guards that.
const input = (await Actor.getInput<Input>()) ?? ({} as Input);

const {
  source = "yahoo-news",
  searchQuery,
  category,
  startUrls,
  maxItems = 50,
  maxPages = 5,
  extractFullContent = false,
  proxyConfiguration,
} = input;

log.info(`Starting ${source} scraper. Query: "${searchQuery || "none"}", Max: ${maxItems} items`);

// "custom" uses the generic page parser; everything else is a built-in config.
const sourceConfig = source === "custom" ? customSourceConfig : SOURCES[source];

if (!sourceConfig) {
  throw new Error(`Unknown source: ${source}. Available: ${Object.keys(SOURCES).join(", ")}, custom`);
}
305
306
307let urls: string[] = [];
308if (source === "custom" && startUrls) {
309 urls = startUrls.map((u) => (typeof u === "string" ? u : u.url));
310} else {
311 const startUrl = sourceConfig.buildStartUrl(searchQuery, category);
312
313 for (let page = 1; page <= maxPages; page++) {
314 urls.push(sourceConfig.buildPageUrl(startUrl, page));
315 }
316}
317
318if (urls.length === 0) {
319 throw new Error("No URLs to scrape. Provide startUrls for custom mode or searchQuery for other modes.");
320}
321
// Running totals shared between the two crawler passes.
let totalItems = 0;
const detailUrls: string[] = [];

// Optional Apify proxy; undefined means direct connections.
const proxyConfig = proxyConfiguration
  ? await Actor.createProxyConfiguration(proxyConfiguration)
  : undefined;
329
330
331const listCrawler = new CheerioCrawler({
332 proxyConfiguration: proxyConfig,
333 maxRequestsPerCrawl: maxPages * 2,
334 requestHandlerTimeoutSecs: 60,
335 additionalMimeTypes: ["text/html; charset=euc-jp", "text/html; charset=shift_jis"],
336 requestHandler: async ({ $, request }) => {
337 log.info(`Crawling list page: ${request.url}`);
338
339 const items = sourceConfig.parseListPage($, request.url);
340 log.info(`Found ${items.length} items on ${request.url}`);
341
342 for (const item of items) {
343 if (totalItems >= maxItems) break;
344 await Actor.pushData(item);
345 totalItems++;
346
347
348 if (extractFullContent && sourceConfig.parseDetailPage && item.url) {
349 detailUrls.push(item.url);
350 }
351 }
352
353 if (totalItems >= maxItems) {
354 log.info(`Reached max items (${maxItems}). Stopping.`);
355 }
356 },
357 failedRequestHandler: async ({ request }) => {
358 log.warning(`Failed: ${request.url}`);
359 },
360});
361
362await listCrawler.run(urls);
363
364
365if (extractFullContent && sourceConfig.parseDetailPage && detailUrls.length > 0) {
366 log.info(`Extracting full content from ${detailUrls.length} detail pages...`);
367
368 const detailCrawler = new CheerioCrawler({
369 proxyConfiguration: proxyConfig,
370 maxRequestsPerCrawl: Math.min(detailUrls.length, maxItems),
371 requestHandlerTimeoutSecs: 30,
372 requestHandler: async ({ $, request }) => {
373 if (!sourceConfig.parseDetailPage) return;
374 const detail = sourceConfig.parseDetailPage($, request.url);
375
376
377
378 if (detail.content) {
379 await Actor.pushData({
380 source: `${source}-detail`,
381 url: request.url,
382 ...detail,
383 extractedAt: new Date().toISOString(),
384 });
385 }
386 },
387 failedRequestHandler: async ({ request }) => {
388 log.warning(`Detail page failed: ${request.url}`);
389 },
390 });
391
392 await detailCrawler.run(detailUrls);
393}
394
395log.info(`Scraping complete. Total items: ${totalItems}`);
396await Actor.exit();