1import { Actor } from 'apify';
2import axios from 'axios';
3
// Single origin shared by every similarsites.com URL built in this file.
const SIMILARSITES_ORIGIN = 'https://similarsites.com';

// JSON endpoint; the URL-encoded domain is appended as the last path segment.
const API_BASE = `${SIMILARSITES_ORIGIN}/api/site`;

// Human-facing site page; used to build the Referer header sent with API calls.
const SITE_BASE = `${SIMILARSITES_ORIGIN}/site`;
6
// Request headers sent with every API call. They are spread into each
// request's header object, where a per-domain Referer is added alongside them.
const DEFAULT_HEADERS = {
  // Content negotiation and cache control.
  'accept': '*/*',
  'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
  'cache-control': 'no-cache',
  'pragma': 'no-cache',

  // Client hints.
  'sec-ch-ua': '"Chromium";v="148", "Google Chrome";v="148", "Not/A)Brand";v="99"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',

  // Fetch metadata.
  'sec-fetch-dest': 'empty',
  'sec-fetch-mode': 'cors',
  'sec-fetch-site': 'same-origin',

  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
};
21
/**
 * Fetches "similar sites" data for a list of domains from the similarsites.com
 * JSON API and pushes one dataset record per similar site (or one error record
 * per domain that failed or returned nothing).
 */
class SimilarSitesScraper {
  /**
   * Reduce a user-supplied domain or URL to a bare lowercase host name.
   *
   * Strips an `http(s)://` or protocol-relative `//` prefix, anything from the
   * first `/`, `?` or `#` onwards, a trailing `:port`, and a leading `www.`.
   *
   * @param {*} value - Raw input; falsy values are treated as empty.
   * @returns {string} The normalized host, or `''` when nothing usable remains.
   */
  normalizeDomain(value) {
    const raw = String(value || '').trim();
    if (!raw) return '';

    const host = raw
      .replace(/^(?:https?:)?\/\//i, '')
      // Path, query string and fragment are not part of the host.
      .split(/[/?#]/)[0]
      // A port would otherwise be percent-encoded into the API path.
      .replace(/:\d+$/, '')
      .replace(/^www\./i, '');

    return host.toLowerCase();
  }

  /**
   * @param {string} domain - Normalized domain.
   * @returns {string} API URL with the domain as an encoded path segment.
   */
  buildApiUrl(domain) {
    return `${API_BASE}/${encodeURIComponent(domain)}`;
  }

  /**
   * @param {string} domain - Normalized domain.
   * @returns {string} Site-page URL used as the Referer for the API request.
   */
  buildReferer(domain) {
    return `${SITE_BASE}/${encodeURIComponent(domain)}`;
  }

  /**
   * Translate a proxy URL into an axios `proxy` request-config fragment.
   *
   * @param {string} [proxyUrl] - e.g. `http://user:pass@host:8000`.
   * @returns {object} `{}` when no proxy URL is given, otherwise `{ proxy }`.
   * @throws {TypeError} If `proxyUrl` is not a parseable URL.
   */
  getAxiosConfig(proxyUrl) {
    if (!proxyUrl) return {};

    const parsed = new URL(proxyUrl);
    const protocol = parsed.protocol.replace(':', '');

    // URL#port is '' both when the port is omitted and when it equals the
    // scheme default (e.g. http://host:80), so Number(parsed.port) would give
    // 0. Fall back to the scheme's default port instead.
    const defaultPort = protocol === 'https' ? 443 : 80;
    const port = parsed.port ? Number(parsed.port) : defaultPort;

    const proxy = { protocol, host: parsed.hostname, port };
    if (parsed.username) {
      // Credentials are percent-encoded inside a URL; axios expects raw values.
      proxy.auth = {
        username: decodeURIComponent(parsed.username),
        password: decodeURIComponent(parsed.password),
      };
    }

    return { proxy };
  }

  /**
   * Build one output record for a single entry of the API's `SimilarSites` list.
   *
   * @param {string} sourceDomain - Domain that was queried.
   * @param {object} sourceInfo - Result of {@link extractSourceInfo}.
   * @param {object} item - One raw `SimilarSites` entry; a null/undefined
   *   entry yields a record whose item-derived fields are all null.
   * @returns {object} Flat dataset record.
   */
  mapSimilarSiteItem(sourceDomain, sourceInfo, item) {
    const site = item?.Site || null;

    return {
      sourceDomain,
      sourceTitle: sourceInfo.title,
      sourceDescription: sourceInfo.description,
      sourceCategory: sourceInfo.category,
      sourceTotalVisits: sourceInfo.totalVisits,
      site,
      description: item?.Description || null,
      category: item?.Category || null,
      // `??` rather than `||` so a legitimate 0 is kept.
      topCountryRank: item?.TopCountryRank ?? null,
      totalVisits: item?.TotalVisits ?? null,
      similarityRank: item?.SimilarityRank ?? null,
      grade: item?.Grade ?? null,
      thumbnail: item?.Thumbnail || null,
      siteUrl: site ? `https://${site}` : null,
      scrapedAt: new Date().toISOString(),
    };
  }

  /**
   * Pick the source-site fields out of the API response body.
   *
   * @param {object} [responseBody] - Parsed API response; may be nullish.
   * @returns {object} Source-site summary with missing fields set to null.
   */
  extractSourceInfo(responseBody) {
    return {
      title: responseBody?.Title || null,
      description: responseBody?.Description || null,
      category: responseBody?.Category || null,
      totalVisits: responseBody?.TotalVisits ?? null,
      categoryRank: responseBody?.CategoryRank ?? null,
      thumbnail: responseBody?.Thumbnail || null,
      favicon: responseBody?.Favicon || null,
      tags: responseBody?.Tags || null,
    };
  }

  /**
   * Scrape every domain in the input, one after another.
   *
   * @param {object} input
   * @param {Array} input.domains - Domains or URLs to look up (required, non-empty).
   * @param {number} [input.maxItems=Infinity] - Maximum similar sites saved
   *   per domain.
   * @param {object} [input.proxyConfiguration] - Passed to
   *   `Actor.createProxyConfiguration` when truthy.
   * @returns {Promise<void>}
   * @throws {Error} If `domains` is missing, not an array, or empty.
   */
  async run(input) {
    // `?? {}` so a missing input reaches the validation error below instead of
    // failing with an opaque destructuring TypeError.
    const { domains, maxItems = Infinity, proxyConfiguration } = input ?? {};

    if (!Array.isArray(domains) || domains.length === 0) {
      throw new Error('Input must include a non-empty domains array');
    }

    this.maxItems = maxItems;

    const proxyConfig = proxyConfiguration
      ? await Actor.createProxyConfiguration(proxyConfiguration)
      : undefined;

    const targets = domains
      .map((rawDomain) => this.normalizeDomain(rawDomain))
      .filter(Boolean);

    for (const [index, domain] of targets.entries()) {
      // Pause between requests only; there is nothing to wait for after the last one.
      if (index > 0) {
        await this.randomDelay(500, 1500);
      }

      const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
      const axiosConfig = this.getAxiosConfig(proxyUrl);

      await this.scrapeDomain(domain, axiosConfig);
    }
  }

  /**
   * Fetch similar sites for one domain and push the results to the dataset.
   * A failed request or an empty result pushes a single error record instead
   * of throwing.
   *
   * @param {string} domain - Normalized domain.
   * @param {object} axiosConfig - Extra axios request config (e.g. proxy).
   * @returns {Promise<void>}
   */
  async scrapeDomain(domain, axiosConfig) {
    const apiUrl = this.buildApiUrl(domain);
    console.log(`Fetching similar sites for "${domain}"...`);

    let responseBody;
    try {
      const response = await axios.get(apiUrl, {
        ...axiosConfig,
        headers: {
          ...DEFAULT_HEADERS,
          Referer: this.buildReferer(domain),
        },
        timeout: 60000,
      });
      responseBody = response.data;
    } catch (error) {
      // Prefer the API's own message when the server answered with one.
      const message = error.response?.data?.message || error.message;
      console.error(`API request failed for "${domain}":`, message);
      await Actor.pushData([
        {
          sourceDomain: domain,
          error: message,
          scrapedAt: new Date().toISOString(),
        },
      ]);
      return;
    }

    const similarSites = Array.isArray(responseBody?.SimilarSites) ? responseBody.SimilarSites : [];
    const sourceInfo = this.extractSourceInfo(responseBody);

    if (similarSites.length === 0) {
      console.log(`No similar sites found for "${domain}"`);
      await Actor.pushData([
        {
          sourceDomain: domain,
          error: 'No similar sites found',
          scrapedAt: new Date().toISOString(),
        },
      ]);
      return;
    }

    // Only a finite, non-negative maxItems limits the output; a negative value
    // would make slice() drop items from the end instead of capping the count.
    const hasLimit = Number.isFinite(this.maxItems) && this.maxItems >= 0;
    const limit = hasLimit ? this.maxItems : similarSites.length;
    const results = similarSites
      .slice(0, limit)
      .map((item) => this.mapSimilarSiteItem(domain, sourceInfo, item));

    // Log after the push so the message is only printed once the data is stored.
    await Actor.pushData(results);
    console.log(`Saved ${results.length} similar sites for "${domain}"`);
  }

  /**
   * Sleep for a random whole number of milliseconds in `[min, max]`.
   *
   * @param {number} [min=500] - Lower bound in milliseconds (inclusive).
   * @param {number} [max=1500] - Upper bound in milliseconds (inclusive).
   * @returns {Promise<void>}
   */
  async randomDelay(min = 500, max = 1500) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
}
181
// Actor.main() initialises the actor before running the callback and exits it
// afterwards, so a separate Actor.init() call is not needed. Awaiting the call
// avoids leaving its promise floating.
await Actor.main(async () => {
  // Sample input used only when the run provides no input of its own.
  const fallbackInput = {
    domains: ['apify.com', 'tiktok.com'],
    proxyConfiguration: {
      useApifyProxy: false,
    },
  };

  // Prefer the actor's real input so the configured domains are the ones scraped.
  const input = (await Actor.getInput()) ?? fallbackInput;

  const scraper = new SimilarSitesScraper();
  await scraper.run(input);
});