1import { Actor } from 'apify';
2import { Dataset, createCheerioRouter, log } from 'crawlee';
3
/**
 * Request labels used to route queued requests to the handlers registered on
 * the router in this file.
 */
export enum Label {
  /** One page of the advertiser search for a text query. */
  advertiserSearch = 'advertiserSearch',
  /** One page of ads belonging to a single advertiser. */
  search = 'search',
  /** Details of a single ad. */
  detail = 'detail',
}
9
/** Value sent as `top` (and used to derive `skip`) when listing ads. */
const ADS_PAGE_SIZE = 12;

/** Value sent as `top` (and used to derive `skip`) when listing advertisers. */
const ADVERTISERS_PAGE_SIZE = 12;

/** Router on which every handler in this file is registered. */
export const router = createCheerioRouter();
13
/**
 * Parses a response body as JSON.
 *
 * A leading byte-order mark (U+FEFF) is removed first, because `JSON.parse`
 * rejects it even when the rest of the text is valid JSON.
 *
 * @param body - Raw response text.
 * @param url - URL the body came from; only used in the error message.
 * @returns The parsed JSON value.
 * @throws Error when the body is not valid JSON. The message contains the URL
 *   and a whitespace-collapsed preview of the first 200 characters of the
 *   body, or `<empty body>` when there is nothing to preview.
 */
const parseJsonBody = (body: string, url: string) => {
  const text = body.charCodeAt(0) === 0xfeff ? body.slice(1) : body;

  try {
    return JSON.parse(text);
  } catch {
    const preview = text.slice(0, 200).replace(/\s+/g, ' ').trim();
    throw new Error(
      `Expected JSON from ${url}, got: ${preview || '<empty body>'}`
    );
  }
};
22
/**
 * Issues a request through the supplied `sendRequest` function (with
 * `http2: false`) and returns the response body parsed as JSON.
 *
 * @param sendRequest - Function that takes request options and resolves to a
 *   response exposing `statusCode` and `body`.
 * @param url - Address to request.
 * @returns The parsed JSON body.
 * @throws Error when the status code is 400 or higher, or when the body
 *   cannot be parsed as JSON.
 */
const requestJson = async (sendRequest: any, url: string) => {
  const options = { url, http2: false };
  const response = await sendRequest(options);
  const isErrorStatus = response.statusCode >= 400;

  if (isErrorStatus) {
    const message = `Request failed with status ${response.statusCode} for ${url}`;
    throw new Error(message);
  }

  return parseJsonBody(response.body, url);
};
37
/**
 * Default handler: reads the actor input and seeds the crawl.
 *
 * Input fields read: `query`, `advertiserId`, `maxPages`.
 * - With an `advertiserId`, a single `Label.search` request for page 1 of
 *   that advertiser is enqueued (the query, if any, travels in `userData`).
 * - Otherwise `maxPages` `Label.advertiserSearch` requests are enqueued up
 *   front, one per page of the advertiser search for `query`.
 *
 * All requests use `skipNavigation: true`; the real API URL is either built
 * by the target handler or passed in `userData.url`.
 *
 * @throws Error when neither `query` nor `advertiserId` is provided.
 */
router.addDefaultHandler(async ({ crawler }) => {
  const input = await Actor.getInput<any>();

  // Truncate before clamping: a fractional value such as 2.5 would otherwise
  // enqueue 2 advertiser pages here while the `page < maxPages` check in the
  // search handler would still allow a third page of ads.
  const maxPages = Math.max(1, Math.floor(Number(input?.maxPages ?? 1)) || 1);

  const query = typeof input?.query === 'string' ? input.query.trim() : '';

  // Accept a numeric id as well as a string one, so that a JSON input like
  // `{ "advertiserId": 123 }` is not silently ignored.
  const rawAdvertiserId: unknown = input?.advertiserId;
  let advertiserId = '';
  if (typeof rawAdvertiserId === 'string') {
    advertiserId = rawAdvertiserId.trim();
  } else if (
    typeof rawAdvertiserId === 'number' &&
    Number.isFinite(rawAdvertiserId)
  ) {
    advertiserId = String(rawAdvertiserId);
  }

  if (!query && !advertiserId) {
    throw new Error('Either "query" or "advertiserId" must be provided.');
  }

  log.info('Default handler', {
    query: query || undefined,
    advertiserId: advertiserId || undefined,
    maxPages,
  });

  if (advertiserId) {
    await crawler.addRequests([
      {
        url: `https://adlibrary.ads.microsoft.com/search/${advertiserId}/1`,
        label: Label.search,
        userData: {
          advertiserId,
          query,
          maxPages,
          page: 1,
        },
        skipNavigation: true,
      },
    ]);

    return;
  }

  await crawler.addRequests(
    Array.from({ length: maxPages }, (_, i) => {
      const apiUrl = new URL(
        'https://adlibrary.api.bingads.microsoft.com/api/v1/Advertisers'
      );
      apiUrl.searchParams.set('searchText', query);
      apiUrl.searchParams.set('top', ADVERTISERS_PAGE_SIZE.toString());
      apiUrl.searchParams.set('skip', (i * ADVERTISERS_PAGE_SIZE).toString());

      return {
        // The request URL differs per page; the API URL to call is carried
        // in `userData.url`.
        url: `https://adlibrary.ads.microsoft.com/advertisers/${i}`,
        label: Label.advertiserSearch,
        userData: {
          query,
          page: i + 1,
          url: apiUrl.toString(),
          maxPages,
        },
        skipNavigation: true,
      };
    })
  );
});
96
/**
 * Handles one page of the advertiser search.
 *
 * Fetches the API URL stored in `userData.url`, reads the `value` array of
 * the response, and enqueues a page-1 `Label.search` request for every
 * advertiser that exposes an id (`Id`, `AdvertiserId` or `id`). Advertisers
 * without an id are skipped.
 */
router.addHandler(
  Label.advertiserSearch,
  async ({ request, crawler, sendRequest }) => {
    const { query, page, url, maxPages } = request.userData;

    log.info('Advertiser search handler', { page, query, maxPages });

    const payload: any = await requestJson(sendRequest, url.toString());
    const advertisers: any[] = Array.isArray(payload?.value)
      ? payload.value
      : [];

    log.info('Advertiser search result', { page, items: advertisers.length });

    const searchRequests: {
      url: string;
      label: Label;
      skipNavigation: boolean;
      userData: Record<string, unknown>;
    }[] = [];

    for (const advertiser of advertisers) {
      const advertiserId =
        advertiser?.Id ?? advertiser?.AdvertiserId ?? advertiser?.id;

      if (advertiserId == null) {
        continue;
      }

      searchRequests.push({
        url: `https://adlibrary.ads.microsoft.com/search/${advertiserId}/1`,
        label: Label.search,
        skipNavigation: true,
        userData: { advertiserId, maxPages, page: 1 },
      });
    }

    await crawler.addRequests(searchRequests);
  }
);
140
/**
 * Handles one page of an advertiser's ads.
 *
 * Builds the Ads API URL from `userData` (`advertiserId`, `page`, optional
 * `query`), fetches it, and enqueues one `Label.detail` request per returned
 * ad that has an `AdId`. When the page came back full (`ADS_PAGE_SIZE` items)
 * and `page < maxPages`, the next page is enqueued as well.
 */
router.addHandler(Label.search, async ({ request, crawler, sendRequest }) => {
  const { advertiserId, query, maxPages, page } = request.userData;

  const apiUrl = new URL(
    'https://adlibrary.api.bingads.microsoft.com/api/v1/Ads'
  );
  apiUrl.searchParams.set('advertiserId', String(advertiserId));
  apiUrl.searchParams.set('top', ADS_PAGE_SIZE.toString());
  apiUrl.searchParams.set('skip', ((page - 1) * ADS_PAGE_SIZE).toString());

  if (query) {
    apiUrl.searchParams.set('searchText', String(query));
  }

  const res: any = await requestJson(sendRequest, apiUrl.toString());
  const ads: any[] = Array.isArray(res?.value) ? res.value : [];

  log.info('Search handler', {
    page,
    items: ads.length,
  });

  const detailRequests = ads.flatMap((ad: any) => {
    const adId = ad?.AdId;

    // Skip entries without an id instead of enqueueing ".../Ads/undefined",
    // mirroring how advertisers without an id are skipped.
    if (adId == null) {
      return [];
    }

    return [
      {
        url: `https://adlibrary.ads.microsoft.com/detail/${adId}`,
        label: Label.detail,
        skipNavigation: true,
        userData: {
          adId,
          url: `https://adlibrary.api.bingads.microsoft.com/api/v1/Ads/${adId}?expand=AdDetails(expand=ImpressionsByCountry,Targets)`,
        },
      },
    ];
  });

  // `forefront` is an option of `addRequests`, not a field of the individual
  // request objects, so it is passed as the second argument.
  await crawler.addRequests(detailRequests, { forefront: true });

  // Pagination is decided on the raw page size, not on the filtered list.
  if (ads.length === ADS_PAGE_SIZE && page < maxPages) {
    await crawler.addRequests([
      {
        url: `https://adlibrary.ads.microsoft.com/search/${advertiserId}/${page + 1}`,
        label: Label.search,
        skipNavigation: true,
        userData: {
          advertiserId,
          query,
          maxPages,
          page: page + 1,
        },
      },
    ]);
  }
});
196
/**
 * Converts an impression-share value to a number.
 *
 * Strings have their first `%` removed before conversion; numbers are
 * returned unchanged; anything else yields `NaN`.
 */
const parseImpressionShare = (raw: unknown): number => {
  if (typeof raw === 'number') {
    return raw;
  }
  if (typeof raw !== 'string') {
    return Number.NaN;
  }
  return Number(raw.replace('%', '').trim());
};

/**
 * Handles a single ad: fetches the detail API URL stored in `userData.url`,
 * maps the response to a flat record and pushes it to the dataset.
 *
 * A response without `AdDetails` still produces a record (the detail fields
 * are left undefined and the lists empty) and logs a warning, instead of
 * failing the request with a TypeError.
 *
 * @throws Error when the response is not a JSON object.
 */
router.addHandler(Label.detail, async ({ request, sendRequest }) => {
  const { adId, url } = request.userData;

  log.info('Detail handler', {
    adId,
  });

  const res: any = await requestJson(sendRequest, url.toString());

  if (res == null || typeof res !== 'object') {
    throw new Error(`Unexpected detail response for ad ${adId}`);
  }

  if (res.AdDetails == null) {
    log.warning('Ad details missing in response', { adId });
  }

  const details: any = res.AdDetails ?? {};
  const rawCountries: any[] = Array.isArray(details.ImpressionsByCountry)
    ? details.ImpressionsByCountry
    : [];
  const rawTargets: any[] = Array.isArray(details.Targets)
    ? details.Targets
    : [];

  // Unparseable shares (NaN) are ranked last so they cannot scramble the
  // descending order of the valid ones.
  const rank = (share: number): number =>
    Number.isNaN(share) ? Number.NEGATIVE_INFINITY : share;

  const ad = {
    adId: res.AdId,
    adTitle: res.Title,
    adDescription: res.Description,
    adDestination: res.DestinationUrl,
    advertiserId: res.AdvertiserId,
    advertiserName: res.AdvertiserName,
    advertiserPaidForBy: details.PaidForByName,
    adStartDate: details.StartDate,
    adEndDate: details.EndDate,
    adImpressions: details.TotalImpressionsRange,
    impressionsByCountry: rawCountries
      .map((entry: any) => ({
        country: entry?.Country,
        impressionShare: parseImpressionShare(entry?.ImpressionShare),
      }))
      .sort((a, b) => rank(b.impressionShare) - rank(a.impressionShare)),
    targets: rawTargets.map((target: any) => ({
      targetType: target?.TargetType,
      usedForExclusion: target?.UsedForExclusion,
    })),
  };

  await Dataset.pushData(ad);
});