"""Run the `jurooravec/profesia-sk-scraper` Apify Actor and print its results."""
import os

from apify_client import ApifyClient

# Initialize the ApifyClient with your Apify API token.
# Prefer the APIFY_TOKEN environment variable so the token is not committed to
# source control; the placeholder fallback preserves the original template
# behavior (paste your token there if you don't use the env var).
client = ApifyClient(os.environ.get("APIFY_TOKEN", "<YOUR_API_TOKEN>"))
# Input for the `jurooravec/profesia-sk-scraper` Actor run.
#
# NOTE(review): The original input also carried the Actor's boilerplate
# "custom function" fields (`inputExtendFromFunction`, `startUrlsFromFunction`,
# `requestTransform`/`requestTransformBefore`/`requestTransformAfter`,
# `requestFilter`/`requestFilterBefore`/`requestFilterAfter`,
# `outputTransform`/`outputTransformBefore`/`outputTransformAfter`,
# `outputFilter`/`outputFilterBefore`/`outputFilterAfter`) whose values were
# the unmodified template defaults — strings containing ONLY commented-out
# example JavaScript, i.e. no executable code. They are omitted here on the
# assumption that omitting an optional input field makes the Actor fall back
# to its own default (TODO: confirm against the Actor's input schema). Add a
# field back only when you actually implement a custom function for it.
run_input = {
    # Scrape job offers (as opposed to other dataset types the Actor supports).
    "datasetType": "jobOffers",
    # Interpret the minimum-salary filter as a per-month figure.
    "jobOfferFilterMinSalaryPeriod": "month",
    # Upper bound on enqueued requests and on emitted output entries.
    "requestMaxEntries": 50,
    "outputMaxEntries": 50,
    # Crawler tuning: retries per failed request, rate limit, concurrency
    # floor, and per-request handler timeout (seconds).
    "maxRequestRetries": 3,
    "maxRequestsPerMinute": 120,
    "minConcurrency": 1,
    "requestHandlerTimeoutSecs": 180,
    "logLevel": "info",
    # Named dataset where the Actor collects error reports.
    "errorReportingDatasetId": "REPORTING",
}
# Run the Actor and wait for it to finish.
run = client.actor("jurooravec/profesia-sk-scraper").call(run_input=run_input)

# `ActorClient.call()` returns None when the run could not be started or
# waited on — fail loudly instead of crashing on the subscript below.
if run is None:
    raise RuntimeError("Actor run failed to start or finish.")

# Fetch and print the scraped items from the run's default dataset.
dataset_id = run["defaultDatasetId"]
print(f"💾 Check your data here: https://console.apify.com/storage/datasets/{dataset_id}")
for item in client.dataset(dataset_id).iterate_items():
    print(item)