1const Apify = require('apify');
2const cheerio = require('cheerio');
3
/**
 * Actor entry point.
 *
 * 1. Downloads a sitemap (URL taken from the actor input's `url` field, with a
 *    built-in default as fallback).
 * 2. Collects the text of every `<loc>` element in the sitemap.
 * 3. Crawls each collected URL with CheerioCrawler and pushes
 *    `{ url, title }` for every page to the default dataset.
 */
Apify.main(async () => {
    // Sitemap used when the actor input does not supply a URL.
    const DEFAULT_SITEMAP_URL = 'http://beachwaver.com/sitemap_products_1.xml';
    // Headers sent with the sitemap download and attached to every queued page request.
    const REQUEST_HEADERS = { 'User-Agent': 'curl/7.54.0' };

    const input = await Apify.getInput();

    // `||` (not `??`) on purpose: an empty-string URL in the input is unusable
    // and should fall back to the default as well.
    const sitemapUrl = input?.url || DEFAULT_SITEMAP_URL;

    // requestAsBrowser resolves to a response object; the document text lives
    // in its `body` property. Stringifying the response itself would produce
    // "[object Object]" and no URLs would ever be found.
    const response = await Apify.utils.requestAsBrowser({
        url: sitemapUrl,
        headers: { ...REQUEST_HEADERS },
    });
    const sitemapXml = String(response.body ?? '');

    // A sitemap is XML, so parse it in XML mode rather than as HTML.
    const $sitemap = cheerio.load(sitemapXml, { xmlMode: true });

    const sources = $sitemap('loc')
        .map((_, el) => $sitemap(el).text().trim())
        .get()
        // Skip empty <loc> entries; a request with a blank URL is invalid.
        .filter((url) => url.length > 0)
        // Each request gets its own headers object so none is shared by reference.
        .map((url) => ({ url, headers: { ...REQUEST_HEADERS } }));

    console.log(`Found ${sources.length} URLs in the sitemap`);

    const requestList = new Apify.RequestList({ sources });
    await requestList.initialize();

    const crawler = new Apify.CheerioCrawler({
        requestList,
        // Called once per successfully downloaded page; `$` is the page parsed by cheerio.
        handlePageFunction: async ({ $, request }) => {
            console.log(`Processing ${request.url}...`);
            await Apify.pushData({
                url: request.url,
                title: $('title').text(),
            });
        },
    });

    await crawler.run();
    console.log('Done.');
});