Actor picture

Podcasts

zyberg/podcasts

Gets the URL of the first podcast from Google Podcasts.

No credit card required

Author's avatar Nikolajus Elmutis
  • Modified
  • Users: 4
  • Runs: 56

Based on the apify/actor-node-basic Docker image (see docs).

const Apify = require('apify');

/**
 * Actor entry point.
 *
 * Reads the actor input, enqueues every URL from `input.links`, fetches each
 * page with a CheerioCrawler and looks for the first `/feed/...` link in the
 * page's HTML. Each hit is pushed to the `google-podcasts` dataset as
 * `{ url, url_podcast }`, and the collected list of hits is also stored under
 * the `OUTPUT` key once the crawl finishes.
 *
 * @throws {Error} When the input is missing or `input.links` is not an array.
 */
Apify.main(async () => {
    // Matches a `/feed/...` link up to and including the double quote that
    // terminates it in the markup.
    const feedLinkPattern = /\/feed\/(\w|\/|\?|\=|\&|;)+"/;

    /**
     * Finds the first `/feed/...` link in an HTML string.
     *
     * @param {string} html - Serialized page markup.
     * @returns {string|null} The matched link without its closing quote, or
     *     null when the page contains no such link.
     */
    const extractFeedLink = (html) => {
        const match = html.match(feedLinkPattern);
        if (match === null) {
            return null;
        }
        // The pattern always ends on the closing quote; drop that last character.
        return match[0].slice(0, -1);
    };

    const input = await Apify.getInput();
    const links = input == null ? undefined : input.links;

    // Fail early with a readable message instead of a TypeError from the loop below.
    if (!Array.isArray(links)) {
        throw new Error('Actor input must be an object with a "links" array of URLs.');
    }

    const requestQueue = await Apify.openRequestQueue('google-podcasts');
    const dataset = await Apify.openDataset('google-podcasts');
    const output = [];

    for (const link of links) {
        await requestQueue.addRequest({
            url: link,
            // The timestamp is appended so the key differs from one run to the next.
            uniqueKey: link + new Date().toString(),
        });
    }

    const crawler = new Apify.CheerioCrawler({
        requestQueue,

        // Hard limits for the number of pages processed in parallel.
        minConcurrency: 10,
        maxConcurrency: 50,

        // On error, retry each page at most once.
        maxRequestRetries: 1,

        // Timeout for processing of each page, in seconds.
        handlePageTimeoutSecs: 30,

        // Limit to 10 requests per one crawl.
        maxRequestsPerCrawl: 10,

        // Called for each crawled URL with, among other options:
        // - request: carries the URL that was fetched
        // - $: the cheerio object containing the parsed HTML
        handlePageFunction: async ({ request, $ }) => {
            console.log(`Handling ${request.url}`);

            const feedLink = extractFeedLink($.html());
            if (feedLink === null) {
                // No feed link on this page: nothing to record.
                return;
            }

            const out = {
                url: request.url,
                url_podcast: feedLink,
            };

            await dataset.pushData(out);
            output.push(out);
        },
    });

    await crawler.run();
    console.log(output);

    await Apify.setValue('OUTPUT', output);
});