Podcasts (Deprecated)
Gets the URL of the first podcast from Google Podcasts.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 6
Monthly users: 1
Last modified: 3 years ago
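Based on main.js below, the actor expects its input to contain a links array with the Google Podcasts pages it should crawl. A hypothetical input sketch (the placeholder stands for a real show page URL and is not part of the original actor):

{
    "links": [
        "<google-podcasts-show-page-url>"
    ]
}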
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
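To try the image locally, standard Docker commands are enough (the tag name below is arbitrary); note that the actor is designed to run on the Apify platform, which supplies its input, so running the container on its own starts main.js without any input:

docker build -t podcasts-actor .
docker run --rm podcasts-actor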
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.22.4" }, "scripts": { "start": "node main.js" }}
main.js
const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getInput();
    const requestQueue = await Apify.openRequestQueue('google-podcasts');
    const dataset = await Apify.openDataset('google-podcasts');
    let output = [];

    for (const link of input.links)
        await requestQueue.addRequest({
            url: link,
            uniqueKey: link + (new Date).toString()
        });

    const crawler = new Apify.CheerioCrawler({
        requestQueue,

        // The crawler downloads and processes the web pages in parallel, with a concurrency
        // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
        // Here we define some hard limits for the concurrency.
        minConcurrency: 10,
        maxConcurrency: 50,

        // On error, retry each page at most once.
        maxRequestRetries: 1,

        // Increase the timeout for processing of each page.
        handlePageTimeoutSecs: 30,

        // Limit to 10 requests per one crawl
        maxRequestsPerCrawl: 10,

        // This function will be called for each URL to crawl.
        // It accepts a single parameter, which is an object with options as:
        // https://sdk.apify.com/docs/typedefs/cheerio-crawler-options#handlepagefunction
        // We use for demonstration only 2 of them:
        // - request: an instance of the Request class with information such as URL and HTTP method
        // - $: the cheerio object containing parsed HTML
        handlePageFunction: async ({ request, $ }) => {
            console.log("Handling " + request.url);

            const pattern = /\/feed\/(\w|\/|\?|\=|\&|;)+"/;
            let url_podcast = $.html().match(pattern); // This will return the first link

            if (url_podcast != null) {
                url_podcast = url_podcast[0].replace('"', ''); // take the first item and remove the trailing quote symbol (")

                const out = {
                    url: request.url,
                    url_podcast
                };

                await dataset.pushData(out);
                output.push(out);
            }
        },
    });

    await crawler.run();
    console.log(output);

    await Apify.setValue('OUTPUT', output);
});
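As a standalone illustration of the feed-extraction step in handlePageFunction, the sketch below runs the same regex against a made-up HTML fragment; the fragment and the base64-looking feed path are invented for the example, and the real Google Podcasts markup may differ. Note that the match is the relative /feed/... path including a trailing quote, which the actor strips with replace('"', ''):

// Standalone sketch of the regex used in handlePageFunction above.
// The HTML fragment and the feed path are invented for illustration.
const pattern = /\/feed\/(\w|\/|\?|\=|\&|;)+"/;

const html = '<a href="https://podcasts.google.com/feed/aHR0cHM6Ly9leGFtcGxlLmNvbS9yc3M">Example show</a>';
const match = html.match(pattern);

if (match != null) {
    console.log(match[0]);                  // /feed/aHR0cHM6Ly9leGFtcGxlLmNvbS9yc3M" (trailing quote included)
    console.log(match[0].replace('"', '')); // /feed/aHR0cHM6Ly9leGFtcGxlLmNvbS9yc3M
}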