echo '{
    "startUrls": [
        {
            "url": "https://crawlee.dev/js"
        }
    ],
    "respectRobotsTxtFile": true,
    "globs": [
        {
            "glob": "https://crawlee.dev/js/*/*"
        }
    ],
    "pseudoUrls": [],
    "excludes": [
        {
            "glob": "/**/*.{png,jpg,jpeg,pdf}"
        }
    ],
    "linkSelector": "a[href]",
< "pageFunction": "async function pageFunction(context) {\\n const { window, request, log } = context;\\n\\n // The \\"window\\" property contains the JSDOM object which is useful\\n // for querying DOM elements and extracting data from them.\\n const pageTitle = window.document.title;\\n\\n // The \\"request\\" property contains various information about the web page loaded. \\n const url = request.url;\\n \\n // Use \\"log\\" object to print information to Actor log.\\n log.info('\''Page scraped'\'', { url, pageTitle });\\n\\n // Return an object with the data extracted from the page.\\n // It will be stored to the resulting dataset.\\n return {\\n url,\\n pageTitle\\n };\\n}",
< "proxyConfiguration": {
< "useApifyProxy": true
< },
< "initialCookies": [],
< "additionalMimeTypes": [],
< "preNavigationHooks": "// We need to return array of (possibly async) functions here.\\n// The functions accept two arguments: the \\"crawlingContext\\" object\\n// and \\"requestAsBrowserOptions\\" which are passed to the `requestAsBrowser()`\\n// function the crawler calls to navigate..\\n[\\n async (crawlingContext, requestAsBrowserOptions) => {\\n // ...\\n }\\n]",
< "postNavigationHooks": "// We need to return array of (possibly async) functions here.\\n// The functions accept a single argument: the \\"crawlingContext\\" object.\\n[\\n async (crawlingContext) => {\\n // ...\\n },\\n]",
< "customData": {}
<}' |
<apify call apify/jsdom-scraper --silent --output-dataset
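
For readability, the `pageFunction` embedded above as a JSON-escaped string corresponds to the following JavaScript (the same code, just un-escaped):

```js
async function pageFunction(context) {
    const { window, request, log } = context;

    // The "window" property contains the JSDOM object which is useful
    // for querying DOM elements and extracting data from them.
    const pageTitle = window.document.title;

    // The "request" property contains various information about the web page loaded.
    const url = request.url;

    // Use the "log" object to print information to the Actor log.
    log.info('Page scraped', { url, pageTitle });

    // Return an object with the data extracted from the page.
    // It will be stored to the resulting dataset.
    return {
        url,
        pageTitle
    };
}
```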