$ echo '{
    "runMode": "DEVELOPMENT",
    "startUrls": [
        {
            "url": "https://crawlee.dev/js"
        }
    ],
    "respectRobotsTxtFile": true,
    "linkSelector": "a[href]",
    "globs": [
        {
            "glob": "https://crawlee.dev/js/*/*"
        }
    ],
    "pseudoUrls": [],
    "excludes": [
        {
            "glob": "/**/*.{png,jpg,jpeg,pdf}"
        }
    ],
< "pageFunction": "// The function accepts a single argument: the \\"context\\" object.\\n// For a complete list of its properties and functions,\\n// see https://apify.com/apify/web-scraper#page-function \\nasync function pageFunction(context) {\\n // This statement works as a breakpoint when you'\''re trying to debug your code. Works only with Run mode: DEVELOPMENT!\\n // debugger; \\n\\n // jQuery is handy for finding DOM elements and extracting data from them.\\n // To use it, make sure to enable the \\"Inject jQuery\\" option.\\n const $ = context.jQuery;\\n const pageTitle = $('\''title'\'').first().text();\\n const h1 = $('\''h1'\'').first().text();\\n const first_h2 = $('\''h2'\'').first().text();\\n const random_text_from_the_page = $('\''p'\'').first().text();\\n\\n\\n // Print some information to Actor log\\n context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\\n\\n // Manually add a new page to the queue for scraping.\\n await context.enqueueRequest({ url: '\''http://www.example.com'\'' });\\n\\n // Return an object with the data extracted from the page.\\n // It will be stored to the resulting dataset.\\n return {\\n url: context.request.url,\\n pageTitle,\\n h1,\\n first_h2,\\n random_text_from_the_page\\n };\\n}",
< "proxyConfiguration": {
< "useApifyProxy": true
< },
< "initialCookies": [],
< "waitUntil": [
< "networkidle2"
< ],
< "preNavigationHooks": "// We need to return array of (possibly async) functions here.\\n// The functions accept two arguments: the \\"crawlingContext\\" object\\n// and \\"gotoOptions\\".\\n[\\n async (crawlingContext, gotoOptions) => {\\n // ...\\n },\\n]\\n",
< "postNavigationHooks": "// We need to return array of (possibly async) functions here.\\n// The functions accept a single argument: the \\"crawlingContext\\" object.\\n[\\n async (crawlingContext) => {\\n // ...\\n },\\n]",
< "breakpointLocation": "NONE",
< "customData": {}
<}' |
<apify call apify/web-scraper --silent --output-dataset
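
The pageFunction is hard to read in its escaped single-line JSON form, so here is the same function with the string escaping removed, exactly as the Web Scraper Actor receives it:

async function pageFunction(context) {
    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
    // debugger;

    // jQuery is handy for finding DOM elements and extracting data from them.
    // To use it, make sure to enable the "Inject jQuery" option.
    const $ = context.jQuery;
    const pageTitle = $('title').first().text();
    const h1 = $('h1').first().text();
    const first_h2 = $('h2').first().text();
    const random_text_from_the_page = $('p').first().text();

    // Print some information to Actor log
    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

    // Manually add a new page to the queue for scraping.
    await context.enqueueRequest({ url: 'http://www.example.com' });

    // Return an object with the data extracted from the page.
    // It will be stored to the resulting dataset.
    return {
        url: context.request.url,
        pageTitle,
        h1,
        first_h2,
        random_text_from_the_page
    };
}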
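
Because the JSON input arrives on the standard input of apify call, you can also keep it in a file instead of inlining it with echo and feed it in with a shell redirect (input.json here is a placeholder file name, not one the CLI requires):

$ apify call apify/web-scraper --silent --output-dataset < input.json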