WS

Web Scraper Task

  • undrtkr984/web-scraper-task
  • Modified
  • Users 22
  • Runs 347
  • Created by Matt

To run the code examples, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token. For a more detailed explanation, please read about running Actors via the API in Apify Docs.

import { ApifyClient } from 'apify-client';

// Create an Apify API client authenticated with your personal API token.
const client = new ApifyClient({ token: '<YOUR_API_TOKEN>' });

// Prepare Actor input.
// The schema is defined by the undrtkr984/web-scraper-task Actor. Note that
// `pageFunction` and the two hook strings are executed remotely by the Actor
// in the context of each crawled page, not in this local process.
const input = {
    // DEVELOPMENT mode enables debugging features such as the `debugger;`
    // breakpoint inside pageFunction; switch to PRODUCTION for real runs.
    "runMode": "DEVELOPMENT",
    // URLs the crawler starts from.
    "startUrls": [
        {
            "url": "https://crawlee.dev"
        }
    ],
    // CSS selector for links to follow on each page.
    "linkSelector": "a[href]",
    // Only enqueue discovered links whose URL matches one of these glob patterns.
    "globs": [
        {
            "glob": "https://crawlee.dev/*/*"
        }
    ],
    // Legacy alternative to `globs`; intentionally empty here.
    "pseudoUrls": [],
    "pageFunction": // The function accepts a single argument: the "context" object.
    // For a complete list of its properties and functions,
    // see https://apify.com/apify/web-scraper#page-function 
    async function pageFunction(context) {
        // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
        // debugger; 
    
        // jQuery is handy for finding DOM elements and extracting data from them.
        // To use it, make sure to enable the "Inject jQuery" option.
        const $ = context.jQuery;
        const pageTitle = $('title').first().text();
        const h1 = $('h1').first().text();
        const first_h2 = $('h2').first().text();
        const random_text_from_the_page = $('p').first().text();
    
    
        // Print some information to actor log
        context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);
    
        // Manually add a new page to the queue for scraping.
       await context.enqueueRequest({ url: 'http://www.example.com' });
    
        // Return an object with the data extracted from the page.
        // It will be stored to the resulting dataset.
        return {
            url: context.request.url,
            pageTitle,
            h1,
            first_h2,
            random_text_from_the_page
        };
    },
    // Route all page loads through Apify Proxy.
    "proxyConfiguration": {
        "useApifyProxy": true
    },
    // Cookies to set before the first navigation; none needed here.
    "initialCookies": [],
    // Puppeteer navigation is considered finished when the network has been
    // idle (≤2 connections) — see the "waitUntil" option in Puppeteer docs.
    "waitUntil": [
        "networkidle2"
    ],
    // NOTE: the two hook values below are string payloads evaluated by the
    // Actor remotely; their contents are data, not local code.
    "preNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept two arguments: the "crawlingContext" object
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]`,
    "postNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]`,
    // No automatic breakpoint location in DEVELOPMENT mode.
    "breakpointLocation": "NONE",
    // Arbitrary user data made available to pageFunction via context.customData.
    "customData": {}
};

// Run the Actor, wait for it to finish, then print every item from the
// run's default dataset. Failures are caught explicitly instead of being
// left as an unhandled promise rejection from the floating async IIFE.
(async () => {
    try {
        // Run the Actor and wait for it to finish.
        const run = await client.actor("undrtkr984/web-scraper-task").call(input);

        // Fetch and print Actor results from the run's dataset (if any).
        console.log('Results from dataset');
        const { items } = await client.dataset(run.defaultDatasetId).listItems();
        items.forEach((item) => {
            console.dir(item);
        });
    } catch (err) {
        // Surface the error and signal failure to the shell without masking
        // the stack trace.
        console.error('Actor run failed:', err);
        process.exitCode = 1;
    }
})();