Actor picture

Example Hacker News

mtrunkat/example-hacker-news

Example crawler for news.ycombinator.com built using Apify SDK.

No credit card required

Author's avatar Marek Trunkát
  • Modified
  • Users: 107
  • Runs: 1,524
Actor picture

Example Hacker News

Based on the apify/actor-node-chrome:v0.21.10 Docker image (see docs).

const Apify = require('apify');

/**
 * Actor entry point.
 *
 * Seeds the request queue with the Hacker News front page, then crawls it
 * page by page with Puppeteer. For every listing page it extracts one record
 * per post, stores the records with Apify.pushData() and enqueues the page
 * linked by the "More" (.morelink) anchor, if there is one.
 */
Apify.main(async () => {
    // Get queue and enqueue first url.
    const requestQueue = await Apify.openRequestQueue();
    const enqueueUrl = async (url) => requestQueue.addRequest({ url });
    await enqueueUrl('https://news.ycombinator.com/');

    // Create crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        launchPuppeteerOptions: {
            liveView: true,
        },

        // This function is executed for each request.
        // If the request fails then it's retried 3 times.
        // Parameter page is Puppeteer's page object with the page already loaded.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);

            // We inject jQuery for easier data extraction.
            await Apify.utils.puppeteer.injectJQuery(page);

            // Extract all posts. This callback is executed inside the browser context,
            // so it cannot reference anything from the surrounding Node.js scope.
            // $ is the jQuery variable defined in the browser itself, so a linter
            // warning about an undefined `$` can be ignored here.
            const data = await page.evaluate(() => {
                // Turns texts such as "12.", "1 point", "1,234 points" or
                // "1 comment" into a number. Everything except digits is dropped
                // so both singular and plural labels work; a missing value
                // (e.g. a post without a score) becomes 0.
                const toCount = (text) => {
                    const digits = text.replace(/\D/g, '');
                    return digits === '' ? 0 : Number(digits);
                };

                const posts = [];
                $('.athing').each(function () {
                    const $post = $(this);
                    const $title = $post.find('.storylink');
                    // Score, author, age and comment link live in the row that follows the title row.
                    const $meta = $post.next();

                    posts.push({
                        rank: toCount($post.find('.rank').text()),
                        title: $title.text().trim(),
                        link: $title.attr('href'),
                        domain: $post.find('.sitestr').text().trim(),
                        score: toCount($meta.find('.score').text()),
                        author: $meta.find('.hnuser').text().trim(),
                        posted: $meta.find('.age').text().trim(),
                        // "comment" matches both "1 comment" and "N comments".
                        comments: toCount($meta.find('a:contains("comment")').text()),
                        url: window.location.href,
                    });
                });
                return posts;
            });

            // Look up the next page link. $$eval returns an empty array when nothing
            // matches, so a missing link is handled without catching an exception.
            const [nextHref] = await page.$$eval('.morelink', (links) => links.map((link) => link.href));

            // Enqueue next page before saving data: an enqueue failure now fails
            // (and retries) the request instead of being reported as "last page",
            // and a retry then cannot store the same posts twice.
            // NOTE(review): assumes the queue ignores a URL that is already enqueued
            // if the retry was triggered by pushData failing — confirm against the SDK docs.
            if (nextHref) {
                await enqueueUrl(nextHref);
            } else {
                console.log(`Url ${request.url} is the last page!`);
            }

            // Save data.
            await Apify.pushData(data);
        },

        // If request failed 4 times then this function is executed.
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);

            // Store the failure next to the scraped data so it is visible in the output.
            await Apify.pushData({
                url: request.url,
                errors: request.errorMessages,
            });
        },
    });

    // Run crawler.
    await crawler.run();
});