
Cheerio Scraper
- apify/cheerio-scraper
- Modified
- Users 2.2k
- Runs 37M
- Created by
Apify
Crawls websites using raw HTTP requests, parses the HTML with the Cheerio library, and extracts data from the pages using Node.js code. Supports both recursive crawling and lists of URLs. This actor is a high-performance alternative to apify/web-scraper for websites that do not require JavaScript.
To run the code examples, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token. For a more detailed explanation, please read about running actors via the API in Apify Docs.
import { ApifyClient } from 'apify-client';
// Create an Apify API client authenticated with the account's API token.
const client = new ApifyClient({ token: '<YOUR_API_TOKEN>' });
// Prepare the actor input. Property names mirror the actor's input schema.
const input = {
    // Initial URLs the crawler starts from.
    "startUrls": [
        {
            "url": "https://crawlee.dev"
        }
    ],
    // Only links whose URL matches one of these glob patterns are enqueued.
    "globs": [
        {
            "glob": "https://crawlee.dev/*/*"
        }
    ],
    "pseudoUrls": [],
    // CSS selector used to find links to follow on each crawled page.
    "linkSelector": "a[href]",
    // Executed for every crawled page; the returned object is stored in the dataset.
    "pageFunction": async function pageFunction(context) {
        const { $, request, log } = context;
        // The "$" property contains the Cheerio object which is useful
        // for querying DOM elements and extracting data from them.
        const pageTitle = $('title').first().text();
        // The "request" property contains various information about the web page loaded.
        const url = request.url;
        // Use "log" object to print information to actor log.
        log.info('Page scraped', { url, pageTitle });
        // Return an object with the data extracted from the page.
        // It will be stored to the resulting dataset.
        return {
            url,
            pageTitle
        };
    },
    "proxyConfiguration": {
        "useApifyProxy": true
    },
    "initialCookies": [],
    "additionalMimeTypes": [],
    // BUG FIX: the inner backticks around requestAsBrowser() must be escaped
    // (\`) — unescaped they terminate this template literal early and break
    // the whole file with a syntax error.
    "preNavigationHooks": `// We need to return array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "requestAsBrowserOptions" which are passed to the \`requestAsBrowser()\`
// function the crawler calls to navigate..
[
    async (crawlingContext, requestAsBrowserOptions) => {
        // ...
    }
]`,
    "postNavigationHooks": `// We need to return array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]`,
    "customData": {}
};
// Entry point: start the actor run, wait for it to finish, then print
// every item the run stored in its default dataset.
const main = async () => {
    // Run the actor and block until the run completes.
    const run = await client.actor("apify/cheerio-scraper").call(input);

    // Fetch and print actor results from the run's dataset (if any).
    console.log('Results from dataset');
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    for (const item of items) {
        console.dir(item);
    }
};

main();