Apify Blog Scraper

  • perci/apify-blog-scraper
  • Modified
  • Users 11
  • Runs 288
  • Created by Percival Villalva

An actor that extracts data from blog posts on the Apify Blog.

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify"
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
node_modules

apify_storage

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json, since they are
# the only files that affect "npm install" in the next step. This speeds up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --only=prod --no-optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

This file is 127 lines long. Only the first 50 are shown.

{
    "title": "Input schema for the apify_project actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "filter": {
            "title": "Filter",
            "type": "string",
            "description": "In its default input, 'All', the actor will extract data from all the published blog posts. If you choose to filter the extracted pots by either tag or author, then make sure to fill in the 'Tag' or 'Author' input fields",
            "editor": "select",
            "prefill": "all",
            "enum": ["all", "tag", "author"],
            "enumTitles": ["All", "Tag", "Author"]
        },
        "tag": {
            "title": "Tag",
            "type": "string",
            "description": "Filter the posts you want to scrape by choosing a specific tag (e.g., 'api', 'engineering', 'web-scraping').",
            "editor": "select",
            "enum": [
                "web-scraping",
                "tutorial",
                "data-mining",
                "company",
                "api",
                "automation",
                "engineering",
                "data-extraction",
                "use-cases",
                "apify-freelancers",
                "web-crawling",
                "data-analysis",
                "careers",
                "seo"
            ],
            "enumTitles": [
                "Web Scraping",
                "Tutorial",
                "Data mining",
                "Company",
                "API",
                "Automation",
                "Engineering",
                "Data Extraction",
                "Use Cases",
                "Apify Freelancers",
                "Web Crawling",
                "Data Analysis",
                "Careers",
                "SEO"

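Although the schema is truncated, its three inputs are visible: "filter" selects the mode, and "tag" or "author" narrows the results. Based only on the fields shown, a minimal input for a tag-filtered run might look like this:

{
    "filter": "tag",
    "tag": "web-scraping"
}
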
README.md

# What does this actor do?

Extracts data from all posts on the Apify blog, or filters the posts to be scraped by a specific tag or author name.

If you're looking for examples or want to learn more about Apify, visit:

- [Tutorial](https://sdk.apify.com/docs/guides/getting-started)
- [Documentation](https://sdk.apify.com/docs/api/apify)
- [Examples](https://sdk.apify.com/docs/examples/crawl-multiple-urls)

# Documentation reference
This actor was built with the Apify SDK. To learn more about Apify's full-fledged Node.js web scraping library, follow the links below 👇🏻
- [Apify SDK](https://sdk.apify.com/)
- [Apify Actor documentation](https://docs.apify.com/actor)
- [Apify CLI](https://docs.apify.com/cli)
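
For programmatic use, the actor can also be called from another Node.js script. The sketch below assumes you have an Apify API token in the APIFY_TOKEN environment variable and the apify-client package installed (it is not among this project's dependencies):

// Hypothetical caller script using the official apify-client package.
const { ApifyClient } = require("apify-client");

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

(async () => {
    // Start the actor with a tag filter and wait for the run to finish.
    const run = await client.actor("perci/apify-blog-scraper").call({
        filter: "tag",
        tag: "web-scraping",
    });

    // Read the scraped items from the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(`Scraped ${items.length} posts.`);
})();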

apify.json

{
	"name": "apify-blog-scraper-v2",
	"version": "0.0",
	"buildTag": "latest",
	"env": null,
	"template": "project_empty"
}

main.js

const Apify = require("apify");
const cheerio = require("cheerio");
const { gotScraping } = require("got-scraping");

Apify.main(async () => {
    const { tag, author, filter } = await Apify.getInput();

    // Build the listing URL from the selected filter; anything other
    // than "tag" or "author" falls back to the full blog.
    let url;
    if (filter === "tag") {
        url = `https://blog.apify.com/tag/${tag}`;
    } else if (filter === "author") {
        url = `https://blog.apify.com/author/${author}`;
    } else {
        url = "https://blog.apify.com";
    }

    const sources = [];
    let n = 1;

    // Walk the paginated post listing. A full listing page holds 12
    // post cards, so whenever a page is full, assume another one follows.
    for (let i = 0; i < n; i++) {
        const response = await gotScraping(`${url}/page/${i + 1}`);
        const $ = cheerio.load(response.body);

        // Collect the absolute URL of every post card on this page.
        $(".post-card__title a").each((index, element) => {
            sources.push("https://blog.apify.com" + $(element).prop("href"));
        });
        if ($(".post-card").length >= 12) {
            n += 1;
        }
    }

    const requestList = new Apify.RequestList({
        sources,
    });
    await requestList.initialize();

    const crawler = new Apify.CheerioCrawler({
        requestList,
        handlePageFunction: async ({ request, $ }) => {
            console.log(`Scraping: ${request.url}`);
            // Each post page has a single .content element, so map()
            // produces one result object per page.
            const data = $(".content")
                .map((index, element) => {
                    return {
                        Title: $(".post-hero__title").text().trim(),
                        Author: $(".post-hero__author div:nth-child(2)").text().trim(),
                        Excerpt: $(element)
                            .find(".context-excerpt")
                            .text()
                            .trim(),
                        Tags: $(element)
                            .find(".post-tags a")
                            .text()
                            .trim()
                            .split("#")
                            .slice(1)
                            .join(", "),
                        publicationDate: $("meta[property='article:published_time']")
                            .prop("content")
                            .split("T")[0],
                        Link: request.url,
                        Text: $(element)
                            .find("p")
                            .not(".context-excerpt")
                            .text()
                            .trim(),
                    };
                })
                .toArray();

            // Store the results to the default dataset.
            await Apify.pushData(data);
        },
    });

    await crawler.run();
});
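
A note on the request-list setup above: SDK v2 also ships Apify.openRequestList, which creates and initializes the list in one call. A minimal equivalent sketch (the "blog-posts" name is arbitrary and lets the SDK persist list state):

const requestList = await Apify.openRequestList("blog-posts", sources);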

package-lock.json

This file is 8430 lines long. Only the first 50 are shown.

{
	"name": "apify-blog-scraper-v2",
	"version": "0.0.1",
	"lockfileVersion": 2,
	"requires": true,
	"packages": {
		"": {
			"name": "apify-blog-scraper-v2",
			"version": "0.0.1",
			"license": "ISC",
			"dependencies": {
				"apify": "^2.3.2",
				"cheerio": "^1.0.0-rc.12",
				"got-scraping": "^3.2.9",
				"prompt-sync": "^4.2.0"
			},
			"devDependencies": {
				"@apify/eslint-config": "^0.1.3",
				"eslint": "^7.0.0"
			}
		},
		"node_modules/@apify/consts": {
			"version": "1.11.0",
			"resolved": "https://registry.npmjs.org/@apify/consts/-/consts-1.11.0.tgz",
			"integrity": "sha512-5tFbj/t+zqQDxMYKKUbAvgdkalZPNWWJjcL7AlyslwdZXkNaxUWD3CCXLz/hHvQ5otLuxGa7GuRQEcrkL6+jYw=="
		},
		"node_modules/@apify/datastructures": {
			"version": "1.0.1",
			"resolved": "https://registry.npmjs.org/@apify/datastructures/-/datastructures-1.0.1.tgz",
			"integrity": "sha512-AgnrfMjzDph+Te5WGNnIsz3+dJM7v/Sqo82nWwSqca292paRotUhORXr9Ik+d0yurC5LutDAhcvu8VZ8SfANGg=="
		},
		"node_modules/@apify/eslint-config": {
			"version": "0.1.4",
			"resolved": "https://registry.npmjs.org/@apify/eslint-config/-/eslint-config-0.1.4.tgz",
			"integrity": "sha512-sbEpFJk+drdTxRVRoL3Ou0h9pmfu/BAiAxZDH3ANHuF7NoprLV1tQvs3PRu+IsFhxIHihI/6znY19KnPOq1dpA==",
			"dev": true,
			"dependencies": {
				"eslint-config-airbnb": "^18.2.0",
				"eslint-config-airbnb-base": "^14.2.0",
				"eslint-import-resolver-typescript": "^2.2.1",
				"eslint-plugin-import": "^2.22.0",
				"eslint-plugin-jsx-a11y": "^6.2.3",
				"eslint-plugin-promise": "^4.2.1",
				"eslint-plugin-react": "^7.20.0",
				"eslint-plugin-react-hooks": "^4.1.0"
			},
			"peerDependencies": {
				"eslint": "*"
			}
		},

package.json

{
	"name": "apify-blog-scraper-v2",
	"version": "0.0.1",
	"description": "This is a boilerplate of an Apify actor.",
	"dependencies": {
		"apify": "^2.3.2",
		"cheerio": "^1.0.0-rc.12",
		"got-scraping": "^3.2.9",
		"prompt-sync": "^4.2.0"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.1.3",
		"eslint": "^7.0.0"
	},
	"scripts": {
		"start": "node main.js",
		"lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
		"lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}