LD+JSON Schema scraper

  • pocesar/json-ld-schema
  • Users 82
  • Runs 740
  • Created by Paulo Cesar

Extract all LD+JSON tags from the given URLs.
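
For reference, an LD+JSON tag is a script element embedding schema.org
structured data, which is exactly what the page function below selects. A
typical (hypothetical) example of what the actor picks up:

<script type="application/ld+json">
{
    "@context": "https://schema.org",
    "@type": "Article",
    "headline": "Example headline",
    "author": { "@type": "Person", "name": "Jane Doe" }
}
</script>

The actor outputs the parsed JSON body of every such tag it finds.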

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify"
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
node_modules

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --only=prod --no-optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, rebuilds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "LD JSON Schema scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start Urls",
            "type": "array",
            "description": "The URLs to extract all LD+JSON data",
            "default": [],
            "prefill": [{
                "url": "https://blog.apify.com/"
            }],
            "editor": "requestListSources"
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "description": "A proxy required for scraping",
            "type": "object",
            "default": { "useApifyProxy": true },
            "prefill": { "useApifyProxy": true },
            "editor": "proxy"
        },
        "customData": {
            "title": "Custom data",
            "description": "Provide some custom data to output",
            "type": "object",
            "default": {},
            "prefill": {},
            "editor": "json"
        }
    },
    "required": [
        "startUrls",
        "proxyConfiguration"
    ]
}
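
For example, a run input conforming to this schema, built from the prefill
and default values above (the custom data value is illustrative):

{
    "startUrls": [{ "url": "https://blog.apify.com/" }],
    "proxyConfiguration": { "useApifyProxy": true },
    "customData": { "label": "example-run" }
}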

README.md

# LD+JSON scraper

Extract LD+JSON data from websites
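
The actor pushes one dataset item per parsed LD+JSON tag, shaped as
{ data, url, customData } by the page function in main.js. A sample item
(values are illustrative):

{
    "data": { "@context": "https://schema.org", "@type": "Article" },
    "url": "https://blog.apify.com/",
    "customData": {}
}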

apify.json

{
    "env": { "npm_config_loglevel": "silent" }
}

main.js

const Apify = require('apify');

const pageFunction = async (context) => {
    const { request, $, log, customData } = context;

    const { url } = request;

    // Collect all LD+JSON script tags on the page
    const lds = $('script[type="application/ld+json"]');

    if (!lds.length) {
        log.warning('No LD+JSON found on page', { url });
        return {
            data: {},
            url,
            customData,
        };
    }

    // Parse each tag's contents, dropping invalid JSON, and emit one
    // dataset item per successfully parsed tag
    return lds
        .map((_, el) => $(el).html().trim())
        .get()
        .map((html) => {
            try {
                return JSON.parse(html);
            } catch (e) {
                log.exception(e, 'Invalid JSON', { url });
            }
        })
        .filter(Boolean)
        .map((data) => {
            return {
                data,
                url,
                customData,
            };
        });
};

Apify.main(async () => {
    const { proxyConfiguration, startUrls, customData } = await Apify.getInput();

    if (!proxyConfiguration) {
        throw new Error('A proxy configuration is required to run this actor');
    }

    // Validate the proxy configuration before metamorphing
    const proxy = await Apify.createProxyConfiguration(proxyConfiguration);

    if (!proxy) {
        throw new Error('Invalid proxy configuration');
    }

    if (!startUrls?.length) {
        throw new Error('Provide a RequestList sources array on "startUrls" input');
    }

    // Hand the run over to the generic Cheerio Scraper, passing the
    // serialized page function above
    await Apify.metamorph('apify/cheerio-scraper', {
        startUrls,
        pageFunction: pageFunction.toString(),
        proxyConfiguration,
        customData,
        ignoreSslErrors: true,
    });
});
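
Because the page function only touches request, $, log and customData on the
context, it can be smoke-tested locally without metamorphing. A minimal
sketch, assuming cheerio is installed and using a made-up HTML fixture:

const cheerio = require('cheerio');

// Fixture with one valid and one broken LD+JSON tag (hypothetical data)
const html = `
    <script type="application/ld+json">{"@type": "Article"}</script>
    <script type="application/ld+json">not json</script>
`;

const context = {
    request: { url: 'https://example.com/' },
    $: cheerio.load(html),
    log: { warning: console.warn, exception: console.error },
    customData: {},
};

// Only the valid tag survives JSON.parse + filter(Boolean)
pageFunction(context).then((items) => console.log(items));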

package.json

{
    "name": "project-empty",
    "version": "0.0.1",
    "description": "This is a boilerplate of an Apify actor.",
    "dependencies": {
        "apify": "^2.2.1"
    },
    "scripts": {
        "start": "node main.js",
        "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
        "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}