Web Scraper Experimental Debug

mtrunkat/web-scraper-experimental-dbgr

Experimental version of Apify Web Scraper with Chrome debugger integrated

Author: Marek Trunkát
  • Users: 17
  • Runs: 400

.eslintrc.json

{
  "extends": "airbnb-base",
  "plugins": [
    "import",
    "promise"
  ],
  "rules": {
    "indent": ["error", 4],
    "no-underscore-dangle": [2, {
      "allow": ["_id"],
      "allowAfterThis": true
    }],
    "no-use-before-define": 0,
    "no-param-reassign": 0,
    "consistent-return": 0,
    "array-callback-return": 0,
    "object-curly-newline": 0,
    "arrow-body-style": 0,
    "no-plusplus": 0,
    "strict": ["error", "global"],
    "max-len": ["error", 150],
    "no-undef": 0,
    "func-names": 0,
    "import/prefer-default-export": 0,
    "import/no-absolute-path": 0,
    "import/no-extraneous-dependencies": ["error", { "devDependencies": ["**/test/**/*.js"] }],
    "no-await-in-loop": 0,
    "no-restricted-syntax": ["error", "ForInStatement", "LabeledStatement", "WithStatement"]
  }
}

.gitignore

.idea
apify_storage
node_modules
.nyc_output
coverage
src/page_function.js
.DS_Store

Dockerfile

FROM apify/actor-node-chrome-xvfb

COPY package.json package-lock.json ./

# Install default dependencies, print versions of everything
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

COPY . ./

INPUT_SCHEMA.json

This file is 224 lines long. Only the first 50 are shown.

{
    "title": "Web Scraper Input",
    "type": "object",
    "description": "Use the following form to configure Web Scraper. The Start URLs and Page function are required and all other fields are optional. If you want to use Pseudo URLs, you also need to Use request queue. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the actor's README in the ACTOR INFO tab. We also have a <a href=\"https://apify.com/docs/scraping/tutorial/introduction\" target=\"_blank\">step by step tutorial</a> to help you with the basics.\n\nIf you get stuck and it seems that you can no longer progress with the Web Scraper, try <a href=\"https://apify.com/docs/scraping/tutorial/puppeteer-scraper\" target=\"_blank\">Puppeteer Scraper</a>. It's a more powerful tool.",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "sectionCaption": "Basic configuration",
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "useRequestQueue": {
            "title": "Use request queue",
            "type": "boolean",
            "description": "Request queue enables recursive crawling and the use of Pseudo-URLs, Link selector and <code>context.enqueueRequest()</code>.",
            "default": true
        },
        "pseudoUrls": {
            "title": "Pseudo-URLs",
            "type": "array",
            "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "pseudoUrls",
            "default": [],
            "prefill": [
                {
                    "purl": "https://apify.com[(/[\\w-]+)?]"
                }
            ]
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue urls from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
            "editor": "textfield",
            "prefill": "a"
        },
        "keepUrlFragments": {
            "title": "Keep URL fragments",
            "type": "boolean",
            "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
            "default": false
        },
        "pageFunction": {
            "title": "Page function",
            "type": "string",

LICENSE

This file is 201 lines long. Only the first 50 are shown.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally

README.md

This file is 249 lines long. Only the first 50 are shown.

# Experimental version of [Apify Web Scraper](https://apify.com/apify/web-scraper) with Chrome debugger integrated

<!-- toc -->

- [How it works](#how-it-works)
- [Getting Started](#getting-started)
- [Input](#input)
- [Page function](#page-function)
- [`context`](#context)
  * [Data structures](#data-structures)
  * [Functions](#functions)
  * [Class instances and namespaces](#class-instances-and-namespaces)
    + [Request](#request)
    + [Response](#response)
    + [Global Store](#global-store)
    + [Log](#log)
    + [Underscore](#underscore)
- [Output](#output)
  * [Dataset](#dataset)

<!-- tocstop -->

## How it works
Web Scraper is a ready-made solution for scraping the web using the Chrome browser. It takes away all
the work necessary to set up a browser for crawling, controls the browser automatically and produces
machine-readable results in several common formats.

Underneath, it uses the [Puppeteer](https://github.com/GoogleChrome/puppeteer/) library to control
the browser, but you don't need to worry about that. Using a simple web UI and a little basic
JavaScript, you can tweak it to serve almost any scraping need.

## Getting Started
If you're new to scraping or Apify, be sure to [visit our tutorial](https://apify.com/docs/scraping/web-scraper-tutorial),
which will walk you through creating your first scraping task step by step.

## Input
Input is provided via the pre-configured UI. See the tooltips for more info on the available options.

## Page function
The Page function is a single JavaScript function that enables the user to control the scraper's operation,
manipulate the visited pages and extract data as needed. It is invoked with a `context` object
containing the following properties:

```js
const context = {
    // USEFUL DATA
    input, // Unaltered original input as parsed from the UI
    env, // Contains information about the run such as actorId or runId
    customData, // Value of the 'Custom data' scraper option.
    // ... (remainder truncated)
};
```
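
For a feel of how these properties are used, here is a minimal Page function sketch. It assumes the 'Inject jQuery' option is enabled and that `context` exposes `request`, `log` and `jQuery`, as in the public Web Scraper documentation; it is an illustration, not part of this actor's README.

```js
// Minimal Page function sketch (assumes 'Inject jQuery' is enabled).
async function pageFunction(context) {
    const { request, log, jQuery: $ } = context;
    log.info(`Scraping ${request.url}`);
    // The returned object is pushed to the default dataset.
    return {
        url: request.url,
        title: $('title').text(),
    };
}
```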

apify.json

{
	"name": "web-scraper-experimental-debug",
	"version": "0.1",
	"buildTag": "latest",
	"env": null
}

main.js

const { runActor } = require('@apify/scraper-tools');
const CrawlerSetup = require('./src/crawler_setup');

runActor(CrawlerSetup);
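
The entry point delegates everything to `runActor` from `@apify/scraper-tools`. As a rough, hypothetical sketch of that pattern (the real implementation lives in the library and may differ; `createCrawler` is an assumed method name):

```js
// Hedged sketch of what runActor roughly does, for orientation only.
const Apify = require('apify');

const runActor = (CrawlerSetupClass) => {
    Apify.main(async () => {
        const input = await Apify.getInput();
        const setup = new CrawlerSetupClass(input); // validates input against INPUT_SCHEMA
        const crawler = await setup.createCrawler(); // assumed name: builds the Puppeteer crawler
        await crawler.run();
    });
};
```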

package-lock.json

This file is 5471 lines long. Only the first 50 are shown.

{
  "name": "actor-web-scraper",
  "version": "1.0.0",
  "lockfileVersion": 1,
  "requires": true,
  "dependencies": {
    "@apify/http-request": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/@apify/http-request/-/http-request-1.1.2.tgz",
      "integrity": "sha512-u3MyDXQ4H2CkITJNr2HNwIQZBTsdkewBd6jJaXSBSzaJ5TdBlY+OKfBpVY3kTb87yl5Ses443cAxen5B8Cy3ZQ==",
      "requires": {
        "got": "^9.6.0",
        "proxy-agent": "^3.1.0",
        "underscore": "^1.9.1"
      }
    },
    "@apify/ps-tree": {
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/@apify/ps-tree/-/ps-tree-1.1.3.tgz",
      "integrity": "sha512-+hIr8EaTRd9fsOiNNzf1Fi8Tm9qs8cdPBZjuq5fXDV6SOCdi2ZyQlcQSzc8lY0hb+UhBib1WPixtCdKLL169WA==",
      "requires": {
        "event-stream": "3.3.4"
      }
    },
    "@apify/scraper-tools": {
      "version": "0.1.2",
      "resolved": "https://registry.npmjs.org/@apify/scraper-tools/-/scraper-tools-0.1.2.tgz",
      "integrity": "sha512-KsTDfLvDlQhujkTNNNjITSTHX/B56NmIczAGfZw69yO2H5tVRQE/ZL58TNgPYhcHk42/NAMe6FeF+AgkNwNGzw==",
      "requires": {
        "ajv": "^6.5.5",
        "underscore": "^1.9.1"
      }
    },
    "@babel/code-frame": {
      "version": "7.5.5",
      "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.5.5.tgz",
      "integrity": "sha512-27d4lZoomVyo51VegxI20xZPuSHusqbQag/ztrBC7wegWoQ1nLREPVSKSW8byhTlzTKyNE4ifaTA6lCp7JjpFw==",
      "dev": true,
      "requires": {
        "@babel/highlight": "^7.0.0"
      }
    },
    "@babel/generator": {
      "version": "7.6.2",
      "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.6.2.tgz",
      "integrity": "sha512-j8iHaIW4gGPnViaIHI7e9t/Hl8qLjERI6DcV9kEpAIDJsAOrcnXqRS7t+QbhL76pwbtqP+QCQLL0z1CyVmtjjQ==",
      "dev": true,
      "requires": {
        "@babel/types": "^7.6.0",
        "jsesc": "^2.5.1",

package.json

{
  "name": "actor-web-scraper",
  "version": "1.0.0",
  "description": "Crawl web pages using Apify, headless Chrome and Puppeteer",
  "main": "src/main.js",
  "dependencies": {
    "@apify/scraper-tools": "^0.1.2",
    "apify": "^0.16.0",
    "content-type": "^1.0.4",
    "http-proxy": "^1.18.0",
    "request-promise": "^4.2.4",
    "underscore": "^1.9.1",
    "url": "^0.11.0"
  },
  "devDependencies": {
    "@types/puppeteer": "^1.19.0",
    "@types/underscore": "^1.9.2",
    "chai": "^4.2.0",
    "eslint": "^6.1.0",
    "eslint-config-airbnb": "^17.1.1",
    "eslint-config-airbnb-base": "^13.2.0",
    "eslint-plugin-import": "^2.18.2",
    "eslint-plugin-jsx-a11y": "^6.2.3",
    "eslint-plugin-promise": "^4.0.1",
    "eslint-plugin-react": "^7.14.3",
    "markdown-toc": "^1.2.0",
    "mocha": "^6.2.0",
    "nyc": "^14.1.1",
    "sinon": "^7.1.1"
  },
  "scripts": {
    "start": "node main.js",
    "build-toc": "./node_modules/.bin/markdown-toc README.md -i",
    "test": "nyc --reporter=html --reporter=text mocha --timeout 60000 --recursive",
    "prepare": "npm run build-toc",
    "lint": "eslint src test"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/apifytech/actor-scraper"
  },
  "author": {
    "name": "Apify",
    "email": "support@apify.com",
    "url": "https://www.apify.com"
  },
  "contributors": [
    "Marek Trunkat <marek@apify.com>",
    "Ondra Urban <ondra@apify.com>"
  ],
  "license": "Apache-2.0",
  "homepage": "https://github.com/apifytech/actor-scraper"
}

src/bundle.browser.js

This file is 226 lines long. Only the first 50 are shown.

/* istanbul ignore next */
/**
 * Command to be evaluated for Browser side code injection.
 * @param apifyNamespace
 */
module.exports = (apifyNamespace) => {
    (function (global, namespace) {
        if (typeof window[namespace] !== 'object') window[namespace] = {};
        /**
         * Takes a configuration object with function references
         * as an input and creates a dummy class that proxies
         * function invocations from Browser context to Node context.
         *
         * The configuration is to be provided by the
         * tools.createBrowserHandlesForObject function.
         */
        class NodeProxy {
            constructor(config) {
                if (!config || typeof config !== 'object') {
                    throw new Error('NodeProxy: Parameter config of type Object must be provided.');
                }

                Object.entries(config)
                    .forEach(([key, { value, type }]) => {
                        if (type === 'METHOD') {
                            this[key] = (...args) => global[value](...args);
                        } else if (type === 'GETTER') {
                            Object.defineProperty(this, key, {
                                get: () => global[value](),
                            });
                        } else if (type === 'VALUE') {
                            this[key] = value;
                        } else {
                            throw new Error(`Unsupported function type: ${type} for function: ${key}.`);
                        }
                    });
            }
        }

        /**
         * Exposed factory.
         * @param config
         * @return {NodeProxy}
         */
        global[namespace].createNodeProxy = config => new NodeProxy(config);

        const setup = Symbol('crawler-setup');
        const internalState = Symbol('request-internal-state');

        /**
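
To illustrate the proxying mechanism, a hypothetical config of the shape that `tools.createBrowserHandlesForObject` produces might look as follows. The handle names are invented; the `value` strings refer to Node-side functions exposed on the Browser's global object, presumably via Puppeteer's `page.exposeFunction()`.

```js
// Hypothetical example: each 'value' names a Node-side function exposed
// on the Browser's global object; NodeProxy turns them back into methods.
const config = {
    get: { type: 'METHOD', value: 'globalStore_get_0' },
    size: { type: 'GETTER', value: 'globalStore_size_1' },
    name: { type: 'VALUE', value: 'globalStore' },
};
const storeProxy = window.Apify.createNodeProxy(config);
// storeProxy.get('someKey') now invokes the Node-side function and
// returns its (promised) result.
```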

src/crawler_setup.js

This file is 580 lines long. Only the first 50 are shown.

const Apify = require('apify');
const _ = require('underscore');
const httpProxy = require('http-proxy');
const url = require('url');
const contentType = require('content-type');
const {
    tools,
    browserTools,
    constants: { META_KEY, DEFAULT_VIEWPORT, DEVTOOLS_TIMEOUT_SECS },
} = require('@apify/scraper-tools');
const rp = require('request-promise');

const GlobalStore = require('./global_store');
const createBundle = require('./bundle.browser');
const SCHEMA = require('../INPUT_SCHEMA');

const CHROME_DEBUGGER_PORT = 4322;

const { utils: { log, puppeteer } } = Apify;

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 *
 * @typedef {Object} Input
 * @property {Object[]} startUrls
 * @property {boolean} useRequestQueue
 * @property {Object[]} pseudoUrls
 * @property {string} linkSelector
 * @property {boolean} keepUrlFragments
 * @property {string} pageFunction
 * @property {Object} proxyConfiguration
 * @property {boolean} debugLog
 * @property {boolean} browserLog
 * @property {boolean} injectJQuery
 * @property {boolean} injectUnderscore
 * @property {boolean} downloadMedia
 * @property {boolean} downloadCss
 * @property {number} maxRequestRetries
 * @property {number} maxPagesPerCrawl
 * @property {number} maxResultsPerCrawl
 * @property {number} maxCrawlingDepth
 * @property {number} maxConcurrency
 * @property {number} pageLoadTimeoutSecs
 * @property {number} pageFunctionTimeoutSecs
 * @property {Object} customData
 * @property {Array} initialCookies
 * @property {Array} waitUntil
 * @property {boolean} useChrome
 * @property {boolean} useStealth

src/global_store.js

/**
 * GlobalStore is a trivial storage that resembles a Map to be used from Browser contexts
 * to retain data through page navigations and browser instances. It limits Map's functionality
 * because it's currently impossible for functions and object references to cross Node-Browser threshold.
 */
class GlobalStore extends Map {
    /* eslint-disable class-methods-use-this */
    get(key) {
        if (typeof key !== 'string') throw new Error('GlobalStore#get parameter "key" must be a string.');
        return super.get(key);
    }

    set(key, value) {
        if (typeof key !== 'string') throw new Error('GlobalStore#set parameter "key" must be a string.');
        return super.set(key, value);
    }

    forEach() {
        throw new Error('GlobalStore#forEach function is not available due to underlying technology limitations.');
    }

    values() {
        return Array.from(super.values());
    }

    keys() {
        return Array.from(super.keys());
    }

    entries() {
        return Array.from(super.entries());
    }

    get size() {
        return super.size;
    }
}

module.exports = GlobalStore;
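
Usage from a Page function might look like the following sketch. It assumes the store is exposed as `context.globalStore` (consistent with the `browserHandles` in the test below) and that calls are asynchronous when they cross the Browser-Node boundary.

```js
// Sketch: retain state across page navigations from the Page function.
async function pageFunction(context) {
    const { globalStore } = context;
    // Keys must be strings; values must survive serialization across
    // the Node-Browser boundary.
    const count = (await globalStore.get('pagesVisited')) || 0;
    await globalStore.set('pagesVisited', count + 1);
    return { pagesVisited: count + 1 };
}
```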

test/bundle.browser.js

const { expect } = require('chai');
const Apify = require('apify');
const createBundle = require('../src/bundle.browser');

const NAMESPACE = 'Apify';

describe('Bundle', () => {
    let browser;
    let page;
    before(async () => {
        browser = await Apify.launchPuppeteer({ headless: true });
    });
    after(async () => {
        await browser.close();
    });
    beforeEach(async () => {
        page = await browser.newPage();
        await page.evaluateOnNewDocument(createBundle, NAMESPACE);
    });
    afterEach(async () => {
        await page.close();
    });
    describe('Context', () => {
        const CONTEXT_OPTIONS = {
            crawlerSetup: {
                rawInput: '{}',

            },
            browserHandles: {
                apify: {},
                globalStore: {},
                log: {},
            },
            pageFunctionArguments: {
                request: {},
            },
        };
        beforeEach(async () => {
            await page.goto('about:chrome');
            await page.waitFor(namespace => !!window[namespace], {}, NAMESPACE);
            await page.evaluate((namespace, contextOptions) => {
                window.contextInstance = window[namespace].createContext(contextOptions);
            }, NAMESPACE, CONTEXT_OPTIONS);
        });
        describe('waitFor', () => {
            it('should work with a number', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    await ctx.waitFor(10);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a selector', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    setTimeout(() => {
                        const el = document.createElement('div');
                        el.id = 'very-unique-id';
                        document.body.appendChild(el);
                    }, 10);
                    await ctx.waitFor('#very-unique-id');
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a function', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    let done = false;
                    const start = Date.now();
                    setTimeout(() => {
                        done = true;
                    }, 10);
                    await ctx.waitFor(() => done);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
        });
    });
});