
Article Text Extractor
Pricing
Pay per usage
Go to Store

Article Text Extractor
Extracts the article text and other metadata from a given URL. Uses node-unfluff (https://github.com/ageitgey/node-unfluff), which is a Node.js implementation of python-goose (https://github.com/grangier/python-goose).
5.0 (1)
Pricing
Pay per usage
12
Total users
1k
Monthly users
16
Runs succeeded
>99%
Last modified
2 years ago
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as
# /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step. Keeping this
# as its own step speeds up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Progress output is disabled to avoid logging too
# much, and the dependency tree plus the Node.js and NPM versions are
# printed for debugging. "|| true" keeps a non-zero exit status from
# "npm list" from failing the whole RUN step.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to the container.
# Do this in the last step, to have a fast build if only the source code changed.
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.21.10", "request-promise": "latest", "unfluff": "latest" }, "scripts": { "start": "node main.js" }}
main.js
const Apify = require('apify');
// NOTE(review): `request` is not referenced anywhere in the visible code;
// kept in case other parts of the project rely on it being loaded.
const request = require('request-promise');
const extractor = require('unfluff');

/**
 * Actor entry point.
 *
 * Reads `{ url }` from the INPUT record, loads that URL in a Puppeteer
 * browser, stores the page's HTML under the `page.html` key and stores the
 * result of running the `unfluff` extractor on that HTML under `OUTPUT`.
 *
 * @throws {Error} When the INPUT record is missing or has no `url`.
 */
Apify.main(async () => {
    // Fall back to an empty object so that a missing INPUT record produces
    // the explicit error below rather than a destructuring TypeError.
    const input = (await Apify.getValue('INPUT')) || {};
    const { url } = input;

    if (!url) throw new Error('INPUT.url must be provided!!!');

    console.log('Opening browser ...');
    const browser = await Apify.launchPuppeteer();

    let html;
    try {
        console.log('Loading url ...');
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        html = await page.evaluate(() => document.documentElement.outerHTML);
    } finally {
        // Always release the browser, even when navigation or evaluation
        // fails, so the Chrome process is not left running.
        await browser.close();
    }

    await Apify.setValue('page.html', html, { contentType: 'text/html' });

    console.log('Extracting article data and saving results to key-value store ...');
    await Apify.setValue('OUTPUT', extractor(html));

    console.log('Done!');
});