
PDF to HTML Converter
Deprecated
Pricing
Pay per usage
Go to Store

PDF to HTML Converter
Deprecated
Converts a PDF document to HTML using the pdf2htmlEX tool.
0.0 (0)
Pricing
Pay per usage
7
Total users
444
Monthly users
1
Runs succeeded
>99%
Last modified
2 years ago
Dockerfile
FROM debian:jessie

# Install curl, CA certificates, pdf2htmlEX and Node.js (via the NodeSource 10.x setup script),
# print the Node.js version, then remove the apt package lists.
RUN apt-get update --fix-missing \
 && DEBIAN_FRONTEND=noninteractive apt-get -y upgrade \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl ca-certificates pdf2htmlex \
 && curl -sL https://deb.nodesource.com/setup_10.x | bash - \
 && apt-get install -y nodejs \
 && node -v \
 && rm -rf /var/lib/apt/lists/*

# Create the working directory together with the kv-store-dev sub-directory.
RUN mkdir -p /pdf/kv-store-dev

WORKDIR /pdf

# Copy the actor's entry point and package manifest into the Docker image
COPY main.js package.json ./

# Install NPM packages, skip optional and development dependencies to keep the image small,
# avoid logging too much and show the dependency tree
RUN npm install --quiet --only=prod --no-optional \
 && npm list \
 && pwd \
 && ls -l

# Define the start command
CMD [ "node", "main.js" ]
INPUT_SCHEMA.json
{ "title": "PDF to HTML input", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL", "type": "string", "description": "URL that links to a PDF file", "editor": "textfield", "prefill": "https://apify.com/ext/ycf_application.pdf" } }, "required": ["url"]}
main.js
const fs = require('fs');
const util = require('util');
const exec = util.promisify(require('child_process').exec);
const Apify = require('apify');
const requestPromise = require('request-promise');

// Name of the temporary PDF written to the working directory, and of the HTML file
// that is read back after pdf2htmlEX has run on it.
const TMP_PDF_FILE = 'temp.pdf';
const TMP_HTML_FILE = 'temp.html';

/**
 * Actor entry point.
 *
 * Reads the `INPUT` record, downloads the PDF from `input.url`, converts it with the
 * `pdf2htmlEX` command-line tool and stores the resulting HTML under the `OUTPUT` key
 * with content type `text/html`.
 *
 * Throws an Error when the input is missing or has no `url`; failures of the download,
 * of the `pdf2htmlEX` process or of the file operations propagate as rejections.
 */
Apify.main(async () => {
    // Fetch the input and check it has a valid format.
    // You don't need to check the input, but it's a good practice.
    const input = await Apify.getValue('INPUT');
    if (!input || !input.url) throw new Error('Received invalid input');

    console.log(`Downloading PDF file: ${input.url}`);
    const options = {
        url: input.url,
        encoding: null, // set to `null`, if you expect binary data.
    };
    const response = await requestPromise(options);
    const buffer = Buffer.from(response);

    const tmpTarget = TMP_PDF_FILE;
    console.log(`Saving PDF file to: ${tmpTarget}`);
    fs.writeFileSync(tmpTarget, buffer);

    // The command is built only from constants, so no user input reaches the shell.
    const { stdout, stderr } = await exec(`pdf2htmlEX --zoom 1.3 ${tmpTarget}`);
    console.log('stdout:', stdout);
    console.log('stderr:', stderr);

    // The converted document is expected next to the input file, as temp.html.
    const htmlBuffer = fs.readFileSync(TMP_HTML_FILE);

    console.log(`Saving HTML (size: ${htmlBuffer.length} bytes) to output...`);
    await Apify.setValue('OUTPUT', htmlBuffer, { contentType: 'text/html' });

    const storeId = process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID;

    // NOTE(review): the previous comment here said a `disableRedirect=1` query parameter is
    // added to this URL (to work around Chrome refusing pasted URLs that redirect), but the
    // URL printed below does not contain it. Confirm whether the parameter is still needed.
    console.log('HTML file has been stored to:');
    console.log(`https://api.apify.com/v2/key-value-stores/${storeId}/records/OUTPUT`);
});
package.json
{ "name": "act-pdf-to-html", "version": "0.0.1", "private": true, "dependencies": { "apify": "^0.15.2", "request-promise": "^4.2.4" }, "devDependencies": {}, "scripts": { "test-local": "APIFY_DEV_KEY_VALUE_STORE_DIR=./kv-store-dev/ node main.js" }}