PDF to HTML Converter avatar
PDF to HTML Converter

Deprecated

Pricing

Pay per usage

Go to Store
PDF to HTML Converter

PDF to HTML Converter

Deprecated

Developed by

Jan Čurn

Jan Čurn

Maintained by Community

Converts a PDF document to HTML using the pdf2htmlEX tool.

0.0 (0)

Pricing

Pay per usage

7

Total users

444

Monthly users

1

Runs succeeded

>99%

Last modified

2 years ago

Dockerfile

# Image for the PDF to HTML actor: Debian (jessie) with pdf2htmlEX and Node.js installed.
FROM debian:jessie
# Upgrade the base system, install curl, CA certificates and pdf2htmlEX from the
# Debian repositories, then run the NodeSource setup script for the 10.x line and
# install Node.js. `node -v` prints the installed version to the build log, and the
# apt package lists are deleted at the end of the same layer.
RUN apt-get update --fix-missing \
&& DEBIAN_FRONTEND=noninteractive apt-get -y upgrade \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl ca-certificates pdf2htmlex \
&& curl -sL https://deb.nodesource.com/setup_10.x | bash - \
&& apt-get install -y nodejs \
&& node -v \
&& rm -rf /var/lib/apt/lists/*
# Create the working directory together with the kv-store-dev subdirectory
# (the test-local script in package.json points APIFY_DEV_KEY_VALUE_STORE_DIR at ./kv-store-dev/)
RUN mkdir -p /pdf/kv-store-dev
WORKDIR /pdf
# Copy the actor source file and its package manifest into the working directory
COPY main.js package.json ./
# Install NPM packages, skipping optional and development dependencies to keep the image small
# and avoiding too much log output; then print the dependency tree, the working directory
# and its file listing to the build log
RUN npm install --quiet --only=prod --no-optional \
&& npm list \
&& pwd \
&& ls -l
# Define the start command
CMD [ "node", "main.js" ]

INPUT_SCHEMA.json

{
"title": "PDF to HTML input",
"type": "object",
"schemaVersion": 1,
"properties": {
"url": {
"title": "URL",
"type": "string",
"description": "URL that links to a PDF file",
"editor": "textfield",
"prefill": "https://apify.com/ext/ycf_application.pdf"
}
},
"required": ["url"]
}

main.js

const fs = require('fs');
const util = require('util');
const exec = util.promisify(require('child_process').exec);
const Apify = require('apify');
const requestPromise = require('request-promise');

// Working files in the current directory. The PDF is downloaded to PDF_PATH and the
// converted document is read back from HTML_PATH after pdf2htmlEX has run on the PDF.
const PDF_PATH = 'temp.pdf';
const HTML_PATH = 'temp.html';

/**
 * Actor entry point.
 *
 * Reads `{ url }` from the INPUT record, downloads the PDF at that URL, converts it
 * with the `pdf2htmlEX` command-line tool and stores the resulting HTML under the
 * OUTPUT key of the default key-value store with content type `text/html`.
 *
 * Throws `Error('Received invalid input')` when the input is missing or `url` is not
 * a non-empty string. Download, conversion and file-system errors propagate unchanged.
 */
Apify.main(async () => {
    // Fetch the input and check it has a valid format.
    const input = await Apify.getValue('INPUT');
    if (!input || typeof input.url !== 'string' || input.url === '') {
        throw new Error('Received invalid input');
    }

    console.log(`Downloading PDF file: ${input.url}`);
    // `encoding: null` makes the request library return the body as a raw Buffer
    // instead of decoding it to a string, so it can be written to disk as-is.
    const pdfBuffer = await requestPromise({
        url: input.url,
        encoding: null,
    });

    console.log(`Saving PDF file to: ${PDF_PATH}`);
    fs.writeFileSync(PDF_PATH, pdfBuffer);

    // PDF_PATH is a fixed constant, so interpolating it into the shell command is safe.
    const { stdout, stderr } = await exec(`pdf2htmlEX --zoom 1.3 ${PDF_PATH}`);
    console.log('stdout:', stdout);
    console.log('stderr:', stderr);

    const htmlBuffer = fs.readFileSync(HTML_PATH);

    console.log(`Saving HTML (size: ${htmlBuffer.length} bytes) to output...`);
    await Apify.setValue('OUTPUT', htmlBuffer, { contentType: 'text/html' });

    // The store ID is only present in the environment when it has been provided
    // (e.g. by the platform); without it there is no meaningful API URL to print.
    const storeId = process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID;
    if (storeId) {
        console.log('HTML file has been stored to:');
        console.log(`https://api.apify.com/v2/key-value-stores/${storeId}/records/OUTPUT`);
    } else {
        console.log('HTML file has been stored under the OUTPUT key of the default key-value store.');
    }
});

package.json

{
"name": "act-pdf-to-html",
"version": "0.0.1",
"private": true,
"dependencies": {
"apify": "^0.15.2",
"request-promise": "^4.2.4"
},
"devDependencies": {},
"scripts": {
"test-local": "APIFY_DEV_KEY_VALUE_STORE_DIR=./kv-store-dev/ node main.js"
}
}