PDF Text Extractor avatar
PDF Text Extractor
Try for free

No credit card required

View all Actors
PDF Text Extractor

PDF Text Extractor

jirimoravcik/pdf-text-extractor
Try for free

No credit card required

PDF Text Extractor allows you to extract text from PDF files. It also supports chunking of the text to prepare the data for usage with large language models.

.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python:3.11
5
6# Second, copy just requirements.txt into the Actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Specify how to launch the source code of your Actor.
29# By default, the "python3 -m src" command is run
30CMD ["python3", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-7",
4    "title": "PDF Text Extractor",
5    "description": "Extract text from PDF files, optionally chunked for use with large language models.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "python-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PDF Text Extractor input",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "urls": {
7            "title": "URLs of the PDFs",
8            "type": "array",
9            "description": "The URLs of the PDF files.",
10            "prefill": ["https://arxiv.org/pdf/2307.12856"],
11            "editor": "stringList"
12        },
13        "performChunking": {
14            "title": "Perform chunking",
15            "description": "If set to true, the resulting text will be chunked according to the settings below",
16            "default": false,
17            "type": "boolean"
18        },
19        "chunkSize": {
20            "title": "Chunk size",
21            "type": "integer",
22            "description": "The maximum character length of each text chunk",
23            "default": 1000,
24            "minimum": 1
25        },
26        "chunkOverlap": {
27            "title": "Chunk overlap",
28            "type": "integer",
29            "description": "The character overlap between text chunks that are next to each other",
30            "default": 0,
31            "minimum": 0
32        }
33    },
34    "required": ["urls", "performChunking"]
35}

src/__init__.py

src/__main__.py

1import asyncio
2import logging
3
4from apify.log import ActorLogFormatter
5
6from .main import main
7
# Send log records emitted by the Apify SDK and the Apify API client to one
# shared stream handler that renders them with the Actor log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_logger = logging.getLogger('apify')

# The client logger is limited to INFO, while the SDK logger also emits DEBUG.
for sdk_logger, level in (
    (apify_client_logger, logging.INFO),
    (apify_logger, logging.DEBUG),
):
    sdk_logger.setLevel(level)
    sdk_logger.addHandler(handler)

# Run the Actor's entry-point coroutine to completion.
asyncio.run(main())

src/main.py

1from apify import Actor
2import pypdfium2 as pdfium
3import httpx, io, logging
4from langchain.text_splitter import RecursiveCharacterTextSplitter
5
def _extract_page_texts(pdf_bytes):
    """Return the text of each page of the PDF held in *pdf_bytes*, in page order.

    CRLF line endings in the extracted text are normalised to LF. The text
    page, page and document objects are closed explicitly once they are no
    longer needed instead of being left to garbage collection.
    """
    pdf_document = pdfium.PdfDocument(io.BytesIO(pdf_bytes))
    try:
        page_texts = []
        for page in pdf_document:
            try:
                textpage = page.get_textpage()
                try:
                    page_texts.append(textpage.get_text_range().replace('\r\n', '\n'))
                finally:
                    textpage.close()
            finally:
                page.close()
        return page_texts
    finally:
        pdf_document.close()


async def main():
    """Download every PDF listed in the Actor input and push its text to the dataset.

    Input fields read:
      * ``urls`` - list of PDF URLs; a missing or empty value means nothing is processed.
      * ``performChunking`` - when truthy, the page texts are joined with newlines
        and re-split by ``RecursiveCharacterTextSplitter``.
      * ``chunkSize`` / ``chunkOverlap`` - splitter settings; when absent they fall
        back to 1000 and 0, the defaults declared in the input schema.

    One record ``{'url', 'index', 'text'}`` is pushed per page (or per chunk when
    chunking is enabled). A failure while processing one URL is logged with its
    traceback and does not stop the remaining URLs. An invalid chunk
    configuration raises before any URL is processed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # Guard against a missing 'urls' key so the loop below cannot hit None.
        urls = actor_input.get('urls') or []
        perform_chunking = actor_input.get('performChunking')

        # The splitter does not depend on the URL, so build it once up front.
        text_splitter = None
        if perform_chunking:
            chunk_size = actor_input.get('chunkSize')
            chunk_overlap = actor_input.get('chunkOverlap')
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000 if chunk_size is None else chunk_size,
                chunk_overlap=0 if chunk_overlap is None else chunk_overlap,
            )

        # An async client keeps the event loop free while downloading and
        # reuses connections across URLs; redirects are followed so that a
        # redirecting PDF link still yields the PDF itself.
        async with httpx.AsyncClient(follow_redirects=True) as client:
            for url in urls:
                try:
                    response = await client.get(url)
                    # Fail on HTTP errors instead of parsing an error page as a PDF.
                    response.raise_for_status()

                    page_texts = _extract_page_texts(response.content)
                    if text_splitter is not None:
                        page_texts = text_splitter.split_text('\n'.join(page_texts))

                    for i, text in enumerate(page_texts):
                        await Actor.push_data({
                            'url': url,
                            'index': i,
                            'text': text,
                        })
                except Exception:
                    # Per-URL boundary: log the failure with its traceback and
                    # continue with the next URL.
                    logging.exception('Could not process URL %s due to exception', url)

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
13
14.venv/
15.env/
16__pypackages__
17dist/
18build/
19*.egg-info/
20*.egg
21
22__pycache__
23
24.mypy_cache
25.dmypy.json
26dmypy.json
27.pytest_cache
28
29.scrapy
30*.log

requirements.txt

1# Add your dependencies here.
2# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
3# for how to format them
4apify ~= 1.1.5
5httpx ~= 0.25.0
6pypdfium2 ~= 4.23.1
7langchain ~= 0.0.329
Developer
Maintained by Community
Actor metrics
  • 33 monthly users
  • 2 stars
  • 100.0% runs succeeded
  • 2 days response time
  • Created in Oct 2023
  • Modified about 2 months ago