# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q src/

# Create and run as a non-root user.
# The recursive chown also covers the working directory itself.
RUN useradd --create-home apify && \
    chown -R apify:apify ./
USER apify

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "finstat-revenue-scraper",
  "title": "Finstat Revenue Scraper",
  "description": "Scrapes Slovak companies from Finstat and extracts revenue, NACE, employees, and VAT info.",
  "version": "0.1",
  "buildTag": "latest",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "views": {
        "overview": {
          "title": "Overview",
          "transformation": {
            "fields": [
              "ico",
              "ic_dph",
              "sidlo",
              "sk_nace",
              "established_date",
              "employees_text",
              "min_employees",
              "revenue_2019",
              "revenue_2020",
              "revenue_2021",
              "revenue_2022",
              "revenue_2023",
              "revenue_2024"
            ]
          },
          "display": {
            "component": "table",
            "properties": {
              "ico": { "label": "ICO", "format": "text" },
              "ic_dph": { "label": "IČ DPH", "format": "text" },
              "sidlo": { "label": "Sídlo", "format": "text" },
              "sk_nace": { "label": "SK NACE", "format": "text" },
              "established_date": { "label": "Dátum vzniku", "format": "text" },
              "employees_text": { "label": "Počet zamestnancov", "format": "text" },
              "min_employees": { "label": "Min. zamestnanci", "format": "number" },
              "revenue_2019": { "label": "Tržby 2019", "format": "number" },
              "revenue_2020": { "label": "Tržby 2020", "format": "number" },
              "revenue_2021": { "label": "Tržby 2021", "format": "number" },
              "revenue_2022": { "label": "Tržby 2022", "format": "number" },
              "revenue_2023": { "label": "Tržby 2023", "format": "number" },
              "revenue_2024": { "label": "Tržby 2024", "format": "number" }
            }
          }
        }
      }
    }
  },
  "input": "./input_schema.json",
  "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Finstat Revenue Scraper",
    "description": "Scrapes Tržby (Revenue) data from Finstat for given ICOs.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "icos": {
            "title": "ICO List",
            "type": "array",
            "description": "List of ICOs to process (i.e. Slovak company identification numbers)",
            "editor": "stringList",
            "prefill": ["35882085", "36468924", "36249955"]
        }
    },
    "required": ["icos"]
}

src/__init__.py

src/__main__.py

"""Entry script: runs the Actor's ``main()`` coroutine to completion."""

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())

src/main.py

1from apify import Actor
2from bs4 import BeautifulSoup
3from curl_cffi.requests import AsyncSession
4import asyncio
5import json
6import re
7import time
8
# Shared limit used by fetch_and_parse: at most three fetches hold it at once.
semaphore = asyncio.Semaphore(3)
10
11
def extract_trzby_data(text):
    """Extract the "Tržby" chart series embedded in a page as JSON.

    The page text is searched for the JSON series literal that starts with
    the "Tržby" marker and for the first ``categories: [...]`` array. Each
    category is paired with the data point at the same position.

    Args:
        text: Raw page source (HTML with inline chart configuration).

    Returns:
        A dict mapping each category to the ``"y"`` value of the data point
        at the same index, or ``None`` when the series marker or the
        categories array is missing or cannot be parsed. Data points that
        are not objects with a ``"y"`` key are skipped without shifting the
        remaining values onto other categories.
    """
    series_marker = '[{"name":"Tr\\u017Eby","showInLegend":false,"data":['
    start_idx = text.find(series_marker)
    if start_idx == -1:
        return None

    categories_match = re.search(r'categories\s*:\s*(\[[^\]]+\])', text)
    if not categories_match:
        return None

    try:
        # raw_decode parses exactly one JSON value starting at start_idx and
        # ignores the trailing page content, so brackets that appear inside
        # JSON strings cannot confuse the extraction.
        series, _ = json.JSONDecoder().raw_decode(text, start_idx)
        categories = json.loads(categories_match.group(1))

        if not (isinstance(series, list) and series and isinstance(series[0], dict)):
            return None
        data = series[0].get("data")
        if not isinstance(data, list):
            return None

        # Pair by position first, then filter, so a point without "y" leaves
        # a gap instead of moving later values to the wrong category.
        return {
            category: entry["y"]
            for category, entry in zip(categories, data)
            if isinstance(entry, dict) and "y" in entry
        }
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers
        # unhashable category values.
        return None
46
47
def extract_ic_dph_sidlo(text):
    """Extract the VAT ID ("IČ DPH") and registered seat ("Sídlo") from a page.

    Each value is read from the first ``<span>`` that follows the
    ``<strong>`` label containing the respective caption.

    Args:
        text: HTML source of the page.

    Returns:
        A tuple ``(ic_dph, sidlo)``. ``ic_dph`` is the first ``SK<digits>``
        token found in the span text; ``sidlo`` is the span's full text.
        Either element is ``None`` when its label, its span, or (for
        ``ic_dph``) the ``SK<digits>`` token is not found.
    """
    soup = BeautifulSoup(text, 'html.parser')
    ic_dph = None
    sidlo = None

    ic_dph_element = soup.find('strong', string=lambda t: t and "IČ DPH" in t)
    if ic_dph_element:
        # find_next returns None when no <span> follows the label; guard it
        # so a missing span yields None instead of raising AttributeError.
        ic_dph_span = ic_dph_element.find_next('span')
        if ic_dph_span is not None:
            span_text = ic_dph_span.get_text(" ", strip=True)
            match = re.search(r'(SK\d+)', span_text)
            if match:
                ic_dph = match.group(1)

    sidlo_element = soup.find('strong', string=lambda t: t and "Sídlo" in t)
    if sidlo_element:
        sidlo_span = sidlo_element.find_next('span')
        if sidlo_span is not None:
            sidlo = sidlo_span.get_text(" ", strip=True)

    return ic_dph, sidlo
65
66
def extract_sk_nace(text):
    """Return the "SK NACE" value from the page HTML, or ``None``.

    The value is taken from the first ``<span>`` after the ``<strong>``
    label containing "SK NACE". If that span holds a ``<div>``, the div's
    text is returned; otherwise the span's own text is used.
    """
    document = BeautifulSoup(text, 'html.parser')
    label = document.find('strong', string=lambda t: t and "SK NACE" in t)
    if not label:
        return None

    value_span = label.find_next('span')
    if not value_span:
        return None

    # Prefer the nested <div> when present, otherwise fall back to the span.
    inner_div = value_span.find('div')
    text_source = inner_div if inner_div else value_span
    return text_source.get_text(" ", strip=True)
76
77
def extract_employees(text):
    """Return the text next to the "Počet zamestnancov" label, or ``None``.

    The text comes from the first ``<span>`` following the ``<strong>``
    label; ``None`` is returned when the label or the span is absent.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    heading = parsed.find('strong', string=lambda t: t and "Počet zamestnancov" in t)
    value_span = heading.find_next('span') if heading else None
    return value_span.get_text(" ", strip=True) if value_span else None
86
87
def extract_min_employees(employees_text):
    """Return the lower bound of an employee-count range such as "10-19".

    Args:
        employees_text: Text describing the employee count, or ``None``.

    Returns:
        The first number of the first ``<number> <dash> <number>`` range in
        the text as an ``int``. Returns ``0`` when the text is empty or
        ``None``, or when it contains no such range.
    """
    if not employees_text:
        return 0
    # Accept the ASCII hyphen as well as the en dash (U+2013) and em dash
    # (U+2014), which are common in typeset numeric ranges.
    match = re.search(r'(\d+)\s*[-\u2013\u2014]\s*\d+', employees_text)
    return int(match.group(1)) if match else 0
93
94
def extract_established_date(text):
    """Return the text next to the "Dátum vzniku" label, or ``None``.

    Looks up the ``<strong>`` label containing "Dátum vzniku" and returns
    the text of the first ``<span>`` after it. ``None`` is returned when
    either element is missing.
    """
    page = BeautifulSoup(text, 'html.parser')
    label = page.find('strong', string=lambda t: t and "Dátum vzniku" in t)
    if not label:
        return None

    date_span = label.find_next('span')
    if not date_span:
        return None

    return date_span.get_text(" ", strip=True)
103
104
async def fetch_and_parse(ico, session):
    """Fetch the Finstat page for one ICO and build its result record.

    The request and the parsing both run while holding the module-level
    ``semaphore``.

    Args:
        ico: Company identifier appended to the Finstat base URL.
        session: Session object whose awaitable ``get(url, timeout=...)``
            returns a response exposing ``.text``.

    Returns:
        On success, a dict with ``ico``, ``ic_dph``, ``sidlo``, ``sk_nace``,
        ``employees_text``, ``min_employees``, ``established_date`` and one
        ``revenue_<category>`` key per extracted chart category. If any
        exception is raised, a dict with ``ico`` and ``error`` instead.
    """
    url = f"https://www.finstat.sk/{ico}"
    async with semaphore:
        try:
            resp = await session.get(url, timeout=20)
            html = resp.text

            # Parse the revenue chart once; a missing chart yields no revenue keys.
            revenue_by_year = extract_trzby_data(html) or {}
            flat_revenue = {f"revenue_{year}": value for year, value in revenue_by_year.items()}
            ic_dph, sidlo = extract_ic_dph_sidlo(html)
            sk_nace = extract_sk_nace(html)
            employees = extract_employees(html)
            min_employees = extract_min_employees(employees)
            established_date = extract_established_date(html)

            return {
                "ico": ico,
                "ic_dph": ic_dph,
                "sidlo": sidlo,
                "sk_nace": sk_nace,
                "employees_text": employees,
                "min_employees": min_employees,
                "established_date": established_date,
                **flat_revenue,  # Spread the revenue_YYYY keys directly
            }
        except Exception as e:
            # Best-effort: one failing ICO must not abort the whole batch.
            # Some exceptions carry no message, so fall back to the class
            # name to keep the error field informative.
            return {
                "ico": ico,
                "error": str(e) or type(e).__name__,
            }
136
137
async def main():
    """Actor entry point: scrape every ICO from the input and push the results.

    Reads the ``icos`` list from the Actor input, fetches and parses all of
    them concurrently over one shared ``AsyncSession``, pushes the resulting
    records with ``Actor.push_data`` and logs the elapsed time. Logs a
    warning and returns early when no ICOs are provided.
    """
    async with Actor:
        # get_input() may yield None when no input was supplied; treat that
        # the same as an empty input instead of failing on .get().
        input_data = await Actor.get_input() or {}
        icos = input_data.get("icos", [])
        if not icos:
            Actor.log.warning("No ICOs provided in input.")
            return

        # perf_counter is monotonic, so the measured duration is unaffected
        # by wall-clock adjustments.
        start = time.perf_counter()

        async with AsyncSession() as session:
            tasks = [fetch_and_parse(ico, session) for ico in icos]
            results = await asyncio.gather(*tasks)

        await Actor.push_data(results)
        Actor.log.info(f"Finished in {time.perf_counter() - start:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Zed editor
#  Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
#  to share Project Settings within a team 
.zed

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5beautifulsoup4[lxml]
6httpx
7types-beautifulsoup4
8curl_cffi

Rental as PPE Example

lukaskrivka/rental-as-ppe-example

Example Actor that simulates rental payments (with free results) in the PPE billing system

Lukáš Křivka

ScraperCodeGenerator

ohlava/ScraperCodeGenerator

An intelligent web scraping tool that automatically generates custom scraping code for any website.

Ondřej Hlava

🔍 Bing Copilot [API]

openapi/bing-copilot

Use Bing's Copilot via API! No API key. Fast, cheap and reliable.

Open API

5.0

Getlatka Scraper

build_matrix/getlatka-scraper

Get detailed information about SaaS companies from GetLatka.

Build Matrix

Pinterest Ads Scraper

lexis-solutions/pinterest-ads-scraper

Scrape Pinterest ads to extract trending product insights, campaign performance, and audience data. Ideal for market research, competitor tracking, and digital marketing optimization. Fast, structured, and customizable ad data.

Lexis Solutions

5.0

Phone Number Formatter

dominic-quaiser/phone-number-formatter

Easily parse and format phone numbers in bulk with this Apify Actor. Supports E.164, International, National, and RFC3966 formats, configurable regions, batch processing, concurrency, rate limiting, and retries. Ideal for CRMs, SMS campaigns, and data migrations.

Dominic M. Quaiser

🔍 GPT Search [Private API]

openapi/gpt-search-private-api

Use OpenAI's GPT4o Search mode via API! No cookie or proxy is required. Fast, cheap and reliable.

Open API

5.0

Free Proxy Scraper

codepoetry/free-proxy-fetcher

Scrape and filter 100% working free proxies with our tool. Filter by country codes and protocols (HTTP, SSL, SOCKS4, SOCKS5), then sort by speed. Ideal for web scraping, SEO, or bypassing restrictions. Get a ready-to-use list in seconds with automated validation and sorting.