Finstat Scraper avatar
Finstat Scraper

Pricing

$2.00 / 1,000 results

Go to Store
Finstat Scraper

Finstat Scraper

Developed by

Miso

Miso

Maintained by Community

Scrapes information about Slovak companies from Finstat by IČO (company identification number). Finstat scraper na sťahovanie info o slovenských firmách podľa IČO.

0.0 (0)

Pricing

$2.00 / 1,000 results

0

Total users

2

Monthly users

2

Runs succeeded

>99%

Last modified

2 days ago

.actor/Dockerfile

# Base image for the Actor.
# Apify's images are listed at https://hub.docker.com/r/apify/;
# any other image from Docker Hub can be used instead.
FROM apify/actor-python:3.13
# Copy only requirements.txt first. It should be the only file that affects
# the dependency install in the next step, so copying it alone speeds up
# the build.
COPY requirements.txt ./
# Install the packages listed in requirements.txt.
# The Python version, the pip version and the full list of installed packages
# are printed as well, to help with debugging.
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Copy the remaining files and directories with the source code.
# Because this happens after the dependency install, builds stay fast
# for most source file changes.
COPY . ./
# Byte-compile everything under src/ with compileall to check that the Actor's
# Python code is runnable.
RUN python3 -m compileall -q src/
# Create a non-root user "apify", hand it ownership of the copied files,
# and run as that user from here on.
RUN useradd --create-home apify && \
chown -R apify:apify ./
USER apify
# Launch command for the Actor: run the "src" package as a module.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "finstat-revenue-scraper",
"title": "Finstat Revenue Scraper",
"description": "Scrapes Slovak companies from Finstat and extracts revenue, NACE, employees, and VAT info.",
"version": "0.1",
"buildTag": "latest",
"storages": {
"dataset": {
"actorSpecification": 1,
"views": {
"overview": {
"title": "Overview",
"transformation": {
"fields": [
"ico",
"ic_dph",
"sidlo",
"sk_nace",
"established_date",
"employees_text",
"min_employees",
"revenue_2019",
"revenue_2020",
"revenue_2021",
"revenue_2022",
"revenue_2023",
"revenue_2024"
]
},
"display": {
"component": "table",
"properties": {
"ico": { "label": "ICO", "format": "text" },
"ic_dph": { "label": "IČ DPH", "format": "text" },
"sidlo": { "label": "Sídlo", "format": "text" },
"sk_nace": { "label": "SK NACE", "format": "text" },
"established_date": { "label": "Dátum vzniku", "format": "text" },
"employees_text": { "label": "Počet zamestnancov", "format": "text" },
"min_employees": { "label": "Min. zamestnanci", "format": "number" },
"revenue_2019": { "label": "Tržby 2019", "format": "number" },
"revenue_2020": { "label": "Tržby 2020", "format": "number" },
"revenue_2021": { "label": "Tržby 2021", "format": "number" },
"revenue_2022": { "label": "Tržby 2022", "format": "number" },
"revenue_2023": { "label": "Tržby 2023", "format": "number" },
"revenue_2024": { "label": "Tržby 2024", "format": "number" }
}
}
}
}
}
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Finstat Revenue Scraper",
"description": "Scrapes revenue (Tržby), VAT ID, registered address, SK NACE, employee count and founding date from Finstat for the given IČOs.",
"type": "object",
"schemaVersion": 1,
"properties": {
"icos": {
"title": "ICO List",
"type": "array",
"description": "List of IČO numbers (Slovak company identification numbers) to process.",
"editor": "stringList",
"prefill": ["35882085", "36468924", "36249955"]
}
},
"required": ["icos"]
}

src/__init__.py

1

src/__main__.py

1import asyncio
2
3from .main import main
4
# Package entry point: drive the Actor's async main() to completion.
asyncio.run(main())

src/main.py

1from apify import Actor
2from bs4 import BeautifulSoup
3from curl_cffi.requests import AsyncSession
4import asyncio
5import json
6import re
7import time
8
# Shared limiter: at most 3 coroutines may hold it at the same time.
semaphore = asyncio.Semaphore(3)
10
11
def extract_trzby_data(text):
    """Extract the "Tržby" (revenue) chart series embedded in a page.

    The page source is searched for a JSON series that starts with the
    ``Tržby`` marker below and for the first ``categories: [...]`` array.
    Each category is paired with the data point at the same position.

    Args:
        text: Raw page source to search.

    Returns:
        A dict mapping each category to the ``y`` value of its data point.
        Categories whose data point has no ``y`` key are left out.
        ``None`` is returned when the series or the categories cannot be
        found or parsed.
    """
    series_marker = '[{"name":"Tr\\u017Eby","showInLegend":false,"data":['
    start_idx = text.find(series_marker)
    if start_idx == -1:
        return None

    # The marker opens the series array; scan forward to the matching ']'.
    depth = 0
    end_idx = None
    for i in range(start_idx, len(text)):
        ch = text[i]
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                end_idx = i + 1
                break

    if end_idx is None:
        return None

    categories_match = re.search(r'categories\s*:\s*(\[[^\]]+\])', text)
    if not categories_match:
        return None

    try:
        categories = json.loads(categories_match.group(1))
        series = json.loads(text[start_idx:end_idx])
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None

    if not (isinstance(series, list) and series and isinstance(series[0], dict)):
        return None
    points = series[0].get("data")
    if not isinstance(points, list):
        return None

    revenue = {}
    try:
        # Pair first, filter second: dropping points before zipping would
        # shift every later value onto the wrong category.
        for category, point in zip(categories, points):
            if isinstance(point, dict) and "y" in point:
                revenue[category] = point["y"]
    except TypeError:
        # A category that is not hashable cannot be used as a dict key.
        return None
    return revenue
46
47
def extract_ic_dph_sidlo(text):
    """Extract the VAT ID ("IČ DPH") and registered address ("Sídlo").

    Each value is read from the first ``<span>`` that follows, in document
    order, a ``<strong>`` label whose string contains the label text.

    Args:
        text: HTML source of the page.

    Returns:
        A ``(ic_dph, sidlo)`` tuple. ``ic_dph`` is the first ``SK<digits>``
        token found in the span after the "IČ DPH" label, and ``sidlo`` is
        the text of the span after the "Sídlo" label. Either element is
        ``None`` when its label, span or pattern is missing.
    """
    soup = BeautifulSoup(text, 'html.parser')
    ic_dph = None
    sidlo = None

    ic_dph_label = soup.find('strong', string=lambda t: t and "IČ DPH" in t)
    if ic_dph_label is not None:
        ic_dph_span = ic_dph_label.find_next('span')
        # find_next() returns None when no <span> follows the label.
        if ic_dph_span is not None:
            match = re.search(r'(SK\d+)', ic_dph_span.get_text(" ", strip=True))
            if match:
                ic_dph = match.group(1)

    sidlo_label = soup.find('strong', string=lambda t: t and "Sídlo" in t)
    if sidlo_label is not None:
        sidlo_span = sidlo_label.find_next('span')
        if sidlo_span is not None:
            sidlo = sidlo_span.get_text(" ", strip=True)

    return ic_dph, sidlo
65
66
def extract_sk_nace(text):
    """Return the text shown for the "SK NACE" label, or ``None``.

    Looks up the first ``<strong>`` whose string contains "SK NACE" and the
    next ``<span>`` after it. If that span contains a ``<div>``, the div's
    text is returned; otherwise the span's own text is returned.
    """
    label = BeautifulSoup(text, 'html.parser').find(
        'strong', string=lambda s: s and "SK NACE" in s
    )
    if not label:
        return None

    value_span = label.find_next('span')
    if not value_span:
        return None

    inner_div = value_span.find('div')
    source = inner_div if inner_div else value_span
    return source.get_text(" ", strip=True)
76
77
def extract_employees(text):
    """Return the text shown for the "Počet zamestnancov" label, or ``None``.

    The value is the whitespace-normalised text of the first ``<span>`` that
    follows the first ``<strong>`` whose string contains the label.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    label = parsed.find('strong', string=lambda s: s and "Počet zamestnancov" in s)
    if not label:
        return None

    value_span = label.find_next('span')
    if not value_span:
        return None
    return value_span.get_text(" ", strip=True)
86
87
def extract_min_employees(employees_text):
    """Return the lower bound of an "N - M" employee range as an int.

    Returns 0 when ``employees_text`` is empty/``None`` or contains no
    "<digits> - <digits>" range.
    """
    range_match = (
        re.search(r'(\d+)\s*-\s*\d+', employees_text) if employees_text else None
    )
    if range_match is None:
        return 0
    return int(range_match.group(1))
93
94
def extract_established_date(text):
    """Return the text shown for the "Dátum vzniku" label, or ``None``.

    The value is the whitespace-normalised text of the first ``<span>`` that
    follows the first ``<strong>`` whose string contains the label.
    """
    document = BeautifulSoup(text, 'html.parser')
    label = document.find('strong', string=lambda s: s and "Dátum vzniku" in s)
    value_span = label.find_next('span') if label else None
    return value_span.get_text(" ", strip=True) if value_span else None
103
104
async def fetch_and_parse(ico, session):
    """Download the Finstat page for one company and extract its fields.

    The download and the parsing both run while holding the module-level
    ``semaphore``.

    Args:
        ico: Company identifier appended to ``https://www.finstat.sk/``.
        session: Session object whose awaitable ``get(url, timeout=...)``
            returns a response exposing ``raise_for_status()`` and ``text``.

    Returns:
        On success, a dict with ``ico``, ``ic_dph``, ``sidlo``, ``sk_nace``,
        ``employees_text``, ``min_employees``, ``established_date`` and one
        ``revenue_<category>`` key per entry of the revenue series.
        On any failure, ``{"ico": ico, "error": <message>}``.
    """
    url = f"https://www.finstat.sk/{ico}"
    async with semaphore:
        try:
            resp = await session.get(url, timeout=20)
            # Report HTTP error responses instead of parsing the error page
            # into a record whose fields are all empty.
            resp.raise_for_status()
            html = resp.text

            # Parse the revenue chart once; a missing chart yields no
            # revenue_* keys.
            revenue_by_year = extract_trzby_data(html) or {}
            flat_revenue = {f"revenue_{year}": value for year, value in revenue_by_year.items()}
            ic_dph, sidlo = extract_ic_dph_sidlo(html)
            sk_nace = extract_sk_nace(html)
            employees = extract_employees(html)
            min_employees = extract_min_employees(employees)
            established_date = extract_established_date(html)

            return {
                "ico": ico,
                "ic_dph": ic_dph,
                "sidlo": sidlo,
                "sk_nace": sk_nace,
                "employees_text": employees,
                "min_employees": min_employees,
                "established_date": established_date,
                **flat_revenue,  # Spread the revenue_YYYY keys directly
            }
        except Exception as e:
            # Per-item boundary: one failing company must not abort the
            # others, so the failure is returned as data.
            return {
                "ico": ico,
                "error": str(e),
            }
136
137
async def main():
    """Actor entry point: scrape every ICO from the input and store the results.

    Reads the ``icos`` list from the Actor input, fetches all companies
    concurrently through one shared ``AsyncSession``, pushes every result
    (successful or error record) to the dataset and logs the elapsed time.
    Logs a warning and returns early when no ICOs are provided.
    """
    async with Actor:
        # get_input() may return None when the run has no input at all.
        input_data = await Actor.get_input() or {}
        icos = input_data.get("icos") or []
        if not icos:
            Actor.log.warning("No ICOs provided in input.")
            return

        # Monotonic clock: unaffected by system clock adjustments.
        start = time.perf_counter()

        async with AsyncSession() as session:
            tasks = [fetch_and_parse(ico, session) for ico in icos]
            results = await asyncio.gather(*tasks)

        # fetch_and_parse reports failures as records carrying an "error" key.
        failed = sum(1 for item in results if "error" in item)
        if failed:
            Actor.log.warning(f"{failed} of {len(results)} ICOs failed; see the 'error' field of the results.")

        await Actor.push_data(results)
        Actor.log.info(f"Finished in {time.perf_counter() - start:.2f} seconds")
154
155
# Allow this module to be executed directly as a script.
if __name__ == "__main__":
    asyncio.run(main())

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Zed editor
# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
# to share Project Settings within a team
.zed

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5beautifulsoup4[lxml]
6httpx
7types-beautifulsoup4
8curl_cffi