
Finstat Scraper
Pricing
$2.00 / 1,000 results
Go to Store

Finstat Scraper
Scrapes information about Slovak companies from Finstat, looked up by company ID (IČO). Finstat scraper na sťahovanie info o slovenských firmách podľa IČO.
0.0 (0)
Pricing
$2.00 / 1,000 results
0
Total users
2
Monthly users
2
Runs succeeded
>99%
Last modified
2 days ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q src/

# Create and run as a non-root user.
RUN useradd --create-home apify && \
    chown -R apify:apify ./
USER apify

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "finstat-revenue-scraper", "title": "Finstat Revenue Scraper", "description": "Scrapes Slovak companies from Finstat and extracts revenue, NACE, employees, and VAT info.", "version": "0.1", "buildTag": "latest", "storages": { "dataset": { "actorSpecification": 1, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "ico", "ic_dph", "sidlo", "sk_nace", "established_date", "employees_text", "min_employees", "revenue_2019", "revenue_2020", "revenue_2021", "revenue_2022", "revenue_2023", "revenue_2024" ] }, "display": { "component": "table", "properties": { "ico": { "label": "ICO", "format": "text" }, "ic_dph": { "label": "IČ DPH", "format": "text" }, "sidlo": { "label": "Sídlo", "format": "text" }, "sk_nace": { "label": "SK NACE", "format": "text" }, "established_date": { "label": "Dátum vzniku", "format": "text" }, "employees_text": { "label": "Počet zamestnancov", "format": "text" }, "min_employees": { "label": "Min. zamestnanci", "format": "number" }, "revenue_2019": { "label": "Tržby 2019", "format": "number" }, "revenue_2020": { "label": "Tržby 2020", "format": "number" }, "revenue_2021": { "label": "Tržby 2021", "format": "number" }, "revenue_2022": { "label": "Tržby 2022", "format": "number" }, "revenue_2023": { "label": "Tržby 2023", "format": "number" }, "revenue_2024": { "label": "Tržby 2024", "format": "number" } } } } } } }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Finstat Revenue Scraper", "description": "Scrapes Tržby (Revenue) data from Finstat for given ICOs.", "type": "object", "schemaVersion": 1, "properties": { "icos": { "title": "ICO List", "type": "array", "description": "List of ICOs to process (e.g. company identifiers in Slovakia)", "editor": "stringList", "prefill": ["35882085", "36468924", "36249955"] } }, "required": ["icos"]}
src/__init__.py
1
src/__main__.py
"""Package entry script: runs the Actor's ``main`` coroutine to completion."""

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/main.py
1from apify import Actor2from bs4 import BeautifulSoup3from curl_cffi.requests import AsyncSession4import asyncio5import json6import re7import time8
# Shared by every fetch_and_parse() call: at most three of them can hold it
# (and therefore be downloading/parsing a page) at the same time.
semaphore = asyncio.Semaphore(3)
11
def extract_trzby_data(text):
    """Extract the "Tržby" (revenue) chart series from a page's source.

    The function looks for the JSON series whose text starts with the
    "Tržby" marker, cuts out the bracket-balanced array that begins there,
    and pairs each data point's ``y`` value with the label at the same
    position in the first ``categories: [...]`` array found in *text*.

    Args:
        text: Raw page source (HTML including inline JavaScript).

    Returns:
        A dict mapping each category label (as decoded from JSON) to the
        ``y`` value of the data point at the same index, or ``None`` when
        the marker, the closing bracket or the categories array is missing,
        or when either piece cannot be decoded. Data points that are not
        objects or have no ``y`` key are skipped without shifting the labels
        of the points that follow them.
    """
    # Matched against the raw source, where "ž" is written as the literal
    # six-character escape sequence \u017E.
    series_marker = '[{"name":"Tr\\u017Eby","showInLegend":false,"data":['
    start_idx = text.find(series_marker)
    if start_idx == -1:
        return None

    # Count square brackets from the marker onwards until the array that
    # opens at start_idx is closed again.
    depth = 0
    end_idx = None
    for i in range(start_idx, len(text)):
        ch = text[i]
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                end_idx = i + 1
                break
    if end_idx is None:
        return None

    categories_match = re.search(r'categories\s*:\s*(\[[^\]]+\])', text)
    if not categories_match:
        return None

    try:
        categories = json.loads(categories_match.group(1))
        series = json.loads(text[start_idx:end_idx])
        points = series[0]["data"]
        # Pair labels with points BEFORE filtering so that a point without
        # "y" only drops its own label instead of misaligning later years.
        return {
            category: point["y"]
            for category, point in zip(categories, points)
            if isinstance(point, dict) and "y" in point
        }
    except (ValueError, TypeError, KeyError, IndexError):
        # Malformed JSON or an unexpected structure: treat as "no data".
        return None
47
def extract_ic_dph_sidlo(text):
    """Extract the "IČ DPH" number and the "Sídlo" text from a page.

    For each of the two labels, the first ``<strong>`` whose string contains
    the label is located and the next ``<span>`` after it is read.

    Args:
        text: HTML source of the page.

    Returns:
        A ``(ic_dph, sidlo)`` tuple. ``ic_dph`` is the first ``SK`` +
        digits token found in the span after the "IČ DPH" label, and
        ``sidlo`` is the whitespace-normalised text of the span after the
        "Sídlo" label. Either element is ``None`` when its label, the
        following span, or (for ``ic_dph``) the ``SK...`` token is missing.
    """
    soup = BeautifulSoup(text, 'html.parser')
    ic_dph = None
    sidlo = None

    ic_dph_element = soup.find('strong', string=lambda t: t and "IČ DPH" in t)
    if ic_dph_element:
        span = ic_dph_element.find_next('span')
        # find_next() gives None when no <span> follows the label; guard it
        # so one missing field does not raise and discard the whole record.
        if span:
            match = re.search(r'(SK\d+)', span.get_text(" ", strip=True))
            if match:
                ic_dph = match.group(1)

    sidlo_element = soup.find('strong', string=lambda t: t and "Sídlo" in t)
    if sidlo_element:
        span = sidlo_element.find_next('span')
        if span:
            sidlo = span.get_text(" ", strip=True)

    return ic_dph, sidlo
66
def extract_sk_nace(text):
    """Return the "SK NACE" value shown on a page, or ``None``.

    The value is read from the first ``<span>`` that follows the first
    ``<strong>`` whose string contains "SK NACE". When that span contains a
    ``<div>``, only the first div's text is returned; otherwise the whole
    span's text is. ``None`` is returned when the label or the span is
    missing.
    """
    def _is_nace_label(label):
        return label and "SK NACE" in label

    document = BeautifulSoup(text, 'html.parser')

    label_tag = document.find('strong', string=_is_nace_label)
    if not label_tag:
        return None

    value_span = label_tag.find_next('span')
    if not value_span:
        return None

    # Prefer the nested <div> when there is one, else fall back to the span.
    inner_div = value_span.find('div')
    text_source = inner_div if inner_div else value_span
    return text_source.get_text(" ", strip=True)
77
def extract_employees(text):
    """Return the text shown next to the "Počet zamestnancov" label.

    The first ``<strong>`` whose string contains the label is located and
    the text of the next ``<span>`` after it is returned with whitespace
    normalised. ``None`` is returned when the label or the span is missing.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    heading = parsed.find(
        'strong',
        string=lambda label: label and "Počet zamestnancov" in label,
    )
    value_span = heading.find_next('span') if heading else None
    return value_span.get_text(" ", strip=True) if value_span else None
87
def extract_min_employees(employees_text):
    """Return the lower bound of a ``<number>-<number>`` employee range.

    Args:
        employees_text: Employee-count text, e.g. ``"10-19"``; may be
            ``None`` or empty.

    Returns:
        The first number of the first ``digits - digits`` range in the text
        as an ``int``, or ``0`` when the text is empty/``None`` or contains
        no such range.
    """
    range_match = (
        re.search(r'(\d+)\s*-\s*\d+', employees_text) if employees_text else None
    )
    if range_match is None:
        return 0
    lower_bound = range_match.group(1)
    return int(lower_bound)
94
def extract_established_date(text):
    """Return the text shown next to the "Dátum vzniku" label, or ``None``.

    The first ``<strong>`` whose string contains the label is located and
    the text of the next ``<span>`` after it is returned with whitespace
    normalised. The value is returned as displayed; it is not parsed into a
    date object.
    """
    def _mentions_label(label):
        return label and "Dátum vzniku" in label

    tree = BeautifulSoup(text, 'html.parser')

    label_tag = tree.find('strong', string=_mentions_label)
    if not label_tag:
        return None

    following_span = label_tag.find_next('span')
    if not following_span:
        return None

    return following_span.get_text(" ", strip=True)
104
async def fetch_and_parse(ico, session):
    """Download one company's Finstat page and extract its fields.

    The request and the parsing both run while holding the module-level
    ``semaphore``, which limits how many pages are processed at once.

    Args:
        ico: Company identifier; appended to ``https://www.finstat.sk/``.
        session: HTTP session whose awaitable ``get(url, timeout=...)``
            returns a response exposing ``raise_for_status()`` and ``text``.

    Returns:
        On success, a dict with the keys ``ico``, ``ic_dph``, ``sidlo``,
        ``sk_nace``, ``employees_text``, ``min_employees`` and
        ``established_date``, plus one ``revenue_<label>`` key per entry
        returned by ``extract_trzby_data``. If anything raises (network
        failure, non-success HTTP status, parsing error), a dict with just
        ``ico`` and ``error`` (the exception text) is returned instead.
    """
    url = f"https://www.finstat.sk/{ico}"
    async with semaphore:
        try:
            resp = await session.get(url, timeout=20)
            # Surface HTTP errors (blocked request, unknown ICO, ...) as an
            # error record instead of emitting a row of empty fields parsed
            # from an error page.
            resp.raise_for_status()
            html = resp.text

            # Extract the revenue series once; a page without it yields no
            # revenue_* keys at all.
            revenue_by_year = extract_trzby_data(html) or {}
            flat_revenue = {
                f"revenue_{year}": value
                for year, value in revenue_by_year.items()
            }
            ic_dph, sidlo = extract_ic_dph_sidlo(html)
            sk_nace = extract_sk_nace(html)
            employees = extract_employees(html)
            min_employees = extract_min_employees(employees)
            established_date = extract_established_date(html)

            return {
                "ico": ico,
                "ic_dph": ic_dph,
                "sidlo": sidlo,
                "sk_nace": sk_nace,
                "employees_text": employees,
                "min_employees": min_employees,
                "established_date": established_date,
                **flat_revenue,  # Spread the revenue_YYYY keys directly
            }
        except Exception as e:
            # Best-effort per company: one failure must not abort the batch.
            return {
                "ico": ico,
                "error": str(e),
            }
137
async def main():
    """Actor entry point: scrape every ICO from the input and store the rows.

    Reads the ``icos`` list from the Actor input, fetches and parses all of
    them concurrently through one shared HTTP session, pushes the resulting
    records to the Actor's dataset in a single call and logs the elapsed
    time. When no ICOs are supplied a warning is logged and nothing is
    pushed.
    """
    async with Actor:
        # get_input() may return None when the run was started without any
        # input; fall back to an empty dict so the lookup below cannot fail.
        input_data = await Actor.get_input() or {}
        icos = input_data.get("icos", [])
        if not icos:
            Actor.log.warning("No ICOs provided in input.")
            return

        # perf_counter() is monotonic, so the reported duration is not
        # affected by system clock adjustments during the run.
        start = time.perf_counter()

        async with AsyncSession() as session:
            tasks = [fetch_and_parse(ico, session) for ico in icos]
            results = await asyncio.gather(*tasks)

        await Actor.push_data(results)
        Actor.log.info(f"Finished in {time.perf_counter() - start:.2f} seconds")
155
# Run the Actor when this module is executed directly as a script.
if __name__ == "__main__":
    asyncio.run(main())
src/py.typed
.dockerignore
.git.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.gitignore
.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
# Zed editor# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out# to share Project Settings within a team .zed
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:2# https://pip.pypa.io/en/latest/reference/requirements-file-format/3
4apify < 3.05beautifulsoup4[lxml]6httpx7types-beautifulsoup48curl_cffi