from apify import Actor
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
import asyncio
import json
import re
import time

# Allow at most three finstat.sk pages to be fetched concurrently (used in fetch_and_parse).
semaphore = asyncio.Semaphore(3)


def extract_trzby_data(text):
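    """Extract the "Tržby" (revenue) chart series embedded in the page's inline JavaScript.

    Returns a {year: revenue} mapping, or None when the series or its categories cannot be found.
    """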
    # Locate the chart series named "Tržby" ("\u017E" is the JSON-escaped "ž").
    series_marker = '[{"name":"Tr\\u017Eby","showInLegend":false,"data":['
    start_idx = text.find(series_marker)
    if start_idx == -1:
        return None

    # Walk forward from the marker, balancing square brackets to find the end of the series array.
    count = 0
    end_idx = None
    for i in range(start_idx, len(text)):
        if text[i] == '[':
            count += 1
        elif text[i] == ']':
            count -= 1
            if count == 0:
                end_idx = i + 1
                break

    if end_idx is None:
        return None

    series_json_str = text[start_idx:end_idx]
    categories_match = re.search(r'categories\s*:\s*(\[[^\]]+\])', text)
    if not categories_match:
        return None

    try:
        categories = json.loads(categories_match.group(1))
        series = json.loads(series_json_str)

        if isinstance(series, list) and series and "data" in series[0]:
            y_values = [entry.get("y") for entry in series[0]["data"] if "y" in entry]
            return dict(zip(categories, y_values))
    except Exception:
        return None

    return None


def extract_ic_dph_sidlo(text):
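    """Parse the VAT ID ("IČ DPH") and registered office address ("Sídlo") from the company page."""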
    soup = BeautifulSoup(text, 'html.parser')
    ic_dph = None
    sidlo = None

    ic_dph_element = soup.find('strong', string=lambda t: t and "IČ DPH" in t)
    if ic_dph_element:
        span = ic_dph_element.find_next('span')
        if span:
            span_text = span.get_text(" ", strip=True)
            match = re.search(r'(SK\d+)', span_text)
            if match:
                ic_dph = match.group(1)

    sidlo_element = soup.find('strong', string=lambda t: t and "Sídlo" in t)
    if sidlo_element:
        span = sidlo_element.find_next('span')
        if span:
            sidlo = span.get_text(" ", strip=True)

    return ic_dph, sidlo


def extract_sk_nace(text):
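    """Return the SK NACE industry classification text, or None if it is not present."""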
    soup = BeautifulSoup(text, 'html.parser')
    nace_element = soup.find('strong', string=lambda t: t and "SK NACE" in t)
    if nace_element:
        span_element = nace_element.find_next('span')
        if span_element:
            div_element = span_element.find('div')
            return div_element.get_text(" ", strip=True) if div_element else span_element.get_text(" ", strip=True)
    return None


def extract_employees(text):
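    """Return the raw "Počet zamestnancov" (number of employees) text, typically an employee-count range."""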
    soup = BeautifulSoup(text, 'html.parser')
    emp_element = soup.find('strong', string=lambda t: t and "Počet zamestnancov" in t)
    if emp_element:
        span = emp_element.find_next('span')
        if span:
            return span.get_text(" ", strip=True)
    return None


def extract_min_employees(employees_text):
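    """Return the lower bound of a "min - max" employee range, or 0 when no range is matched."""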
    if not employees_text:
        return 0
    match = re.search(r'(\d+)\s*-\s*\d+', employees_text)
    return int(match.group(1)) if match else 0


def extract_established_date(text):
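    """Return the "Dátum vzniku" (date of establishment) text from the page, or None."""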
    soup = BeautifulSoup(text, 'html.parser')
    est_element = soup.find('strong', string=lambda t: t and "Dátum vzniku" in t)
    if est_element:
        span = est_element.find_next('span')
        if span:
            return span.get_text(" ", strip=True)
    return None


async def fetch_and_parse(ico, session):
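    """Fetch the finstat.sk detail page for one IČO and return a flat dict of extracted fields.

    On failure, the returned dict contains only the IČO and an "error" message.
    """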
    url = f"https://www.finstat.sk/{ico}"
    async with semaphore:
        try:
            resp = await session.get(url, timeout=20)
            html = resp.text

            # Flatten the revenue series into one key per year (keys like "revenue_<year>").
            revenue_by_year = extract_trzby_data(html) or {}
            flat_revenue = {f"revenue_{year}": value for year, value in revenue_by_year.items()}
            ic_dph, sidlo = extract_ic_dph_sidlo(html)
            sk_nace = extract_sk_nace(html)
            employees = extract_employees(html)
            min_employees = extract_min_employees(employees)
            established_date = extract_established_date(html)

            return {
                "ico": ico,
                "ic_dph": ic_dph,
                "sidlo": sidlo,
                "sk_nace": sk_nace,
                "employees_text": employees,
                "min_employees": min_employees,
                "established_date": established_date,
                **flat_revenue
            }
        except Exception as e:
            return {
                "ico": ico,
                "error": str(e)
            }


async def main():
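    """Read the IČO list from the Actor input, scrape all companies concurrently, and push the results."""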
    async with Actor:
        input_data = await Actor.get_input() or {}
        icos = input_data.get("icos", [])
        if not icos:
            Actor.log.warning("No ICOs provided in input.")
            return

        start = time.time()

        # One shared HTTP session for all lookups; concurrency is capped by the module-level semaphore.
        async with AsyncSession() as session:
            tasks = [fetch_and_parse(ico, session) for ico in icos]
            results = await asyncio.gather(*tasks)

        await Actor.push_data(results)
        Actor.log.info(f"Finished in {time.time() - start:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())